From ba9ad595cfc5e1e39214d919340c43d223a5688a Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Fri, 15 Mar 2024 13:31:14 +0800 Subject: [PATCH] NTTs: remove ldr/str macros that are no longer needed --- .../naive/aarch64/intt_dilithium_1234_5678.s | 133 +++++++--------- .../naive/aarch64/ntt_dilithium_1234_5678.s | 125 +++++++-------- .../ntt_dilithium_1234_5678_manual_st4.s | 133 +++++++--------- .../naive/aarch64/ntt_dilithium_123_45678.s | 118 ++++++-------- .../ntt_dilithium_123_45678_manual_st4.s | 118 ++++++-------- .../aarch64/ntt_dilithium_123_45678_red.s | 110 ++++++------- .../ntt_dilithium_123_45678_w_scalar.s | 124 ++++++--------- .../ntt_dilithium_123_45678_w_scalar_red.s | 112 ++++++-------- examples/naive/aarch64/ntt_kyber_1234_567.s | 137 +++++++---------- .../aarch64/ntt_kyber_1234_567_manual_st4.s | 145 ++++++++---------- examples/naive/aarch64/ntt_kyber_123_4567.s | 81 ++++------ .../aarch64/ntt_kyber_123_4567_manual_st4.s | 90 ++++------- .../aarch64/ntt_kyber_123_4567_scalar_load.s | 88 ++++------- .../ntt_kyber_123_4567_scalar_load_store.s | 88 ++++------- .../aarch64/ntt_kyber_123_4567_scalar_store.s | 81 ++++------ 15 files changed, 665 insertions(+), 1018 deletions(-) diff --git a/examples/naive/aarch64/intt_dilithium_1234_5678.s b/examples/naive/aarch64/intt_dilithium_1234_5678.s index 02175873..f7ca48f0 100644 --- a/examples/naive/aarch64/intt_dilithium_1234_5678.s +++ b/examples/naive/aarch64/intt_dilithium_1234_5678.s @@ -26,27 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vsub d,a,b sub \d\().4s, \a\().4s, \b\().4s .endm @@ -138,31 +117,31 @@ .endm .macro load_roots_1234 r_ptr - ldr_vi root0, \r_ptr, (8*16) - ldr_vo root1, \r_ptr, (-8*16 + 1*16) - ldr_vo root2, \r_ptr, (-8*16 + 2*16) - ldr_vo root3, \r_ptr, (-8*16 + 3*16) - ldr_vo root4, \r_ptr, (-8*16 + 4*16) - ldr_vo root5, \r_ptr, (-8*16 + 5*16) - ldr_vo root6, \r_ptr, (-8*16 + 6*16) - ldr_vo root7, \r_ptr, (-8*16 + 7*16) + ldr qform_root0, [\r_ptr], #(8*16) + ldr qform_root1, [\r_ptr, #(-8*16 + 1*16)] + ldr qform_root2, [\r_ptr, #(-8*16 + 2*16)] + ldr qform_root3, [\r_ptr, #(-8*16 + 3*16)] + ldr qform_root4, [\r_ptr, #(-8*16 + 4*16)] + ldr qform_root5, [\r_ptr, #(-8*16 + 5*16)] + ldr qform_root6, [\r_ptr, #(-8*16 + 6*16)] + ldr qform_root7, [\r_ptr, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -334,10 +313,10 @@ _intt_dilithium_1234_5678: .p2align 2 layer5678_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr0 @@ -359,10 +338,10 @@ layer5678_start: montg_reduce data0 montg_reduce data1 - str_vi data0, inp, (16*4) - str_vo data1, inp, (-16*4 + 1*16) - str_vo data2, inp, (-16*4 + 2*16) - str_vo data3, inp, (-16*4 + 3*16) + str qform_data0, [inp], #(16*4) + str qform_data1, [inp, #(-16*4 + 1*16)] + str qform_data2, [inp, #(-16*4 + 2*16)] + str qform_data3, [inp, #(-16*4 + 3*16)] // layer5678_end: subs count, count, #1 cbnz count, layer5678_start @@ -407,22 +386,22 @@ layer5678_start: .p2align 2 layer1234_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) - ldr_vo data8, in, (8*(512/8)) - ldr_vo data9, in, (9*(512/8)) - ldr_vo data10, in, (10*(512/8)) - ldr_vo data11, in, (11*(512/8)) - ldr_vo data12, in, (12*(512/8)) - ldr_vo data13, in, (13*(512/8)) - ldr_vo data14, in, (14*(512/8)) - ldr_vo data15, in, (15*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] + ldr qform_data8, [in, #(8*(512/8))] + ldr qform_data9, [in, #(9*(512/8))] + ldr qform_data10, [in, #(10*(512/8))] + ldr qform_data11, [in, #(11*(512/8))] + ldr qform_data12, [in, #(12*(512/8))] + ldr qform_data13, [in, #(13*(512/8))] + ldr qform_data14, [in, #(14*(512/8))] + ldr qform_data15, [in, #(15*(512/8))] // layer4 gs_butterfly data0, data1, root3, 2, 3 @@ -473,14 +452,14 @@ layer1234_start: canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 - str_vo data8, in, (8*(512/8)) - str_vo data9, in, (9*(512/8)) - str_vo data10, in, (10*(512/8)) - str_vo data11, in, (11*(512/8)) - str_vo data12, in, (12*(512/8)) - str_vo data13, in, (13*(512/8)) - str_vo data14, in, (14*(512/8)) - str_vo data15, in, (15*(512/8)) + str qform_data8, [in, # (8*(512/8))] + str qform_data9, [in, # (9*(512/8))] + str qform_data10, [in, #(10*(512/8))] + str qform_data11, [in, #(11*(512/8))] + str qform_data12, [in, #(12*(512/8))] + str qform_data13, [in, #(13*(512/8))] + str qform_data14, [in, #(14*(512/8))] + str qform_data15, [in, #(15*(512/8))] mul_ninv data8, data9, data10, data11, data12, data13, data14, data15, data0, data1, data2, data3, data4, data5, data6, data7 @@ -493,14 +472,14 @@ layer1234_start: canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 - str_vi data8, in, (16) - str_vo data9, in, (-16 + 1*(512/8)) - str_vo data10, in, (-16 + 2*(512/8)) - str_vo data11, in, (-16 + 3*(512/8)) - str_vo data12, in, (-16 + 4*(512/8)) - str_vo data13, in, (-16 + 5*(512/8)) - str_vo data14, in, (-16 + 6*(512/8)) - str_vo data15, in, (-16 + 7*(512/8)) + str qform_data8, [in], #(16) + str qform_data9, [in, #(-16 + 1*(512/8))] + str qform_data10, [in, #(-16 + 2*(512/8))] + str qform_data11, [in, #(-16 + 3*(512/8))] + str qform_data12, [in, #(-16 + 4*(512/8))] + str qform_data13, [in, #(-16 + 5*(512/8))] + str qform_data14, [in, #(-16 + 6*(512/8))] + str qform_data15, [in, #(-16 + 7*(512/8))] // layer1234_end: subs count, count, #1 diff --git a/examples/naive/aarch64/ntt_dilithium_1234_5678.s b/examples/naive/aarch64/ntt_dilithium_1234_5678.s index d153f5ac..dbb2a3b2 100644 --- a/examples/naive/aarch64/ntt_dilithium_1234_5678.s +++ b/examples/naive/aarch64/ntt_dilithium_1234_5678.s @@ -26,27 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -98,31 +77,31 @@ .endm .macro load_roots_1234 - ldr_vi root0, r_ptr0, (8*16) - ldr_vo root1, r_ptr0, (-8*16 + 1*16) - ldr_vo root2, r_ptr0, (-8*16 + 2*16) - ldr_vo root3, r_ptr0, (-8*16 + 3*16) - ldr_vo root4, r_ptr0, (-8*16 + 4*16) - ldr_vo root5, r_ptr0, (-8*16 + 5*16) - ldr_vo root6, r_ptr0, (-8*16 + 6*16) - ldr_vo root7, r_ptr0, (-8*16 + 7*16) + ldr qform_root0, [r_ptr0], #(8*16) + ldr qform_root1, [r_ptr0, #(-8*16 + 1*16)] + ldr qform_root2, [r_ptr0, #(-8*16 + 2*16)] + ldr qform_root3, [r_ptr0, #(-8*16 + 3*16)] + ldr qform_root4, [r_ptr0, #(-8*16 + 4*16)] + ldr qform_root5, [r_ptr0, #(-8*16 + 5*16)] + ldr qform_root6, [r_ptr0, #(-8*16 + 6*16)] + ldr qform_root7, [r_ptr0, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -324,22 +303,22 @@ _ntt_dilithium_1234_5678: .p2align 2 layer1234_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) - ldr_vo data8, in, (8*(512/8)) - ldr_vo data9, in, (9*(512/8)) - ldr_vo data10, in, (10*(512/8)) - ldr_vo data11, in, (11*(512/8)) - ldr_vo data12, in, (12*(512/8)) - ldr_vo data13, in, (13*(512/8)) - ldr_vo data14, in, (14*(512/8)) - ldr_vo data15, in, (15*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] + ldr qform_data8, [in, #(8*(512/8))] + ldr qform_data9, [in, #(9*(512/8))] + ldr qform_data10, [in, #(10*(512/8))] + ldr qform_data11, [in, #(11*(512/8))] + ldr qform_data12, [in, #(12*(512/8))] + ldr qform_data13, [in, #(13*(512/8))] + ldr qform_data14, [in, #(14*(512/8))] + ldr qform_data15, [in, #(15*(512/8))] // layer 1 ct_butterfly data0, data8, root0, 0, 1 @@ -381,22 +360,22 @@ layer1234_start: ct_butterfly data12, data13, root6, 2, 3 ct_butterfly data14, data15, root7, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) - str_vo data8, in, (-16 + 8*(512/8)) - str_vo data9, in, (-16 + 9*(512/8)) - str_vo data10, in, (-16 + 10*(512/8)) - str_vo data11, in, (-16 + 11*(512/8)) - str_vo data12, in, (-16 + 12*(512/8)) - str_vo data13, in, (-16 + 13*(512/8)) - str_vo data14, in, (-16 + 14*(512/8)) - str_vo data15, in, (-16 + 15*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] + str qform_data8, [in, #(-16 + 8*(512/8))] + str qform_data9, [in, #(-16 + 9*(512/8))] + str qform_data10, [in, #(-16 + 10*(512/8))] + str qform_data11, [in, #(-16 + 11*(512/8))] + str qform_data12, [in, #(-16 + 12*(512/8))] + str qform_data13, [in, #(-16 + 13*(512/8))] + str qform_data14, [in, #(-16 + 14*(512/8))] + str qform_data15, [in, #(-16 + 15*(512/8))] // layer1234_end: subs count, count, #1 cbnz count, layer1234_start @@ -423,10 +402,10 @@ layer1234_start: .p2align 2 layer5678_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_56 root0, r_ptr0 load_next_roots_6 root1, r_ptr0 diff --git a/examples/naive/aarch64/ntt_dilithium_1234_5678_manual_st4.s b/examples/naive/aarch64/ntt_dilithium_1234_5678_manual_st4.s index 814609b3..ab5ed527 100644 --- a/examples/naive/aarch64/ntt_dilithium_1234_5678_manual_st4.s +++ b/examples/naive/aarch64/ntt_dilithium_1234_5678_manual_st4.s @@ -26,27 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -97,38 +76,38 @@ .endm .macro load_roots_1234 - ldr_vi root0, r_ptr0, (8*16) - ldr_vo root1, r_ptr0, (-8*16 + 1*16) - ldr_vo root2, r_ptr0, (-8*16 + 2*16) - ldr_vo root3, r_ptr0, (-8*16 + 3*16) - ldr_vo root4, r_ptr0, (-8*16 + 4*16) - ldr_vo root5, r_ptr0, (-8*16 + 5*16) - ldr_vo root6, r_ptr0, (-8*16 + 6*16) - ldr_vo root7, r_ptr0, (-8*16 + 7*16) + ldr qform_root0, [r_ptr0], #(8*16) + ldr qform_root1, [r_ptr0, #(-8*16 + 1*16)] + ldr qform_root2, [r_ptr0, #(-8*16 + 2*16)] + ldr qform_root3, [r_ptr0, #(-8*16 + 3*16)] + ldr qform_root4, [r_ptr0, #(-8*16 + 4*16)] + ldr qform_root5, [r_ptr0, #(-8*16 + 5*16)] + ldr qform_root6, [r_ptr0, #(-8*16 + 6*16)] + ldr qform_root7, [r_ptr0, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro transpose4 data0, data1, data2, data3 @@ -297,22 +276,22 @@ _ntt_dilithium_1234_5678_manual_st4: .p2align 2 layer1234_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) - ldr_vo data8, in, (8*(512/8)) - ldr_vo data9, in, (9*(512/8)) - ldr_vo data10, in, (10*(512/8)) - ldr_vo data11, in, (11*(512/8)) - ldr_vo data12, in, (12*(512/8)) - ldr_vo data13, in, (13*(512/8)) - ldr_vo data14, in, (14*(512/8)) - ldr_vo data15, in, (15*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] + ldr qform_data8, [in, #(8*(512/8))] + ldr qform_data9, [in, #(9*(512/8))] + ldr qform_data10, [in, #(10*(512/8))] + ldr qform_data11, [in, #(11*(512/8))] + ldr qform_data12, [in, #(12*(512/8))] + ldr qform_data13, [in, #(13*(512/8))] + ldr qform_data14, [in, #(14*(512/8))] + ldr qform_data15, [in, #(15*(512/8))] // layer 1 ct_butterfly data0, data8, root0, 0, 1 @@ -354,22 +333,22 @@ layer1234_start: ct_butterfly data12, data13, root6, 2, 3 ct_butterfly data14, data15, root7, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) - str_vo data8, in, (-16 + 8*(512/8)) - str_vo data9, in, (-16 + 9*(512/8)) - str_vo data10, in, (-16 + 10*(512/8)) - str_vo data11, in, (-16 + 11*(512/8)) - str_vo data12, in, (-16 + 12*(512/8)) - str_vo data13, in, (-16 + 13*(512/8)) - str_vo data14, in, (-16 + 14*(512/8)) - str_vo data15, in, (-16 + 15*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] + str qform_data8, [in, #(-16 + 8*(512/8))] + str qform_data9, [in, #(-16 + 9*(512/8))] + str qform_data10, [in, #(-16 + 10*(512/8))] + str qform_data11, [in, #(-16 + 11*(512/8))] + str qform_data12, [in, #(-16 + 12*(512/8))] + str qform_data13, [in, #(-16 + 13*(512/8))] + str qform_data14, [in, #(-16 + 14*(512/8))] + str qform_data15, [in, #(-16 + 15*(512/8))] // layer1234_end: subs count, count, #1 cbnz count, layer1234_start @@ -396,10 +375,10 @@ layer1234_start: .p2align 2 layer5678_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_56 root0, r_ptr0 load_next_roots_6 root1, r_ptr0 diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678.s b/examples/naive/aarch64/ntt_dilithium_123_45678.s index cb85b7f1..3cba8d65 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678.s @@ -2,31 +2,9 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -89,24 +67,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -136,35 +114,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -377,14 +355,14 @@ _ntt_dilithium_123_45678: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(1024/8))] + ldr qform_data2, [in, #(2*(1024/8))] + ldr qform_data3, [in, #(3*(1024/8))] + ldr qform_data4, [in, #(4*(1024/8))] + ldr qform_data5, [in, #(5*(1024/8))] + ldr qform_data6, [in, #(6*(1024/8))] + ldr qform_data7, [in, #(7*(1024/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -401,14 +379,14 @@ layer123_start: ct_butterfly data4, data5, root2, 2, 3 ct_butterfly data6, data7, root3, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) - str_vo data4, in, (-16 + 4*(1024/8)) - str_vo data5, in, (-16 + 5*(1024/8)) - str_vo data6, in, (-16 + 6*(1024/8)) - str_vo data7, in, (-16 + 7*(1024/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(1024/8))] + str qform_data2, [in, #(-16 + 2*(1024/8))] + str qform_data3, [in, #(-16 + 3*(1024/8))] + str qform_data4, [in, #(-16 + 4*(1024/8))] + str qform_data5, [in, #(-16 + 5*(1024/8))] + str qform_data6, [in, #(-16 + 6*(1024/8))] + str qform_data7, [in, #(-16 + 7*(1024/8))] subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678_manual_st4.s b/examples/naive/aarch64/ntt_dilithium_123_45678_manual_st4.s index 8bc63220..e77bdafa 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678_manual_st4.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678_manual_st4.s @@ -2,31 +2,9 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -89,24 +67,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -136,35 +114,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -377,14 +355,14 @@ _ntt_dilithium_123_45678_manual_st4: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(1024/8))] + ldr qform_data2, [in, #(2*(1024/8))] + ldr qform_data3, [in, #(3*(1024/8))] + ldr qform_data4, [in, #(4*(1024/8))] + ldr qform_data5, [in, #(5*(1024/8))] + ldr qform_data6, [in, #(6*(1024/8))] + ldr qform_data7, [in, #(7*(1024/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -401,14 +379,14 @@ layer123_start: ct_butterfly data4, data5, root2, 2, 3 ct_butterfly data6, data7, root3, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) - str_vo data4, in, (-16 + 4*(1024/8)) - str_vo data5, in, (-16 + 5*(1024/8)) - str_vo data6, in, (-16 + 6*(1024/8)) - str_vo data7, in, (-16 + 7*(1024/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(1024/8))] + str qform_data2, [in, #(-16 + 2*(1024/8))] + str qform_data3, [in, #(-16 + 3*(1024/8))] + str qform_data4, [in, #(-16 + 4*(1024/8))] + str qform_data5, [in, #(-16 + 5*(1024/8))] + str qform_data6, [in, #(-16 + 6*(1024/8))] + str qform_data7, [in, #(-16 + 7*(1024/8))] subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678_red.s b/examples/naive/aarch64/ntt_dilithium_123_45678_red.s index 030cfc60..5213eb44 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678_red.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678_red.s @@ -2,32 +2,10 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - .macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -90,49 +68,49 @@ .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -345,14 +323,14 @@ _ntt_dilithium_123_45678_red: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(1024/8))] + ldr qform_data2, [in, #(2*(1024/8))] + ldr qform_data3, [in, #(3*(1024/8))] + ldr qform_data4, [in, #(4*(1024/8))] + ldr qform_data5, [in, #(5*(1024/8))] + ldr qform_data6, [in, #(6*(1024/8))] + ldr qform_data7, [in, #(7*(1024/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -369,14 +347,14 @@ layer123_start: ct_butterfly data4, data5, root2, 2, 3 ct_butterfly data6, data7, root3, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) - str_vo data4, in, (-16 + 4*(1024/8)) - str_vo data5, in, (-16 + 5*(1024/8)) - str_vo data6, in, (-16 + 6*(1024/8)) - str_vo data7, in, (-16 + 7*(1024/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(1024/8))] + str qform_data2, [in, #(-16 + 2*(1024/8))] + str qform_data3, [in, #(-16 + 3*(1024/8))] + str qform_data4, [in, #(-16 + 4*(1024/8))] + str qform_data5, [in, #(-16 + 5*(1024/8))] + str qform_data6, [in, #(-16 + 6*(1024/8))] + str qform_data7, [in, #(-16 + 7*(1024/8))] subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar.s b/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar.s index 479d11bb..c416a0b4 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar.s @@ -2,41 +2,13 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - .macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -99,24 +71,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -146,35 +118,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -387,14 +359,14 @@ _ntt_dilithium_123_45678_w_scalar: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(1024/8))] + ldr qform_data2, [in, #(2*(1024/8))] + ldr qform_data3, [in, #(3*(1024/8))] + ldr qform_data4, [in, #(4*(1024/8))] + ldr qform_data5, [in, #(5*(1024/8))] + ldr qform_data6, [in, #(6*(1024/8))] + ldr qform_data7, [in, #(7*(1024/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -411,14 +383,14 @@ layer123_start: ct_butterfly data4, data5, root2, 2, 3 ct_butterfly data6, data7, root3, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) - str_vo data4, in, (-16 + 4*(1024/8)) - str_vo data5, in, (-16 + 5*(1024/8)) - str_vo data6, in, (-16 + 6*(1024/8)) - str_vo data7, in, (-16 + 7*(1024/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(1024/8))] + str qform_data2, [in, #(-16 + 2*(1024/8))] + str qform_data3, [in, #(-16 + 3*(1024/8))] + str qform_data4, [in, #(-16 + 4*(1024/8))] + str qform_data5, [in, #(-16 + 5*(1024/8))] + str qform_data6, [in, #(-16 + 6*(1024/8))] + str qform_data7, [in, #(-16 + 7*(1024/8))] subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar_red.s b/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar_red.s index 9e57ec8d..c0a8ef0d 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar_red.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar_red.s @@ -2,37 +2,13 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - .macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -95,17 +71,17 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -135,35 +111,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -376,14 +352,14 @@ _ntt_dilithium_123_45678_w_scalar_red: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(1024/8))] + ldr qform_data2, [in, #(2*(1024/8))] + ldr qform_data3, [in, #(3*(1024/8))] + ldr qform_data4, [in, #(4*(1024/8))] + ldr qform_data5, [in, #(5*(1024/8))] + ldr qform_data6, [in, #(6*(1024/8))] + ldr qform_data7, [in, #(7*(1024/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -400,14 +376,14 @@ layer123_start: ct_butterfly data4, data5, root2, 2, 3 ct_butterfly data6, data7, root3, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) - str_vo data4, in, (-16 + 4*(1024/8)) - str_vo data5, in, (-16 + 5*(1024/8)) - str_vo data6, in, (-16 + 6*(1024/8)) - str_vo data7, in, (-16 + 7*(1024/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(1024/8))] + str qform_data2, [in, #(-16 + 2*(1024/8))] + str qform_data3, [in, #(-16 + 3*(1024/8))] + str qform_data4, [in, #(-16 + 4*(1024/8))] + str qform_data5, [in, #(-16 + 5*(1024/8))] + str qform_data6, [in, #(-16 + 6*(1024/8))] + str qform_data7, [in, #(-16 + 7*(1024/8))] subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/ntt_kyber_1234_567.s b/examples/naive/aarch64/ntt_kyber_1234_567.s index 581d19fb..00f5f0de 100644 --- a/examples/naive/aarch64/ntt_kyber_1234_567.s +++ b/examples/naive/aarch64/ntt_kyber_1234_567.s @@ -26,33 +26,12 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - .macro trn1_s d,a,b trn1 \d\().4s, \a\().4s, \b\().4s .endm .macro trn2_s d,a,b trn2 \d\().4s, \a\().4s, \b\().4s .endm -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -109,21 +88,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_67 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -353,23 +332,23 @@ _ntt_kyber_1234_567: .p2align 2 layer1234_start: - ldr_vo data0, src0, 0 - ldr_vo data1, src0, 1*32 - ldr_vo data2, src0, 2*32 - ldr_vo data3, src0, 3*32 - ldr_vo data4, src0, 4*32 - ldr_vo data5, src0, 5*32 - ldr_vo data6, src0, 6*32 - ldr_vo data7, src0, 7*32 - - ldr_vo data8, src8, 0 - ldr_vo data9, src8, 1*32 - ldr_vo data10, src8, 2*32 - ldr_vo data11, src8, 3*32 - ldr_vo data12, src8, 4*32 - ldr_vo data13, src8, 5*32 - ldr_vo data14, src8, 6*32 - ldr_vo data15, src8, 7*32 + ldr qform_data0, [src0, #0] + ldr qform_data1, [src0, #1*32] + ldr qform_data2, [src0, #2*32] + ldr qform_data3, [src0, #3*32] + ldr qform_data4, [src0, #4*32] + ldr qform_data5, [src0, #5*32] + ldr qform_data6, [src0, #6*32] + ldr qform_data7, [src0, #7*32] + + ldr qform_data8, [src8, #0] + ldr qform_data9, [src8, #1*32] + ldr qform_data10, [src8, #2*32] + ldr qform_data11, [src8, #3*32] + ldr qform_data12, [src8, #4*32] + ldr qform_data13, [src8, #5*32] + ldr qform_data14, [src8, #6*32] + ldr qform_data15, [src8, #7*32] ct_butterfly data0, data8, root0, 0, 1 ct_butterfly data1, data9, root0, 0, 1 @@ -407,23 +386,23 @@ layer1234_start: ct_butterfly data12, data13, root3, 2, 3 ct_butterfly data14, data15, root3, 4, 5 - str_vi data0, src0, 16 - str_vo data1, src0, -16+1*32 - str_vo data2, src0, -16+2*32 - str_vo data3, src0, -16+3*32 - str_vo data4, src0, -16+4*32 - str_vo data5, src0, -16+5*32 - str_vo data6, src0, -16+6*32 - str_vo data7, src0, -16+7*32 - - str_vi data8, src8, 16 - str_vo data9, src8, -16+1*32 - str_vo data10, src8, -16+2*32 - str_vo data11, src8, -16+3*32 - str_vo data12, src8, -16+4*32 - str_vo data13, src8, -16+5*32 - str_vo data14, src8, -16+6*32 - str_vo data15, src8, -16+7*32 + str qform_data0, [src0], #16 + str qform_data1, [src0, #-16+1*32] + str qform_data2, [src0, #-16+2*32] + str qform_data3, [src0, #-16+3*32] + str qform_data4, [src0, #-16+4*32] + str qform_data5, [src0, #-16+5*32] + str qform_data6, [src0, #-16+6*32] + str qform_data7, [src0, #-16+7*32] + + str qform_data8, [src8], #16 + str qform_data9, [src8, #-16+1*32] + str qform_data10, [src8, #-16+2*32] + str qform_data11, [src8, #-16+3*32] + str qform_data12, [src8, #-16+4*32] + str qform_data13, [src8, #-16+5*32] + str qform_data14, [src8, #-16+6*32] + str qform_data15, [src8, #-16+7*32] subs count, count, #1 cbnz count, layer1234_start @@ -452,21 +431,21 @@ layer567_start: trn2_s data7, data11, data15 // load twiddle factors - ldr_vi root0, r_ptr1, 16*14 - ldr_vo root0_tw, r_ptr1, -16*14+16*1 - ldr_vo root1, r_ptr1, -16*14+16*2 - ldr_vo root1_tw, r_ptr1, -16*14+16*3 - ldr_vo root2, r_ptr1, -16*14+16*4 - ldr_vo root2_tw, r_ptr1, -16*14+16*5 - ldr_vo root3, r_ptr1, -16*14+16*6 - ldr_vo root3_tw, r_ptr1, -16*14+16*7 - - ldr_vo data8, r_ptr1, -16*14+16*8 - ldr_vo data9, r_ptr1, -16*14+16*9 - ldr_vo data10, r_ptr1, -16*14+16*10 - ldr_vo data11, r_ptr1, -16*14+16*11 - ldr_vo data12, r_ptr1, -16*14+16*12 - ldr_vo data13, r_ptr1, -16*14+16*13 + ldr qform_root0, [ r_ptr1], #16*14 + ldr qform_root0_tw, [r_ptr1, #-16*14+16*1] + ldr qform_root1, [ r_ptr1, #-16*14+16*2] + ldr qform_root1_tw, [r_ptr1, #-16*14+16*3] + ldr qform_root2, [ r_ptr1, #-16*14+16*4] + ldr qform_root2_tw, [r_ptr1, #-16*14+16*5] + ldr qform_root3, [ r_ptr1, #-16*14+16*6] + ldr qform_root3_tw, [r_ptr1, #-16*14+16*7] + + ldr qform_data8, [ r_ptr1, #-16*14+16*8] + ldr qform_data9, [ r_ptr1, #-16*14+16*9] + ldr qform_data10, [ r_ptr1, #-16*14+16*10] + ldr qform_data11, [ r_ptr1, #-16*14+16*11] + ldr qform_data12, [ r_ptr1, #-16*14+16*12] + ldr qform_data13, [ r_ptr1, #-16*14+16*13] // butterflies ct_butterfly_v data0, data4, root0, root0_tw diff --git a/examples/naive/aarch64/ntt_kyber_1234_567_manual_st4.s b/examples/naive/aarch64/ntt_kyber_1234_567_manual_st4.s index 23e0997d..89de7d00 100644 --- a/examples/naive/aarch64/ntt_kyber_1234_567_manual_st4.s +++ b/examples/naive/aarch64/ntt_kyber_1234_567_manual_st4.s @@ -26,15 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - .macro trn1_s d,a,b trn1 \d\().4s, \a\().4s, \b\().4s .endm @@ -42,18 +33,6 @@ trn2 \d\().4s, \a\().4s, \b\().4s .endm -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -110,21 +89,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_67 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -140,10 +119,10 @@ .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro save_gprs // slothy:no-unfold @@ -361,23 +340,23 @@ _ntt_kyber_1234_567: .p2align 2 layer1234_start: - ldr_vo data0, src0, 0 - ldr_vo data1, src0, 1*32 - ldr_vo data2, src0, 2*32 - ldr_vo data3, src0, 3*32 - ldr_vo data4, src0, 4*32 - ldr_vo data5, src0, 5*32 - ldr_vo data6, src0, 6*32 - ldr_vo data7, src0, 7*32 - - ldr_vo data8, src8, 0 - ldr_vo data9, src8, 1*32 - ldr_vo data10, src8, 2*32 - ldr_vo data11, src8, 3*32 - ldr_vo data12, src8, 4*32 - ldr_vo data13, src8, 5*32 - ldr_vo data14, src8, 6*32 - ldr_vo data15, src8, 7*32 + ldr qform_data0, [src0, #0] + ldr qform_data1, [src0, #1*32] + ldr qform_data2, [src0, #2*32] + ldr qform_data3, [src0, #3*32] + ldr qform_data4, [src0, #4*32] + ldr qform_data5, [src0, #5*32] + ldr qform_data6, [src0, #6*32] + ldr qform_data7, [src0, #7*32] + + ldr qform_data8, [src8, #0] + ldr qform_data9, [src8, #1*32] + ldr qform_data10, [src8, #2*32] + ldr qform_data11, [src8, #3*32] + ldr qform_data12, [src8, #4*32] + ldr qform_data13, [src8, #5*32] + ldr qform_data14, [src8, #6*32] + ldr qform_data15, [src8, #7*32] ct_butterfly data0, data8, root0, 0, 1 ct_butterfly data1, data9, root0, 0, 1 @@ -415,23 +394,23 @@ layer1234_start: ct_butterfly data12, data13, root3, 2, 3 ct_butterfly data14, data15, root3, 4, 5 - str_vi data0, src0, 16 - str_vo data1, src0, -16+1*32 - str_vo data2, src0, -16+2*32 - str_vo data3, src0, -16+3*32 - str_vo data4, src0, -16+4*32 - str_vo data5, src0, -16+5*32 - str_vo data6, src0, -16+6*32 - str_vo data7, src0, -16+7*32 - - str_vi data8, src8, 16 - str_vo data9, src8, -16+1*32 - str_vo data10, src8, -16+2*32 - str_vo data11, src8, -16+3*32 - str_vo data12, src8, -16+4*32 - str_vo data13, src8, -16+5*32 - str_vo data14, src8, -16+6*32 - str_vo data15, src8, -16+7*32 + str qform_data0, [src0], #16 + str qform_data1, [src0, #-16+1*32] + str qform_data2, [src0, #-16+2*32] + str qform_data3, [src0, #-16+3*32] + str qform_data4, [src0, #-16+4*32] + str qform_data5, [src0, #-16+5*32] + str qform_data6, [src0, #-16+6*32] + str qform_data7, [src0, #-16+7*32] + + str qform_data8, [src8], #16 + str qform_data9, [src8, #-16+1*32] + str qform_data10, [src8, #-16+2*32] + str qform_data11, [src8, #-16+3*32] + str qform_data12, [src8, #-16+4*32] + str qform_data13, [src8, #-16+5*32] + str qform_data14, [src8, #-16+6*32] + str qform_data15, [src8, #-16+7*32] subs count, count, #1 cbnz count, layer1234_start @@ -460,21 +439,21 @@ layer567_start: trn2_s data7, data11, data15 // load twiddle factors - ldr_vi root0, r_ptr1, 16*14 - ldr_vo root0_tw, r_ptr1, -16*14+16*1 - ldr_vo root1, r_ptr1, -16*14+16*2 - ldr_vo root1_tw, r_ptr1, -16*14+16*3 - ldr_vo root2, r_ptr1, -16*14+16*4 - ldr_vo root2_tw, r_ptr1, -16*14+16*5 - ldr_vo root3, r_ptr1, -16*14+16*6 - ldr_vo root3_tw, r_ptr1, -16*14+16*7 - - ldr_vo data8, r_ptr1, -16*14+16*8 - ldr_vo data9, r_ptr1, -16*14+16*9 - ldr_vo data10, r_ptr1, -16*14+16*10 - ldr_vo data11, r_ptr1, -16*14+16*11 - ldr_vo data12, r_ptr1, -16*14+16*12 - ldr_vo data13, r_ptr1, -16*14+16*13 + ldr qform_root0, [ r_ptr1], #16*14 + ldr qform_root0_tw, [r_ptr1, #-16*14+16*1] + ldr qform_root1, [ r_ptr1, #-16*14+16*2] + ldr qform_root1_tw, [r_ptr1, #-16*14+16*3] + ldr qform_root2, [ r_ptr1, #-16*14+16*4] + ldr qform_root2_tw, [r_ptr1, #-16*14+16*5] + ldr qform_root3, [ r_ptr1, #-16*14+16*6] + ldr qform_root3_tw, [r_ptr1, #-16*14+16*7] + + ldr qform_data8, [ r_ptr1, #-16*14+16*8] + ldr qform_data9, [ r_ptr1, #-16*14+16*9] + ldr qform_data10, [ r_ptr1, #-16*14+16*10] + ldr qform_data11, [ r_ptr1, #-16*14+16*11] + ldr qform_data12, [ r_ptr1, #-16*14+16*12] + ldr qform_data13, [ r_ptr1, #-16*14+16*13] // butterflies ct_butterfly_v data0, data4, root0, root0_tw diff --git a/examples/naive/aarch64/ntt_kyber_123_4567.s b/examples/naive/aarch64/ntt_kyber_123_4567.s index 919bd2be..d68c2e71 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567.s @@ -26,29 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h @@ -103,21 +80,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -332,14 +309,14 @@ _ntt_kyber_123_4567: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -356,14 +333,14 @@ layer123_start: ct_butterfly data4, data5, root1, 2, 3 ct_butterfly data6, data7, root1, 4, 5 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] subs count, count, #1 cbnz count, layer123_start @@ -373,10 +350,10 @@ layer123_start: .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_45 diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_manual_st4.s b/examples/naive/aarch64/ntt_kyber_123_4567_manual_st4.s index 2d562c9d..63f7f647 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567_manual_st4.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567_manual_st4.s @@ -26,30 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -103,28 +79,28 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro transpose4 data @@ -339,14 +315,14 @@ _ntt_kyber_123_4567_manual_st4: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -363,14 +339,14 @@ layer123_start: ct_butterfly data4, data5, root1, 2, 3 ct_butterfly data6, data7, root1, 4, 5 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] subs count, count, #1 cbnz count, layer123_start @@ -380,10 +356,10 @@ layer123_start: .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_45 diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load.s b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load.s index aa5f7c12..a3248203 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load.s @@ -26,42 +26,12 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - xtmp0 .req x10 xtmp1 .req x11 .macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -115,21 +85,21 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -344,14 +314,14 @@ _ntt_kyber_123_4567_scalar_load: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -368,14 +338,14 @@ layer123_start: ct_butterfly data4, data5, root1, 2, 3 ct_butterfly data6, data7, root1, 4, 5 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] subs count, count, #1 cbnz count, layer123_start @@ -385,10 +355,10 @@ layer123_start: .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_45 diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load_store.s b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load_store.s index 1ab80d18..13a0679c 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load_store.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load_store.s @@ -26,15 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - xtmp0 .req x10 xtmp1 .req x11 .macro vins vec_out, gpr_in, lane @@ -45,27 +36,6 @@ xtmp1 .req x11 umov \gpr_out\(), \vec_in\().d[\lane] .endm -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -119,21 +89,21 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -371,14 +341,14 @@ _ntt_kyber_123_4567_scalar_load_store: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -395,14 +365,14 @@ layer123_start: ct_butterfly data4, data5, root1, 2, 3 ct_butterfly data6, data7, root1, 4, 5 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] subs count, count, #1 cbnz count, layer123_start @@ -412,10 +382,10 @@ layer123_start: .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_45 diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_store.s b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_store.s index 0c2639ec..18c35d9f 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_store.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_store.s @@ -26,29 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -102,21 +79,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -358,14 +335,14 @@ _ntt_kyber_123_4567_scalar_store: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -382,14 +359,14 @@ layer123_start: ct_butterfly data4, data5, root1, 2, 3 ct_butterfly data6, data7, root1, 4, 5 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] subs count, count, #1 cbnz count, layer123_start @@ -399,10 +376,10 @@ layer123_start: .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_45