diff --git a/example.py b/example.py index 681347d..6c9bedf 100644 --- a/example.py +++ b/example.py @@ -1395,30 +1395,39 @@ def core(self, slothy): slothy.optimize(start="loop_0", end="end_loop_0") slothy.optimize(start="loop_1", end="end_loop_1") -class neon_keccak_x1(Example): +class neon_keccak_x1_no_symbolic(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): - name = "keccak_f1600_x1_scalar_slothy" + name = "keccak_f1600_x1_scalar_slothy_no_symbolic" infile = "keccak_f1600_x1_scalar_slothy" + outfile = "keccak_f1600_x1_scalar_no_symbolic" - if var != "": - name += f"_{var}" - infile += f"_{var}" - name += f"_{target_label_dict[target]}" - - super().__init__(infile, name, outfile=name, rename=True, arch=arch, target=target) + super().__init__(infile, name, outfile=outfile, rename=True, arch=arch, target=target) def core(self, slothy): + slothy.config.reserved_regs = ["x18", "sp"] + slothy.config.inputs_are_outputs = True - slothy.config.variable_size = True - slothy.config.visualize_expected_performance = True - slothy.config.timeout = 3600*24 + slothy.config.variable_size = False + slothy.config.visualize_expected_performance = False + slothy.config.timeout = 3600 - slothy.config.outputs = ["x27"] + slothy.config.selfcheck_failure_logfile = "selfcheck_fail.log" + + slothy.config.outputs = ["flags"] + slothy.config.constraints.stalls_first_attempt = 0 + slothy.config.ignore_objective = True + slothy.config.constraints.minimize_spills = True slothy.config.constraints.functional_only = True - slothy.config.constraints.stalls_first_attempt = 32 + slothy.config.constraints.allow_reordering = False + slothy.config.constraints.allow_spills = True + slothy.config.visualize_expected_performance = True +# slothy.config.visualize_show_old_code = True slothy.optimize(start="loop", end="end_loop") + slothy.config.outputs = ["hint_STACK_OFFSET_COUNT"] + slothy.optimize(start="initial_round_start", end="initial_round_end") + 
############################################################################################# @@ -1563,7 +1572,7 @@ def main(): fft_fixedpoint_radix4(), # Keccak neon_keccak_x4(), - neon_keccak_x1(), + neon_keccak_x1_no_symbolic(), ] all_example_names = [e.name for e in examples] diff --git a/examples/naive/aarch64/keccak_f1600_x1_scalar_slothy.s b/examples/naive/aarch64/keccak_f1600_x1_scalar_slothy.s index a1fd292..e07cf1a 100644 --- a/examples/naive/aarch64/keccak_f1600_x1_scalar_slothy.s +++ b/examples/naive/aarch64/keccak_f1600_x1_scalar_slothy.s @@ -63,8 +63,6 @@ round_constants: input_addr .req x0 const_addr .req x26 - cur_const .req x26 - count .req w27 /* Mapping of Kecck-f1600 state to scalar registers * at the beginning and end of each round. */ @@ -81,7 +79,7 @@ round_constants: Aka .req x3 Ake .req x8 Aki .req x13 - Ako .req x18 + Ako .req x28 Aku .req x23 Ama .req x4 Ame .req x9 @@ -94,60 +92,58 @@ round_constants: Aso .req x20 Asu .req x25 - /* A_[y,2*x+3*y] = rot(A[x,y]) */ - Aba_ .req x30 - Abe_ .req x28 - Abi_ .req x11 - Abo_ .req x16 - Abu_ .req x21 - Aga_ .req x3 - Age_ .req x8 - Agi_ .req x12 - Ago_ .req x17 - Agu_ .req x22 - Aka_ .req x4 - Ake_ .req x9 - Aki_ .req x13 - Ako_ .req x18 - Aku_ .req x23 - Ama_ .req x5 - Ame_ .req x10 - Ami_ .req x14 - Amo_ .req x19 - Amu_ .req x24 - Asa_ .req x1 - Ase_ .req x6 - Asi_ .req x15 - Aso_ .req x20 - Asu_ .req x25 - - /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ - /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ - C0 .req x30 - E0 .req x29 - C1 .req x26 - E1 .req x0 - C2 .req x27 - E2 .req x26 - C3 .req x28 - E3 .req x27 - C4 .req x29 - E4 .req x28 - - tmp .req x0 - - - tmp0 .req x0 - tmp1 .req x29 - /************************ MACROS ****************************/ -#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_LOCS 40 + +#define STACK_SIZE (16*6 + 3*8 + 8 + (STACK_LOCS) * 8) // GPRs (16*6), count (8), 
const (8), input (8), padding (8), spill slots (STACK_LOCS*8) #define STACK_BASE_GPRS (3*8+8) #define STACK_OFFSET_INPUT (0*8) #define STACK_OFFSET_CONST (1*8) #define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_LOCS (16*6 + 4*8) +#define STACK_LOC_0 ((STACK_OFFSET_LOCS) + 0*8) +#define STACK_LOC_1 ((STACK_OFFSET_LOCS) + 1*8) +#define STACK_LOC_2 ((STACK_OFFSET_LOCS) + 2*8) +#define STACK_LOC_3 ((STACK_OFFSET_LOCS) + 3*8) +#define STACK_LOC_4 ((STACK_OFFSET_LOCS) + 4*8) +#define STACK_LOC_5 ((STACK_OFFSET_LOCS) + 5*8) +#define STACK_LOC_6 ((STACK_OFFSET_LOCS) + 6*8) +#define STACK_LOC_7 ((STACK_OFFSET_LOCS) + 7*8) +#define STACK_LOC_8 ((STACK_OFFSET_LOCS) + 8*8) +#define STACK_LOC_9 ((STACK_OFFSET_LOCS) + 9*8) +#define STACK_LOC_10 ((STACK_OFFSET_LOCS) + 10*8) +#define STACK_LOC_11 ((STACK_OFFSET_LOCS) + 11*8) +#define STACK_LOC_12 ((STACK_OFFSET_LOCS) + 12*8) +#define STACK_LOC_13 ((STACK_OFFSET_LOCS) + 13*8) +#define STACK_LOC_14 ((STACK_OFFSET_LOCS) + 14*8) +#define STACK_LOC_15 ((STACK_OFFSET_LOCS) + 15*8) +#define STACK_LOC_16 ((STACK_OFFSET_LOCS) + 16*8) +#define STACK_LOC_17 ((STACK_OFFSET_LOCS) + 17*8) +#define STACK_LOC_18 ((STACK_OFFSET_LOCS) + 18*8) +#define STACK_LOC_19 ((STACK_OFFSET_LOCS) + 19*8) +#define STACK_LOC_20 ((STACK_OFFSET_LOCS) + 20*8) +#define STACK_LOC_21 ((STACK_OFFSET_LOCS) + 21*8) +#define STACK_LOC_22 ((STACK_OFFSET_LOCS) + 22*8) +#define STACK_LOC_23 ((STACK_OFFSET_LOCS) + 23*8) +#define STACK_LOC_24 ((STACK_OFFSET_LOCS) + 24*8) +#define STACK_LOC_25 ((STACK_OFFSET_LOCS) + 25*8) +#define STACK_LOC_26 ((STACK_OFFSET_LOCS) + 26*8) +#define STACK_LOC_27 ((STACK_OFFSET_LOCS) + 27*8) +#define STACK_LOC_28 ((STACK_OFFSET_LOCS) + 28*8) +#define STACK_LOC_29 ((STACK_OFFSET_LOCS) + 29*8) +#define STACK_LOC_30 ((STACK_OFFSET_LOCS) + 30*8) +#define STACK_LOC_31 ((STACK_OFFSET_LOCS) + 31*8) +#define STACK_LOC_32 ((STACK_OFFSET_LOCS) + 32*8) +#define STACK_LOC_33 ((STACK_OFFSET_LOCS) + 33*8) +#define STACK_LOC_34 ((STACK_OFFSET_LOCS) + 34*8) +#define STACK_LOC_35 
((STACK_OFFSET_LOCS) + 35*8) +#define STACK_LOC_36 ((STACK_OFFSET_LOCS) + 36*8) +#define STACK_LOC_37 ((STACK_OFFSET_LOCS) + 37*8) +#define STACK_LOC_38 ((STACK_OFFSET_LOCS) + 38*8) +#define STACK_LOC_39 ((STACK_OFFSET_LOCS) + 39*8) + .macro alloc_stack sub sp, sp, #(STACK_SIZE) .endm @@ -181,220 +177,186 @@ round_constants: eor \dst, \dst, \src4 .endm - - -.macro addparity prty, dst0, src0, dst1, src1, dst2, src2, dst3, src3, dst4, src4 - eor \dst0, \src0, \prty - eor \dst1, \src1, \prty - eor \dst2, \src2, \prty - eor \dst3, \src3, \prty - eor \dst4, \src4, \prty +.macro chi_step_ror out, a, b, c, r1, r2 + bic X, \b\(), \c\(), ror #\r1 + eor \out\(), X, \a\(), ror #\r2 .endm - - - -.macro keccak_f1600_round_initial - eor5 C0, Ama, Asa, Aba, Aga, Aka - eor5 C1, Ame, Ase, Abe, Age, Ake - eor5 C2, Ami, Asi, Abi, Agi, Aki - eor5 C3, Amo, Aso, Abo, Ago, Ako - eor5 C4, Amu, Asu, Abu, Agu, Aku - - eor E1, C0, C2, ror #63 - eor E3, C2, C4, ror #63 - eor E0, C4, C1, ror #63 - eor E2, C1, C3, ror #63 - eor E4, C3, C0, ror #63 - - eor Aba_, Aba, E0 - eor Asa_, Abi, E2 - eor Abi_, Aki, E2 - eor Aki_, Ako, E3 - eor Ako_, Amu, E4 - eor Amu_, Aso, E3 - eor Aso_, Ama, E0 - eor Aka_, Abe, E1 - eor Ase_, Ago, E3 - eor Ago_, Ame, E1 - eor Ake_, Agi, E2 - eor Agi_, Aka, E0 - eor Aga_, Abo, E3 - eor Abo_, Amo, E3 - eor Amo_, Ami, E2 - eor Ami_, Ake, E1 - eor Age_, Agu, E4 - eor Agu_, Asi, E2 - eor Asi_, Aku, E4 - eor Aku_, Asa, E0 - eor Ama_, Abu, E4 - eor Abu_, Asu, E4 - eor Asu_, Ase, E1 - eor Ame_, Aga, E0 - eor Abe_, Age, E1 - - load_constant_ptr - - bic tmp0, Agi_, Age_, ror #47 - bic tmp1, Ago_, Agi_, ror #42 - eor Aga, tmp0, Aga_, ror #39 - bic tmp0, Agu_, Ago_, ror #16 - eor Age, tmp1, Age_, ror #25 - bic tmp1, Aga_, Agu_, ror #31 - eor Agi, tmp0, Agi_, ror #58 - bic tmp0, Age_, Aga_, ror #56 - eor Ago, tmp1, Ago_, ror #47 - bic tmp1, Aki_, Ake_, ror #19 - eor Agu, tmp0, Agu_, ror #23 - bic tmp0, Ako_, Aki_, ror #47 - eor Aka, tmp1, Aka_, ror #24 - bic tmp1, Aku_, Ako_, ror 
#10 - eor Ake, tmp0, Ake_, ror #2 - bic tmp0, Aka_, Aku_, ror #47 - eor Aki, tmp1, Aki_, ror #57 - bic tmp1, Ake_, Aka_, ror #5 - eor Ako, tmp0, Ako_, ror #57 - bic tmp0, Ami_, Ame_, ror #38 - eor Aku, tmp1, Aku_, ror #52 - bic tmp1, Amo_, Ami_, ror #5 - eor Ama, tmp0, Ama_, ror #47 - bic tmp0, Amu_, Amo_, ror #41 - eor Ame, tmp1, Ame_, ror #43 - bic tmp1, Ama_, Amu_, ror #35 - eor Ami, tmp0, Ami_, ror #46 - bic tmp0, Ame_, Ama_, ror #9 - - str const_addr, [sp, #(STACK_OFFSET_CONST)] - ldr cur_const, [const_addr] - - eor Amo, tmp1, Amo_, ror #12 - bic tmp1, Asi_, Ase_, ror #48 - eor Amu, tmp0, Amu_, ror #44 - bic tmp0, Aso_, Asi_, ror #2 - eor Asa, tmp1, Asa_, ror #41 - bic tmp1, Asu_, Aso_, ror #25 - eor Ase, tmp0, Ase_, ror #50 - bic tmp0, Asa_, Asu_, ror #60 - eor Asi, tmp1, Asi_, ror #27 - bic tmp1, Ase_, Asa_, ror #57 - eor Aso, tmp0, Aso_, ror #21 - - mov count, #1 - - bic tmp0, Abi_, Abe_, ror #63 - eor Asu, tmp1, Asu_, ror #53 - bic tmp1, Abo_, Abi_, ror #42 - eor Aba, Aba_, tmp0, ror #21 - bic tmp0, Abu_, Abo_, ror #57 - eor Abe, tmp1, Abe_, ror #41 - bic tmp1, Aba_, Abu_, ror #50 - eor Abi, tmp0, Abi_, ror #35 - bic tmp0, Abe_, Aba_, ror #44 - eor Abo, tmp1, Abo_, ror #43 - eor Abu, tmp0, Abu_, ror #30 - - eor Aba, Aba, cur_const - str count, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT - +.macro chi_step_ror2 out, a, b, c, r1, r2 + bic X, \b\(), \c\(), ror #\r1 + eor \out\(), \a\(), X, ror #\r2 .endm -.macro eor5ror dst, src0, src1, rot1, src2, rot2, src3, rot3, src4, rot4 - eor \dst, \src0, \src1, ror \rot1 - eor \dst, \dst, \src2, ror \rot2 - eor \dst, \dst, \src3, ror \rot3 - eor \dst, \dst, \src4, ror \rot4 -.endm +.macro keccak_f1600_round_initial + eor5 X, Ama, Asa, Aba, Aga, Aka + eor5 X, Ame, Ase, Abe, Age, Ake + eor5 X, Ami, Asi, Abi, Agi, Aki + eor5 X, Amo, Aso, Abo, Ago, Ako + eor5 X, Amu, Asu, Abu, Agu, Aku + + eor X, X, X, ror #63 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + 
+ eor X, Aba, X + eor X, Abi, X + eor X, Aki, X + eor X, Ako, X + eor X, Amu, X + eor X, Aso, X + eor X, Ama, X + eor X, Abe, X + eor X, Ago, X + eor X, Ame, X + eor X, Agi, X + eor X, Aka, X + eor X, Abo, X + eor X, Amo, X + eor X, Ami, X + eor X, Ake, X + eor X, Agu, X + eor X, Asi, X + eor X, Aku, X + eor X, Asa, X + eor X, Abu, X + eor X, Asu, X + eor X, Ase, X + eor X, Aga, X + eor X, Age, X + + ldr X, [sp, #STACK_OFFSET_CONST] + ldr X, [X] + mov X, #1 + str X, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT + + chi_step_ror Aga, X, X, X, 47, 39 + chi_step_ror Age, X, X, X, 42, 25 + chi_step_ror Agi, X, X, X, 16, 58 + chi_step_ror Ago, X, X, X, 31, 47 + chi_step_ror Agu, X, X, X, 56, 23 + chi_step_ror Aka, X, X, X, 19, 24 + chi_step_ror Ake, X, X, X, 47, 2 + chi_step_ror Aki, X, X, X, 10, 57 + chi_step_ror Ako, X, X, X, 47, 57 + chi_step_ror Aku, X, X, X, 5, 52 + chi_step_ror Ama, X, X, X, 38, 47 + chi_step_ror Ame, X, X, X, 5, 43 + chi_step_ror Ami, X, X, X, 41, 46 + chi_step_ror Amo, X, X, X, 35, 12 + chi_step_ror Amu, X, X, X, 9, 44 + chi_step_ror Asa, X, X, X, 48, 41 + chi_step_ror Ase, X, X, X, 2, 50 + chi_step_ror Asi, X, X, X, 25, 27 + chi_step_ror Aso, X, X, X, 60, 21 + chi_step_ror Asu, X, X, X, 57, 53 + chi_step_ror2 Aba, X, X, X, 63, 21 + chi_step_ror Abe, X, X, X, 42, 41 + chi_step_ror Abi, X, X, X, 57, 35 + chi_step_ror Abo, X, X, X, 50, 43 + chi_step_ror Abu, X, X, X, 44, 30 + + eor Aba, Aba, X -.macro addparityror prty, dst0, src0, rot0, dst1, src1, rot1, dst2, src2, rot2, dst3, src3, rot3, dst4, src4, rot4 - eor \dst0, \prty, \src0, ror \rot0 - eor \dst1, \prty, \src1, ror \rot1 - eor \dst2, \prty, \src2, ror \rot2 - eor \dst3, \prty, \src3, ror \rot3 - eor \dst4, \prty, \src4, ror \rot4 .endm .macro keccak_f1600_round_noninitial - eor5ror C0, Aba, Aga, #61, Ama, #54, Aka, #39, Asa, #25 - eor5ror C1, Ake, Ame, #57, Abe, #51, Ase, #31, Age, #27 - eor5ror C2, Asi, Abi, #52, Aki, #48, Ami, #10, Agi, #5 - eor5ror C3, Abo, Ako, #63, 
Amo, #37, Ago, #36, Aso, #2 - eor5ror C4, Aku, Agu, #50, Amu, #34, Abu, #26, Asu, #15 - - eor E1, C0, C2, ror #61 - ror C2, C2, #62 - eor E3, C2, C4, ror #57 - ror C4, C4, #58 - eor E0, C4, C1, ror #55 - ror C1, C1, #56 - eor E2, C1, C3, ror #63 - eor E4, C3, C0, ror #63 - - addparityror E0, X, Aba, #0, X, Ama, #54, X, Aka, #39, X, Asa, #25, X, Aga, #61 - addparityror E1, X, Abe, #43, X, Ame, #49, X, Ake, #56, X, Ase, #23, X, Age, #19 - addparityror E2, X, Abi, #50, X, Aki, #46, X, Agi, #3, X, Ami, #8, X, Asi, #62 - addparityror E3, X, Ako, #63, X, Aso, #2, X, Ago, #36, X, Abo, #0, X, Amo, #37 - addparityror E4, X, Amu, #28, X, Agu, #44, X, Aku, #58, X, Abu, #20, X, Asu, #9 - - load_constant_ptr_stack - ldr count, [sp, #STACK_OFFSET_COUNT] // @slothy:reads=STACK_OFFSET_COUNT - - bic tmp0, X, X, ror #47 - bic tmp1, X, X, ror #42 - eor Aga, tmp0, X, ror #39 - bic tmp0, X, X, ror #16 - eor Age, tmp1, X, ror #25 - bic tmp1, X, X, ror #31 - eor Agi, tmp0, X, ror #58 - bic tmp0, X, X, ror #56 - eor Ago, tmp1, X, ror #47 - bic tmp1, X, X, ror #19 - eor Agu, tmp0, X, ror #23 - bic tmp0, X, X, ror #47 - eor Aka, tmp1, X, ror #24 - bic tmp1, X, X, ror #10 - eor Ake, tmp0, X, ror #2 - bic tmp0, X, X, ror #47 - eor Aki, tmp1, X, ror #57 - bic tmp1, X, X, ror #5 - eor Ako, tmp0, X, ror #57 - bic tmp0, X, X, ror #38 - eor Aku, tmp1, X, ror #52 - bic tmp1, X, X, ror #5 - eor Ama, tmp0, X, ror #47 - bic tmp0, X, X, ror #41 - eor Ame, tmp1, X, ror #43 - bic tmp1, X, X, ror #35 - eor Ami, tmp0, X, ror #46 - bic tmp0, X, X, ror #9 - - ldr cur_const, [const_addr, count, UXTW #3] - - eor Amo, tmp1, X, ror #12 - bic tmp1, X, X, ror #48 - eor Amu, tmp0, X, ror #44 - bic tmp0, X, X, ror #2 - eor Asa, tmp1, X, ror #41 - bic tmp1, X, X, ror #25 - eor Ase, tmp0, X, ror #50 - bic tmp0, X, X, ror #60 - eor Asi, tmp1, X, ror #27 - bic tmp1, X, X, ror #57 - eor Aso, tmp0, X, ror #21 - bic tmp0, X, X, ror #63 - add count, count, #1 - str count, [sp, #STACK_OFFSET_COUNT] // 
@slothy:writes=STACK_OFFSET_COUNT - eor Asu, tmp1, X, ror #53 - bic tmp1, X, X, ror #42 - eor Aba, X, tmp0, ror #21 - bic tmp0, X, X, ror #57 - eor Abe, tmp1, X, ror #41 - bic tmp1, X, X, ror #50 - eor Abi, tmp0, X, ror #35 - bic tmp0, X, X, ror #44 - - eor Abo, tmp1, X, ror #43 - eor Abu, tmp0, X, ror #30 - eor Aba, Aba, cur_const + + eor X, Aba, Aga, ror #61 + eor X, X, Ama, ror #54 + eor X, X, Aka, ror #39 + eor X, X, Asa, ror #25 + + eor X, Ake, Ame, ror #57 + eor X, X, Abe, ror #51 + eor X, X, Ase, ror #31 + eor X, X, Age, ror #27 + + eor X, Asi, Abi, ror #52 + eor X, X, Aki, ror #48 + eor X, X, Ami, ror #10 + eor X, X, Agi, ror #5 + + eor X, Abo, Ako, ror #63 + eor X, X, Amo, ror #37 + eor X, X, Ago, ror #36 + eor X, X, Aso, ror #2 + + eor X, Aku, Agu, ror #50 + eor X, X, Amu, ror #34 + eor X, X, Abu, ror #26 + eor X, X, Asu, ror #15 + + eor X, X, X, ror #61 + ror X, X, #62 + eor X, X, X, ror #57 + ror X, X, #58 + eor X, X, X, ror #55 + ror X, X, #56 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + + eor X, X, Aba + eor X, X, Abi, ror #50 + eor X, X, Aki, ror #46 + eor X, X, Ako, ror #63 + eor X, X, Amu, ror #28 + eor X, X, Aso, ror #2 + eor X, X, Ama, ror #54 + eor X, X, Abe, ror #43 + eor X, X, Ago, ror #36 + eor X, X, Ame, ror #49 + eor X, X, Agi, ror #3 + eor X, X, Aka, ror #39 + eor X, X, Abo + eor X, X, Amo, ror #37 + eor X, X, Ami, ror #8 + eor X, X, Ake, ror #56 + eor X, X, Agu, ror #44 + eor X, X, Asi, ror #62 + eor X, X, Aku, ror #58 + eor X, X, Asa, ror #25 + eor X, X, Abu, ror #20 + eor X, X, Asu, ror #9 + eor X, X, Ase, ror #23 + eor X, X, Aga, ror #61 + eor X, X, Age, ror #19 + + ldr X, [sp, #STACK_OFFSET_CONST] + ldr X, [sp, #STACK_OFFSET_COUNT] // @slothy:reads=STACK_OFFSET_COUNT + ldr X, [X, W, UXTW #3] + add X, X, #1 + cmp X, #(KECCAK_F1600_ROUNDS-1) + str X, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT + + chi_step_ror Aga, X, X, X, 47, 39 + chi_step_ror Age, X, X, X, 42, 25 + chi_step_ror Agi, X, X, X, 16, 58 + 
chi_step_ror Ago, X, X, X, 31, 47 + chi_step_ror Agu, X, X, X, 56, 23 + chi_step_ror Aka, X, X, X, 19, 24 + chi_step_ror Ake, X, X, X, 47, 2 + chi_step_ror Aki, X, X, X, 10, 57 + chi_step_ror Ako, X, X, X, 47, 57 + chi_step_ror Aku, X, X, X, 5, 52 + chi_step_ror Ama, X, X, X, 38, 47 + chi_step_ror Ame, X, X, X, 5, 43 + chi_step_ror Ami, X, X, X, 41, 46 + chi_step_ror Amo, X, X, X, 35, 12 + chi_step_ror Amu, X, X, X, 9, 44 + chi_step_ror Asa, X, X, X, 48, 41 + chi_step_ror Ase, X, X, X, 2, 50 + chi_step_ror Asi, X, X, X, 25, 27 + chi_step_ror Aso, X, X, X, 60, 21 + chi_step_ror Asu, X, X, X, 57, 53 + chi_step_ror2 Aba, X, X, X, 63, 21 + chi_step_ror Abe, X, X, X, 42, 41 + chi_step_ror Abi, X, X, X, 57, 35 + chi_step_ror Abo, X, X, X, 50, 43 + chi_step_ror Abu, X, X, X, 44, 30 + + eor Aba, Aba, X .endm .macro load_state @@ -462,28 +424,32 @@ round_constants: .global keccak_f1600_x1_scalar_slothy .global _keccak_f1600_x1_scalar_slothy -.macro load_constant_ptr_stack - ldr const_addr, [sp, #(STACK_OFFSET_CONST)] -.endm keccak_f1600_x1_scalar_slothy: _keccak_f1600_x1_scalar_slothy: alloc_stack save_gprs initial: + load_constant_ptr + str const_addr, [sp, #STACK_OFFSET_CONST] load_state str input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:writes=STACK_OFFSET_INPUT - keccak_f1600_round_initial + +initial_round_start: + keccak_f1600_round_initial +initial_round_end: + loop: keccak_f1600_round_noninitial end_loop: - cmp count, #(KECCAK_F1600_ROUNDS-1) ble loop + final: final_rotate ldr input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:reads=STACK_OFFSET_INPUT store_state end_final: + restore_gprs free_stack ret diff --git a/examples/opt/aarch64/keccak_f1600_x1_scalar_no_symbolic_opt_a55.s b/examples/opt/aarch64/keccak_f1600_x1_scalar_no_symbolic_opt_a55.s new file mode 100644 index 0000000..eb42217 --- /dev/null +++ b/examples/opt/aarch64/keccak_f1600_x1_scalar_no_symbolic_opt_a55.s @@ -0,0 +1,1101 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 
Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + + +// Author: Hanno Becker +// Author: Matthias Kannwischer + + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x26 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x28 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 + +/************************ MACROS ****************************/ + +#define STACK_LOCS 40 + +#define STACK_SIZE (16*6 + 3*8 + 8 + (STACK_LOCS) * 8) // GPRs (16*6), count (8), const (8), input (8), padding (8), spill slots (STACK_LOCS*8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +#define STACK_OFFSET_LOCS (16*6 + 4*8) +#define STACK_LOC_0 ((STACK_OFFSET_LOCS) + 0*8) +#define STACK_LOC_1 ((STACK_OFFSET_LOCS) + 1*8) +#define STACK_LOC_2 ((STACK_OFFSET_LOCS) + 2*8) +#define STACK_LOC_3 ((STACK_OFFSET_LOCS) + 3*8) +#define STACK_LOC_4 ((STACK_OFFSET_LOCS) + 4*8) +#define STACK_LOC_5 ((STACK_OFFSET_LOCS) + 5*8) +#define STACK_LOC_6 ((STACK_OFFSET_LOCS) + 6*8) +#define STACK_LOC_7 ((STACK_OFFSET_LOCS) + 7*8) +#define STACK_LOC_8 ((STACK_OFFSET_LOCS) + 8*8) +#define STACK_LOC_9 ((STACK_OFFSET_LOCS) + 9*8) +#define STACK_LOC_10 ((STACK_OFFSET_LOCS) + 10*8) +#define STACK_LOC_11 ((STACK_OFFSET_LOCS) + 11*8) +#define STACK_LOC_12 ((STACK_OFFSET_LOCS) + 12*8) +#define STACK_LOC_13 ((STACK_OFFSET_LOCS) + 13*8) +#define STACK_LOC_14 ((STACK_OFFSET_LOCS) + 14*8) +#define STACK_LOC_15 ((STACK_OFFSET_LOCS) + 15*8) +#define STACK_LOC_16 ((STACK_OFFSET_LOCS) + 16*8) +#define STACK_LOC_17 ((STACK_OFFSET_LOCS) + 17*8) +#define STACK_LOC_18 ((STACK_OFFSET_LOCS) + 18*8) +#define STACK_LOC_19 ((STACK_OFFSET_LOCS) + 19*8) +#define STACK_LOC_20 ((STACK_OFFSET_LOCS) + 20*8) +#define STACK_LOC_21 ((STACK_OFFSET_LOCS) + 21*8) +#define STACK_LOC_22 ((STACK_OFFSET_LOCS) + 22*8) +#define STACK_LOC_23 ((STACK_OFFSET_LOCS) + 23*8) +#define STACK_LOC_24 
((STACK_OFFSET_LOCS) + 24*8) +#define STACK_LOC_25 ((STACK_OFFSET_LOCS) + 25*8) +#define STACK_LOC_26 ((STACK_OFFSET_LOCS) + 26*8) +#define STACK_LOC_27 ((STACK_OFFSET_LOCS) + 27*8) +#define STACK_LOC_28 ((STACK_OFFSET_LOCS) + 28*8) +#define STACK_LOC_29 ((STACK_OFFSET_LOCS) + 29*8) +#define STACK_LOC_30 ((STACK_OFFSET_LOCS) + 30*8) +#define STACK_LOC_31 ((STACK_OFFSET_LOCS) + 31*8) +#define STACK_LOC_32 ((STACK_OFFSET_LOCS) + 32*8) +#define STACK_LOC_33 ((STACK_OFFSET_LOCS) + 33*8) +#define STACK_LOC_34 ((STACK_OFFSET_LOCS) + 34*8) +#define STACK_LOC_35 ((STACK_OFFSET_LOCS) + 35*8) +#define STACK_LOC_36 ((STACK_OFFSET_LOCS) + 36*8) +#define STACK_LOC_37 ((STACK_OFFSET_LOCS) + 37*8) +#define STACK_LOC_38 ((STACK_OFFSET_LOCS) + 38*8) +#define STACK_LOC_39 ((STACK_OFFSET_LOCS) + 39*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro chi_step_ror out, a, b, c, r1, r2 + bic X, \b\(), \c\(), ror #\r1 + eor \out\(), X, \a\(), ror #\r2 +.endm + +.macro chi_step_ror2 out, a, b, c, r1, r2 + bic X, \b\(), \c\(), ror #\r1 + eor \out\(), \a\(), X, ror #\r2 +.endm + +.macro keccak_f1600_round_initial + eor5 X, Ama, Asa, Aba, Aga, Aka 
+ eor5 X, Ame, Ase, Abe, Age, Ake + eor5 X, Ami, Asi, Abi, Agi, Aki + eor5 X, Amo, Aso, Abo, Ago, Ako + eor5 X, Amu, Asu, Abu, Agu, Aku + + eor X, X, X, ror #63 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + + eor X, Aba, X + eor X, Abi, X + eor X, Aki, X + eor X, Ako, X + eor X, Amu, X + eor X, Aso, X + eor X, Ama, X + eor X, Abe, X + eor X, Ago, X + eor X, Ame, X + eor X, Agi, X + eor X, Aka, X + eor X, Abo, X + eor X, Amo, X + eor X, Ami, X + eor X, Ake, X + eor X, Agu, X + eor X, Asi, X + eor X, Aku, X + eor X, Asa, X + eor X, Abu, X + eor X, Asu, X + eor X, Ase, X + eor X, Aga, X + eor X, Age, X + + ldr X, [sp, #STACK_OFFSET_CONST] + ldr X, [X] + mov X, #1 + str X, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT + + chi_step_ror Aga, X, X, X, 47, 39 + chi_step_ror Age, X, X, X, 42, 25 + chi_step_ror Agi, X, X, X, 16, 58 + chi_step_ror Ago, X, X, X, 31, 47 + chi_step_ror Agu, X, X, X, 56, 23 + chi_step_ror Aka, X, X, X, 19, 24 + chi_step_ror Ake, X, X, X, 47, 2 + chi_step_ror Aki, X, X, X, 10, 57 + chi_step_ror Ako, X, X, X, 47, 57 + chi_step_ror Aku, X, X, X, 5, 52 + chi_step_ror Ama, X, X, X, 38, 47 + chi_step_ror Ame, X, X, X, 5, 43 + chi_step_ror Ami, X, X, X, 41, 46 + chi_step_ror Amo, X, X, X, 35, 12 + chi_step_ror Amu, X, X, X, 9, 44 + chi_step_ror Asa, X, X, X, 48, 41 + chi_step_ror Ase, X, X, X, 2, 50 + chi_step_ror Asi, X, X, X, 25, 27 + chi_step_ror Aso, X, X, X, 60, 21 + chi_step_ror Asu, X, X, X, 57, 53 + chi_step_ror2 Aba, X, X, X, 63, 21 + chi_step_ror Abe, X, X, X, 42, 41 + chi_step_ror Abi, X, X, X, 57, 35 + chi_step_ror Abo, X, X, X, 50, 43 + chi_step_ror Abu, X, X, X, 44, 30 + + eor Aba, Aba, X + +.endm + +.macro keccak_f1600_round_noninitial + + eor X, Aba, Aga, ror #61 + eor X, X, Ama, ror #54 + eor X, X, Aka, ror #39 + eor X, X, Asa, ror #25 + + eor X, Ake, Ame, ror #57 + eor X, X, Abe, ror #51 + eor X, X, Ase, ror #31 + eor X, X, Age, ror #27 + + eor X, Asi, Abi, ror #52 + eor 
X, X, Aki, ror #48 + eor X, X, Ami, ror #10 + eor X, X, Agi, ror #5 + + eor X, Abo, Ako, ror #63 + eor X, X, Amo, ror #37 + eor X, X, Ago, ror #36 + eor X, X, Aso, ror #2 + + eor X, Aku, Agu, ror #50 + eor X, X, Amu, ror #34 + eor X, X, Abu, ror #26 + eor X, X, Asu, ror #15 + + eor X, X, X, ror #61 + ror X, X, #62 + eor X, X, X, ror #57 + ror X, X, #58 + eor X, X, X, ror #55 + ror X, X, #56 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + + eor X, X, Aba + eor X, X, Abi, ror #50 + eor X, X, Aki, ror #46 + eor X, X, Ako, ror #63 + eor X, X, Amu, ror #28 + eor X, X, Aso, ror #2 + eor X, X, Ama, ror #54 + eor X, X, Abe, ror #43 + eor X, X, Ago, ror #36 + eor X, X, Ame, ror #49 + eor X, X, Agi, ror #3 + eor X, X, Aka, ror #39 + eor X, X, Abo + eor X, X, Amo, ror #37 + eor X, X, Ami, ror #8 + eor X, X, Ake, ror #56 + eor X, X, Agu, ror #44 + eor X, X, Asi, ror #62 + eor X, X, Aku, ror #58 + eor X, X, Asa, ror #25 + eor X, X, Abu, ror #20 + eor X, X, Asu, ror #9 + eor X, X, Ase, ror #23 + eor X, X, Aga, ror #61 + eor X, X, Age, ror #19 + + ldr X, [sp, #STACK_OFFSET_CONST] + ldr X, [sp, #STACK_OFFSET_COUNT] // @slothy:reads=STACK_OFFSET_COUNT + ldr X, [X, W, UXTW #3] + add X, X, #1 + cmp X, #(KECCAK_F1600_ROUNDS-1) + str X, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT + + chi_step_ror Aga, X, X, X, 47, 39 + chi_step_ror Age, X, X, X, 42, 25 + chi_step_ror Agi, X, X, X, 16, 58 + chi_step_ror Ago, X, X, X, 31, 47 + chi_step_ror Agu, X, X, X, 56, 23 + chi_step_ror Aka, X, X, X, 19, 24 + chi_step_ror Ake, X, X, X, 47, 2 + chi_step_ror Aki, X, X, X, 10, 57 + chi_step_ror Ako, X, X, X, 47, 57 + chi_step_ror Aku, X, X, X, 5, 52 + chi_step_ror Ama, X, X, X, 38, 47 + chi_step_ror Ame, X, X, X, 5, 43 + chi_step_ror Ami, X, X, X, 41, 46 + chi_step_ror Amo, X, X, X, 35, 12 + chi_step_ror Amu, X, X, X, 9, 44 + chi_step_ror Asa, X, X, X, 48, 41 + chi_step_ror Ase, X, X, X, 2, 50 + chi_step_ror Asi, X, X, X, 25, 27 + chi_step_ror Aso, X, X, X, 60, 21 + chi_step_ror Asu, 
X, X, X, 57, 53 + chi_step_ror2 Aba, X, X, X, 63, 21 + chi_step_ror Abe, X, X, X, 42, 41 + chi_step_ror Abi, X, X, X, 57, 35 + chi_step_ror Abo, X, X, X, 50, 43 + chi_step_ror Abu, X, X, X, 44, 30 + + eor Aba, Aba, X +.endm + +.macro load_state + ldp Aba, Abe, [input_addr, #(1*8*0)] + ldp Abi, Abo, [input_addr, #(1*8*2)] + ldp Abu, Aga, [input_addr, #(1*8*4)] + ldp Age, Agi, [input_addr, #(1*8*6)] + ldp Ago, Agu, [input_addr, #(1*8*8)] + ldp Aka, Ake, [input_addr, #(1*8*10)] + ldp Aki, Ako, [input_addr, #(1*8*12)] + ldp Aku, Ama, [input_addr, #(1*8*14)] + ldp Ame, Ami, [input_addr, #(1*8*16)] + ldp Amo, Amu, [input_addr, #(1*8*18)] + ldp Asa, Ase, [input_addr, #(1*8*20)] + ldp Asi, Aso, [input_addr, #(1*8*22)] + ldr Asu, [input_addr, #(1*8*24)] +.endm + +.macro store_state + stp Aba, Abe, [input_addr, #(1*8*0)] + stp Abi, Abo, [input_addr, #(1*8*2)] + stp Abu, Aga, [input_addr, #(1*8*4)] + stp Age, Agi, [input_addr, #(1*8*6)] + stp Ago, Agu, [input_addr, #(1*8*8)] + stp Aka, Ake, [input_addr, #(1*8*10)] + stp Aki, Ako, [input_addr, #(1*8*12)] + stp Aku, Ama, [input_addr, #(1*8*14)] + stp Ame, Ami, [input_addr, #(1*8*16)] + stp Amo, Amu, [input_addr, #(1*8*18)] + stp Asa, Ase, [input_addr, #(1*8*20)] + stp Asi, Aso, [input_addr, #(1*8*22)] + str Asu, [input_addr, #(1*8*24)] +.endm + +.macro final_rotate + ror Abe, Abe,#(64-21) + ror Abi, Abi,#(64-14) + ror Abu, Abu,#(64-44) + ror Aga, Aga,#(64-3) + ror Age, Age,#(64-45) + ror Agi, Agi,#(64-61) + ror Ago, Ago,#(64-28) + ror Agu, Agu,#(64-20) + ror Aka, Aka,#(64-25) + ror Ake, Ake,#(64-8) + ror Aki, Aki,#(64-18) + ror Ako, Ako,#(64-1) + ror Aku, Aku,#(64-6) + ror Ama, Ama,#(64-10) + ror Ame, Ame,#(64-15) + ror Ami, Ami,#(64-56) + ror Amo, Amo,#(64-27) + ror Amu, Amu,#(64-36) + ror Asa, Asa,#(64-39) + ror Ase, Ase,#(64-41) + ror Asi, Asi,#(64-2) + ror Aso, Aso,#(64-62) + ror Asu, Asu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global keccak_f1600_x1_scalar_slothy_opt_a55 +.global 
_keccak_f1600_x1_scalar_slothy_opt_a55 + +keccak_f1600_x1_scalar_slothy_opt_a55: +_keccak_f1600_x1_scalar_slothy_opt_a55: + alloc_stack + save_gprs + +initial: + load_constant_ptr + str const_addr, [sp, #STACK_OFFSET_CONST] + load_state + str input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:writes=STACK_OFFSET_INPUT + + initial_round_start: + // Instructions: 105 + // Expected cycles: 53 + // Expected IPC: 1.98 + // + // Wall time: 7.19s + // User time: 7.19s + // + // ----------------- cycle (expected) -----------------> + // 0 25 50 + // |------------------------|------------------------|-- + eor x0, x4, x5 // *.................................................... + eor x26, x0, x1 // *.................................................... + eor x27, x26, x2 // .*................................................... + eor x26, x27, x3 // .*................................................... + str x26, [sp, #STACK_LOC_0] // .s................................................... // @slothy:is_spill // @slothy:writes=stack_0 + eor x29, x9, x10 // ..*.................................................. + eor x30, x29, x6 // ..*.................................................. + eor x0, x30, x7 // ...*................................................. + eor x27, x0, x8 // ...*................................................. + str x27, [sp, #STACK_LOC_1] // ...s................................................. // @slothy:is_spill // @slothy:writes=stack_1 + eor x26, x14, x15 // ....*................................................ + eor x27, x26, x11 // ....*................................................ + eor x29, x27, x12 // .....*............................................... + eor x27, x29, x13 // .....*............................................... + str x27, [sp, #STACK_LOC_2] // .....s............................................... // @slothy:is_spill // @slothy:writes=stack_2 + eor x30, x19, x20 // ......*.............................................. 
+ eor x0, x30, x16 // ......*.............................................. + eor x26, x0, x17 // .......*............................................. + eor x29, x26, x28 // .......*............................................. + ldr x26, [sp, #STACK_LOC_0] // .......r............................................. // @slothy:is_restore // @slothy:reads=stack_0 + str x29, [sp, #STACK_LOC_0] // .......s............................................. // @slothy:is_spill // @slothy:writes=stack_0 + eor x27, x24, x25 // ........*............................................ + eor x29, x27, x21 // ........*............................................ + ldr x27, [sp, #STACK_LOC_2] // ........r............................................ // @slothy:is_restore // @slothy:reads=stack_2 + eor x30, x29, x22 // .........*........................................... + eor x0, x30, x23 // .........*........................................... + str x0, [sp, #STACK_LOC_2] // .........s........................................... // @slothy:is_spill // @slothy:writes=stack_2 + eor x29, x26, x27, ror #63 // ..........*.......................................... + ldr x0, [sp, #STACK_LOC_2] // ..........r.......................................... // @slothy:is_restore // @slothy:reads=stack_2 + str x29, [sp, #STACK_LOC_2] // ..........s.......................................... // @slothy:is_spill // @slothy:writes=stack_2 + eor x30, x27, x0, ror #63 // ..........*.......................................... + ldr x27, [sp, #STACK_LOC_1] // ..........r.......................................... // @slothy:is_restore // @slothy:reads=stack_1 + ldr x29, [sp, #STACK_LOC_0] // ..........r.......................................... // @slothy:is_restore // @slothy:reads=stack_0 + eor x0, x0, x27, ror #63 // ...........*......................................... + str x0, [sp, #STACK_LOC_0] // ...........s......................................... 
// @slothy:is_spill // @slothy:writes=stack_0 + eor x27, x27, x29, ror #63 // ...........*......................................... + ldr x0, [sp, #STACK_LOC_0] // ...........r......................................... // @slothy:is_restore // @slothy:reads=stack_0 + str x27, [sp, #STACK_LOC_0] // ...........s......................................... // @slothy:is_spill // @slothy:writes=stack_0 + eor x26, x29, x26, ror #63 // ............*........................................ + ldr x29, [sp, #STACK_LOC_2] // ............r........................................ // @slothy:is_restore // @slothy:reads=stack_2 + ldr x27, [sp, #STACK_LOC_0] // ............r........................................ // @slothy:is_restore // @slothy:reads=stack_0 + str x26, [sp, #STACK_LOC_0] // ............s........................................ // @slothy:is_spill // @slothy:writes=stack_0 + eor x26, x1, x0 // ............*........................................ + str x26, [sp, #STACK_LOC_1] // ............s........................................ // @slothy:is_spill // @slothy:writes=stack_1 + eor x26, x11, x27 // .............*....................................... + str x26, [sp, #STACK_LOC_2] // .............s....................................... // @slothy:is_spill // @slothy:writes=stack_2 + eor x13, x13, x27 // .............*....................................... + ldr x26, [sp, #STACK_LOC_0] // .............r....................................... // @slothy:is_restore // @slothy:reads=stack_0 + str x13, [sp, #STACK_LOC_0] // .............s....................................... // @slothy:is_spill // @slothy:writes=stack_0 + eor x13, x28, x30 // ..............*...................................... + str x13, [sp, #STACK_LOC_3] // ..............s...................................... // @slothy:is_spill // @slothy:writes=stack_3 + eor x24, x24, x26 // ..............*...................................... 
+ str x24, [sp, #STACK_LOC_4] // ..............s...................................... // @slothy:is_spill // @slothy:writes=stack_4 + eor x20, x20, x30 // ...............*..................................... + str x20, [sp, #STACK_LOC_5] // ...............s..................................... // @slothy:is_spill // @slothy:writes=stack_5 + eor x20, x4, x0 // ...............*..................................... + str x20, [sp, #STACK_LOC_6] // ...............s..................................... // @slothy:is_spill // @slothy:writes=stack_6 + eor x20, x6, x29 // ................*.................................... + str x20, [sp, #STACK_LOC_7] // ................s.................................... // @slothy:is_spill // @slothy:writes=stack_7 + eor x4, x17, x30 // ................*.................................... + str x4, [sp, #STACK_LOC_8] // ................s.................................... // @slothy:is_spill // @slothy:writes=stack_8 + eor x13, x9, x29 // .................*................................... + eor x4, x12, x27 // .................*................................... + str x4, [sp, #STACK_LOC_9] // .................s................................... // @slothy:is_spill // @slothy:writes=stack_9 + eor x11, x3, x0 // ..................*.................................. + eor x24, x16, x30 // ..................*.................................. + eor x20, x19, x30 // ...................*................................. + str x20, [sp, #STACK_LOC_10] // ...................s................................. // @slothy:is_spill // @slothy:writes=stack_10 + eor x4, x14, x27 // ...................*................................. + str x4, [sp, #STACK_LOC_11] // ...................s................................. // @slothy:is_spill // @slothy:writes=stack_11 + eor x14, x8, x29 // ....................*................................ + eor x4, x22, x26 // ....................*................................ 
+ str x4, [sp, #STACK_LOC_12] // ....................s................................ // @slothy:is_spill // @slothy:writes=stack_12 + eor x15, x15, x27 // .....................*............................... + ldr x4, [sp, #STACK_LOC_12] // .....................r............................... // @slothy:is_restore // @slothy:reads=stack_12 + eor x23, x23, x26 // .....................*............................... + str x23, [sp, #STACK_LOC_12] // .....................s............................... // @slothy:is_spill // @slothy:writes=stack_12 + eor x16, x5, x0 // ......................*.............................. + eor x27, x21, x26 // ......................*.............................. + eor x23, x25, x26 // .......................*............................. + ldr x26, [sp, #STACK_LOC_2] // .......................r............................. // @slothy:is_restore // @slothy:reads=stack_2 + str x23, [sp, #STACK_LOC_2] // .......................s............................. // @slothy:is_spill // @slothy:writes=stack_2 + eor x21, x10, x29 // .......................*............................. + str x21, [sp, #STACK_LOC_13] // .......................s............................. // @slothy:is_spill // @slothy:writes=stack_13 + eor x5, x2, x0 // ........................*............................ + eor x21, x7, x29 // ........................*............................ + str x21, [sp, #STACK_LOC_14] // ........................s............................ // @slothy:is_spill // @slothy:writes=stack_14 + ldr x20, [sp, #STACK_OFFSET_CONST] // .........................*........................... + ldr x20, [x20] // .........................*........................... + str x20, [sp, #STACK_LOC_15] // .........................s........................... // @slothy:is_spill // @slothy:writes=stack_15 + mov x20, #1 // ..........................*.......................... 
+ str x20, [sp, #STACK_OFFSET_COUNT] // ..........................*.......................... // @slothy:writes=STACK_OFFSET_COUNT + bic x20, x11, x4, ror #47 // ...........................*......................... + eor x2, x20, x24, ror #39 // ...........................*......................... + str x2, [sp, #STACK_LOC_16] // ...........................s......................... // @slothy:is_spill // @slothy:writes=stack_16 + bic x1, x13, x11, ror #42 // ............................*........................ + eor x7, x1, x4, ror #25 // ............................*........................ + str x7, [sp, #STACK_LOC_17] // ............................s........................ // @slothy:is_spill // @slothy:writes=stack_17 + bic x28, x15, x13, ror #16 // .............................*....................... + eor x12, x28, x11, ror #58 // .............................*....................... + str x12, [sp, #STACK_LOC_18] // .............................s....................... // @slothy:is_spill // @slothy:writes=stack_18 + bic x20, x24, x15, ror #31 // ..............................*...................... + eor x17, x20, x13, ror #47 // ..............................*...................... + ldr x13, [sp, #STACK_LOC_3] // ..............................r...................... // @slothy:is_restore // @slothy:reads=stack_3 + ldr x20, [sp, #STACK_LOC_7] // ..............................r...................... // @slothy:is_restore // @slothy:reads=stack_7 + str x17, [sp, #STACK_LOC_3] // ..............................s...................... // @slothy:is_spill // @slothy:writes=stack_3 + bic x6, x4, x24, ror #56 // ...............................*..................... + ldr x24, [sp, #STACK_LOC_4] // ...............................r..................... // @slothy:is_restore // @slothy:reads=stack_4 + ldr x4, [sp, #STACK_LOC_9] // ...............................r..................... 
// @slothy:is_restore // @slothy:reads=stack_9 + eor x22, x6, x15, ror #23 // ...............................*..................... + str x22, [sp, #STACK_LOC_4] // ...............................s..................... // @slothy:is_spill // @slothy:writes=stack_4 + bic x17, x13, x4, ror #19 // ................................*.................... + eor x3, x17, x20, ror #24 // ................................*.................... + ldr x17, [sp, #STACK_LOC_3] // ................................r.................... // @slothy:is_restore // @slothy:reads=stack_3 + str x3, [sp, #STACK_LOC_3] // ................................s.................... // @slothy:is_spill // @slothy:writes=stack_3 + bic x9, x24, x13, ror #47 // .................................*................... + eor x8, x9, x4, ror #2 // .................................*................... + str x8, [sp, #STACK_LOC_7] // .................................s................... // @slothy:is_spill // @slothy:writes=stack_7 + bic x12, x16, x24, ror #10 // ..................................*.................. + eor x13, x12, x13, ror #57 // ..................................*.................. + ldr x12, [sp, #STACK_LOC_18] // ..................................r.................. // @slothy:is_restore // @slothy:reads=stack_18 + str x13, [sp, #STACK_LOC_9] // ..................................s.................. // @slothy:is_spill // @slothy:writes=stack_9 + bic x3, x20, x16, ror #47 // ...................................*................. + ldr x13, [sp, #STACK_LOC_0] // ...................................r................. // @slothy:is_restore // @slothy:reads=stack_0 + eor x28, x3, x24, ror #57 // ...................................*................. + ldr x3, [sp, #STACK_LOC_3] // ...................................r................. // @slothy:is_restore // @slothy:reads=stack_3 + str x28, [sp, #STACK_LOC_0] // ...................................s................. 
// @slothy:is_spill // @slothy:writes=stack_0 + bic x19, x4, x20, ror #5 // ....................................*................ + eor x23, x19, x16, ror #52 // ....................................*................ + str x23, [sp, #STACK_LOC_3] // ....................................s................ // @slothy:is_spill // @slothy:writes=stack_3 + bic x30, x14, x5, ror #38 // .....................................*............... + ldr x23, [sp, #STACK_LOC_12] // .....................................r............... // @slothy:is_restore // @slothy:reads=stack_12 + eor x4, x30, x27, ror #47 // .....................................*............... + str x4, [sp, #STACK_LOC_12] // .....................................s............... // @slothy:is_spill // @slothy:writes=stack_12 + ldr x4, [sp, #STACK_LOC_11] // ......................................r.............. // @slothy:is_restore // @slothy:reads=stack_11 + bic x20, x4, x14, ror #5 // ......................................*.............. + eor x9, x20, x5, ror #43 // .......................................*............. + ldr x20, [sp, #STACK_LOC_5] // .......................................r............. // @slothy:is_restore // @slothy:reads=stack_5 + str x9, [sp, #STACK_LOC_5] // .......................................s............. // @slothy:is_spill // @slothy:writes=stack_5 + bic x8, x20, x4, ror #41 // .......................................*............. + ldr x9, [sp, #STACK_LOC_5] // .......................................r............. // @slothy:is_restore // @slothy:reads=stack_5 + eor x14, x8, x14, ror #46 // ........................................*............ + ldr x8, [sp, #STACK_LOC_7] // ........................................r............ // @slothy:is_restore // @slothy:reads=stack_7 + str x14, [sp, #STACK_LOC_5] // ........................................s............ 
// @slothy:is_spill // @slothy:writes=stack_5 + bic x22, x27, x20, ror #35 // ........................................*............ + ldr x14, [sp, #STACK_LOC_5] // ........................................r............ // @slothy:is_restore // @slothy:reads=stack_5 + eor x19, x22, x4, ror #12 // .........................................*........... + ldr x4, [sp, #STACK_LOC_8] // .........................................r........... // @slothy:is_restore // @slothy:reads=stack_8 + ldr x22, [sp, #STACK_LOC_4] // .........................................r........... // @slothy:is_restore // @slothy:reads=stack_4 + str x19, [sp, #STACK_LOC_4] // .........................................s........... // @slothy:is_spill // @slothy:writes=stack_4 + bic x21, x5, x27, ror #9 // .........................................*........... + ldr x19, [sp, #STACK_LOC_4] // .........................................r........... // @slothy:is_restore // @slothy:reads=stack_4 + eor x24, x21, x20, ror #44 // ..........................................*.......... + ldr x21, [sp, #STACK_LOC_13] // ..........................................r.......... // @slothy:is_restore // @slothy:reads=stack_13 + str x24, [sp, #STACK_LOC_4] // ..........................................s.......... // @slothy:is_spill // @slothy:writes=stack_4 + bic x20, x23, x4, ror #48 // ..........................................*.......... + ldr x24, [sp, #STACK_LOC_4] // ..........................................r.......... // @slothy:is_restore // @slothy:reads=stack_4 + eor x5, x20, x26, ror #41 // ...........................................*......... + ldr x20, [sp, #STACK_LOC_6] // ...........................................r......... // @slothy:is_restore // @slothy:reads=stack_6 + str x5, [sp, #STACK_LOC_4] // ...........................................s......... // @slothy:is_spill // @slothy:writes=stack_4 + bic x25, x20, x23, ror #2 // ...........................................*......... 
+ ldr x5, [sp, #STACK_LOC_4] // ...........................................r......... // @slothy:is_restore // @slothy:reads=stack_4 + eor x10, x25, x4, ror #50 // ............................................*........ + str x10, [sp, #STACK_LOC_4] // ............................................s........ // @slothy:is_spill // @slothy:writes=stack_4 + bic x10, x21, x20, ror #25 // ............................................*........ + eor x15, x10, x23, ror #27 // .............................................*....... + ldr x23, [sp, #STACK_LOC_2] // .............................................r....... // @slothy:is_restore // @slothy:reads=stack_2 + str x15, [sp, #STACK_LOC_2] // .............................................s....... // @slothy:is_spill // @slothy:writes=stack_2 + bic x10, x26, x21, ror #60 // .............................................*....... + ldr x15, [sp, #STACK_LOC_2] // .............................................r....... // @slothy:is_restore // @slothy:reads=stack_2 + eor x20, x10, x20, ror #21 // ..............................................*...... + ldr x10, [sp, #STACK_LOC_4] // ..............................................r...... // @slothy:is_restore // @slothy:reads=stack_4 + str x20, [sp, #STACK_LOC_2] // ..............................................s...... // @slothy:is_spill // @slothy:writes=stack_2 + bic x2, x4, x26, ror #57 // ..............................................*...... + ldr x26, [sp, #STACK_LOC_1] // ..............................................r...... // @slothy:is_restore // @slothy:reads=stack_1 + ldr x20, [sp, #STACK_LOC_10] // ..............................................r...... // @slothy:is_restore // @slothy:reads=stack_10 + ldr x4, [sp, #STACK_LOC_12] // ..............................................r...... // @slothy:is_restore // @slothy:reads=stack_12 + eor x25, x2, x21, ror #53 // ...............................................*..... 
+ ldr x21, [sp, #STACK_LOC_14] // ...............................................r..... // @slothy:is_restore // @slothy:reads=stack_14 + ldr x2, [sp, #STACK_LOC_16] // ...............................................r..... // @slothy:is_restore // @slothy:reads=stack_16 + str x25, [sp, #STACK_LOC_1] // ...............................................s..... // @slothy:is_spill // @slothy:writes=stack_1 + bic x0, x13, x21, ror #63 // ...............................................*..... + ldr x25, [sp, #STACK_LOC_1] // ...............................................r..... // @slothy:is_restore // @slothy:reads=stack_1 + eor x11, x26, x0, ror #21 // ................................................*.... + str x11, [sp, #STACK_LOC_1] // ................................................s.... // @slothy:is_spill // @slothy:writes=stack_1 + bic x7, x20, x13, ror #42 // ................................................*.... + eor x6, x7, x21, ror #41 // .................................................*... + ldr x7, [sp, #STACK_LOC_17] // .................................................r... // @slothy:is_restore // @slothy:reads=stack_17 + str x6, [sp, #STACK_LOC_4] // .................................................s... // @slothy:is_spill // @slothy:writes=stack_4 + bic x29, x23, x20, ror #57 // .................................................*... + ldr x6, [sp, #STACK_LOC_4] // .................................................r... // @slothy:is_restore // @slothy:reads=stack_4 + eor x11, x29, x13, ror #35 // ..................................................*.. + ldr x13, [sp, #STACK_LOC_9] // ..................................................r.. // @slothy:is_restore // @slothy:reads=stack_9 + str x11, [sp, #STACK_LOC_4] // ..................................................s.. // @slothy:is_spill // @slothy:writes=stack_4 + bic x1, x26, x23, ror #50 // ..................................................*.. 
+ ldr x11, [sp, #STACK_LOC_1] // ..................................................r.. // @slothy:is_restore // @slothy:reads=stack_1 + eor x16, x1, x20, ror #43 // ...................................................*. + ldr x20, [sp, #STACK_LOC_15] // ...................................................r. // @slothy:is_restore // @slothy:reads=stack_15 + str x16, [sp, #STACK_LOC_1] // ...................................................s. // @slothy:is_spill // @slothy:writes=stack_1 + bic x28, x21, x26, ror #44 // ...................................................*. + ldr x16, [sp, #STACK_LOC_1] // ...................................................r. // @slothy:is_restore // @slothy:reads=stack_1 + eor x21, x28, x23, ror #30 // ....................................................* + ldr x28, [sp, #STACK_LOC_0] // ....................................................r // @slothy:is_restore // @slothy:reads=stack_0 + ldr x23, [sp, #STACK_LOC_3] // ....................................................r // @slothy:is_restore // @slothy:reads=stack_3 + eor x1, x11, x20 // ....................................................* + ldr x20, [sp, #STACK_LOC_2] // ....................................................r // @slothy:is_restore // @slothy:reads=stack_2 + ldr x11, [sp, #STACK_LOC_4] // ....................................................r // @slothy:is_restore // @slothy:reads=stack_4 + + // ----------------- cycle (expected) -----------------> + // 0 25 50 + // |------------------------|------------------------|-- + // eor X, x4, x5 // *.................................................... + // eor X, X, x1 // *.................................................... + // eor X, X, x2 // .*................................................... + // eor X, X, x3 // .*................................................... + // eor X, x9, x10 // ..*.................................................. + // eor X, X, x6 // ..*.................................................. 
+ // eor X, X, x7 // ...*................................................. + // eor X, X, x8 // ...*................................................. + // eor X, x14, x15 // ....*................................................ + // eor X, X, x11 // ....*................................................ + // eor X, X, x12 // .....*............................................... + // eor X, X, x13 // .....*............................................... + // eor X, x19, x20 // ......*.............................................. + // eor X, X, x16 // ......*.............................................. + // eor X, X, x17 // .......*............................................. + // eor X, X, x28 // .......*............................................. + // eor X, x24, x25 // ........*............................................ + // eor X, X, x21 // ........*............................................ + // eor X, X, x22 // .........*........................................... + // eor X, X, x23 // .........*........................................... + // eor X, X, X, ror #63 // ..........*.......................................... + // eor X, X, X, ror #63 // ..........*.......................................... + // eor X, X, X, ror #63 // ...........*......................................... + // eor X, X, X, ror #63 // ...........*......................................... + // eor X, X, X, ror #63 // ............*........................................ + // eor X, x1, X // ............*........................................ + // eor X, x11, X // .............*....................................... + // eor X, x13, X // .............*....................................... + // eor X, x28, X // ..............*...................................... + // eor X, x24, X // ..............*...................................... + // eor X, x20, X // ...............*..................................... 
+ // eor X, x4, X // ...............*..................................... + // eor X, x6, X // ................*.................................... + // eor X, x17, X // ................*.................................... + // eor X, x9, X // .................*................................... + // eor X, x12, X // .................*................................... + // eor X, x3, X // ..................*.................................. + // eor X, x16, X // ..................*.................................. + // eor X, x19, X // ...................*................................. + // eor X, x14, X // ...................*................................. + // eor X, x8, X // ....................*................................ + // eor X, x22, X // ....................*................................ + // eor X, x15, X // .....................*............................... + // eor X, x23, X // .....................*............................... + // eor X, x5, X // ......................*.............................. + // eor X, x21, X // ......................*.............................. + // eor X, x25, X // .......................*............................. + // eor X, x10, X // .......................*............................. + // eor X, x2, X // ........................*............................ + // eor X, x7, X // ........................*............................ + // ldr X, [sp, #STACK_OFFSET_CONST] // .........................*........................... + // ldr X, [X] // .........................*........................... + // mov X, #1 // ..........................*.......................... + // str X, [sp, #STACK_OFFSET_COUNT] // ..........................*.......................... + // bic X, X, X, ror #47 // ...........................*......................... + // eor x2, X, X, ror #39 // ...........................*......................... 
+ // bic X, X, X, ror #42 // ............................*........................ + // eor x7, X, X, ror #25 // ............................*........................ + // bic X, X, X, ror #16 // .............................*....................... + // eor x12, X, X, ror #58 // .............................*....................... + // bic X, X, X, ror #31 // ..............................*...................... + // eor x17, X, X, ror #47 // ..............................*...................... + // bic X, X, X, ror #56 // ...............................*..................... + // eor x22, X, X, ror #23 // ...............................*..................... + // bic X, X, X, ror #19 // ................................*.................... + // eor x3, X, X, ror #24 // ................................*.................... + // bic X, X, X, ror #47 // .................................*................... + // eor x8, X, X, ror #2 // .................................*................... + // bic X, X, X, ror #10 // ..................................*.................. + // eor x13, X, X, ror #57 // ..................................*.................. + // bic X, X, X, ror #47 // ...................................*................. + // eor x28, X, X, ror #57 // ...................................*................. + // bic X, X, X, ror #5 // ....................................*................ + // eor x23, X, X, ror #52 // ....................................*................ + // bic X, X, X, ror #38 // .....................................*............... + // eor x4, X, X, ror #47 // .....................................*............... + // bic X, X, X, ror #5 // ......................................*.............. + // eor x9, X, X, ror #43 // .......................................*............. + // bic X, X, X, ror #41 // .......................................*............. 
+ // eor x14, X, X, ror #46 // ........................................*............ + // bic X, X, X, ror #35 // ........................................*............ + // eor x19, X, X, ror #12 // .........................................*........... + // bic X, X, X, ror #9 // .........................................*........... + // eor x24, X, X, ror #44 // ..........................................*.......... + // bic X, X, X, ror #48 // ..........................................*.......... + // eor x5, X, X, ror #41 // ...........................................*......... + // bic X, X, X, ror #2 // ...........................................*......... + // eor x10, X, X, ror #50 // ............................................*........ + // bic X, X, X, ror #25 // ............................................*........ + // eor x15, X, X, ror #27 // .............................................*....... + // bic X, X, X, ror #60 // .............................................*....... + // eor x20, X, X, ror #21 // ..............................................*...... + // bic X, X, X, ror #57 // ..............................................*...... + // eor x25, X, X, ror #53 // ...............................................*..... + // bic X, X, X, ror #63 // ...............................................*..... + // eor x1, X, X, ror #21 // ................................................*.... + // bic X, X, X, ror #42 // ................................................*.... + // eor x6, X, X, ror #41 // .................................................*... + // bic X, X, X, ror #57 // .................................................*... + // eor x11, X, X, ror #35 // ..................................................*.. + // bic X, X, X, ror #50 // ..................................................*.. + // eor x16, X, X, ror #43 // ...................................................*. 
+ // bic X, X, X, ror #44 // ...................................................*. + // eor x21, X, X, ror #30 // ....................................................* + // eor x1, x1, X // ....................................................* + + initial_round_end: + + + loop: + // Instructions: 110 + // Expected cycles: 55 + // Expected IPC: 2.00 + // + // Wall time: 2.48s + // User time: 2.48s + // + // ------------------ cycle (expected) ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + eor x0, x1, x2, ror #61 // *...................................................... + eor x0, x0, x4, ror #54 // *...................................................... + eor x27, x0, x3, ror #39 // .*..................................................... + eor x0, x27, x5, ror #25 // .*..................................................... + str x0, [sp, #STACK_LOC_0] // .s..................................................... // @slothy:is_spill // @slothy:writes=stack_0 + eor x26, x8, x9, ror #57 // ..*.................................................... + eor x29, x26, x6, ror #51 // ..*.................................................... + eor x26, x29, x10, ror #31 // ...*................................................... + eor x26, x26, x7, ror #27 // ...*................................................... + str x26, [sp, #STACK_LOC_1] // ...s................................................... // @slothy:is_spill // @slothy:writes=stack_1 + eor x30, x15, x11, ror #52 // ....*.................................................. + eor x26, x30, x13, ror #48 // ....*.................................................. + eor x26, x26, x14, ror #10 // .....*................................................. + eor x27, x26, x12, ror #5 // .....*................................................. + str x27, [sp, #STACK_LOC_2] // .....s................................................. 
// @slothy:is_spill // @slothy:writes=stack_2 + eor x0, x16, x28, ror #63 // ......*................................................ + ldr x27, [sp, #STACK_LOC_2] // ......r................................................ // @slothy:is_restore // @slothy:reads=stack_2 + eor x30, x0, x19, ror #37 // ......*................................................ + ldr x0, [sp, #STACK_LOC_0] // ......r................................................ // @slothy:is_restore // @slothy:reads=stack_0 + eor x26, x30, x17, ror #36 // .......*............................................... + eor x26, x26, x20, ror #2 // .......*............................................... + str x26, [sp, #STACK_LOC_0] // .......s............................................... // @slothy:is_spill // @slothy:writes=stack_0 + eor x26, x23, x22, ror #50 // ........*.............................................. + eor x26, x26, x24, ror #34 // ........*.............................................. + eor x26, x26, x21, ror #26 // .........*............................................. + eor x26, x26, x25, ror #15 // .........*............................................. + str x26, [sp, #STACK_LOC_2] // .........s............................................. // @slothy:is_spill // @slothy:writes=stack_2 + eor x26, x0, x27, ror #61 // ..........*............................................ + str x26, [sp, #STACK_LOC_3] // ..........s............................................ // @slothy:is_spill // @slothy:writes=stack_3 + ror x27, x27, #62 // ..........*............................................ + ldr x26, [sp, #STACK_LOC_2] // ..........r............................................ // @slothy:is_restore // @slothy:reads=stack_2 + eor x30, x27, x26, ror #57 // ...........*........................................... + str x30, [sp, #STACK_LOC_2] // ...........s........................................... 
// @slothy:is_spill // @slothy:writes=stack_2 + ror x27, x26, #58 // ...........*........................................... + ldr x26, [sp, #STACK_LOC_1] // ...........r........................................... // @slothy:is_restore // @slothy:reads=stack_1 + ldr x30, [sp, #STACK_LOC_2] // ...........r........................................... // @slothy:is_restore // @slothy:reads=stack_2 + eor x27, x27, x26, ror #55 // ............*.......................................... + str x27, [sp, #STACK_LOC_1] // ............s.......................................... // @slothy:is_spill // @slothy:writes=stack_1 + ror x27, x26, #56 // ............*.......................................... + ldr x26, [sp, #STACK_LOC_0] // ............r.......................................... // @slothy:is_restore // @slothy:reads=stack_0 + eor x29, x27, x26, ror #63 // .............*......................................... + ldr x27, [sp, #STACK_LOC_1] // .............r......................................... // @slothy:is_restore // @slothy:reads=stack_1 + str x29, [sp, #STACK_LOC_0] // .............s......................................... // @slothy:is_spill // @slothy:writes=stack_0 + eor x0, x26, x0, ror #63 // .............*......................................... + ldr x29, [sp, #STACK_LOC_0] // .............r......................................... // @slothy:is_restore // @slothy:reads=stack_0 + str x0, [sp, #STACK_LOC_0] // .............s......................................... // @slothy:is_spill // @slothy:writes=stack_0 + eor x26, x27, x1 // ..............*........................................ + ldr x0, [sp, #STACK_LOC_0] // ..............r........................................ // @slothy:is_restore // @slothy:reads=stack_0 + str x26, [sp, #STACK_LOC_0] // ..............s........................................ // @slothy:is_spill // @slothy:writes=stack_0 + eor x26, x29, x11, ror #50 // ..............*........................................ 
+ str x26, [sp, #STACK_LOC_1] // ..............s........................................ // @slothy:is_spill // @slothy:writes=stack_1 + eor x13, x29, x13, ror #46 // ...............*....................................... + str x13, [sp, #STACK_LOC_2] // ...............s....................................... // @slothy:is_spill // @slothy:writes=stack_2 + eor x13, x30, x28, ror #63 // ...............*....................................... + str x13, [sp, #STACK_LOC_4] // ...............s....................................... // @slothy:is_spill // @slothy:writes=stack_4 + eor x26, x0, x24, ror #28 // ................*...................................... + str x26, [sp, #STACK_LOC_5] // ................s...................................... // @slothy:is_spill // @slothy:writes=stack_5 + eor x13, x30, x20, ror #2 // ................*...................................... + ldr x26, [sp, #STACK_LOC_3] // ................r...................................... // @slothy:is_restore // @slothy:reads=stack_3 + str x13, [sp, #STACK_LOC_3] // ................s...................................... // @slothy:is_spill // @slothy:writes=stack_3 + eor x11, x27, x4, ror #54 // .................*..................................... + eor x20, x26, x6, ror #43 // .................*..................................... + str x20, [sp, #STACK_LOC_6] // .................s..................................... // @slothy:is_spill // @slothy:writes=stack_6 + eor x1, x30, x17, ror #36 // ..................*.................................... + eor x17, x26, x9, ror #49 // ..................*.................................... + eor x9, x29, x12, ror #3 // ...................*................................... + eor x12, x27, x3, ror #39 // ...................*................................... + eor x28, x30, x16 // ....................*.................................. + eor x30, x30, x19, ror #37 // ....................*.................................. 
+ eor x19, x29, x14, ror #8 // .....................*................................. + eor x14, x26, x8, ror #56 // .....................*................................. + eor x13, x0, x22, ror #44 // ......................*................................ + eor x22, x29, x15, ror #62 // ......................*................................ + eor x16, x0, x23, ror #58 // .......................*............................... + eor x4, x27, x5, ror #25 // .......................*............................... + eor x15, x0, x21, ror #20 // ........................*.............................. + eor x29, x0, x25, ror #9 // ........................*.............................. + eor x6, x26, x10, ror #23 // .........................*............................. + eor x24, x27, x2, ror #61 // .........................*............................. + eor x21, x26, x7, ror #19 // ..........................*............................ + ldr x26, [sp, #STACK_LOC_5] // ..........................r............................ // @slothy:is_restore // @slothy:reads=stack_5 + ldr x23, [sp, #STACK_OFFSET_CONST] // ..........................*............................ + str x23, [sp, #STACK_LOC_5] // ..........................s............................ // @slothy:is_spill // @slothy:writes=stack_5 + ldr x20, [sp, #STACK_OFFSET_COUNT] // ...........................*........................... // @slothy:reads=STACK_OFFSET_COUNT + ldr x23, [sp, #STACK_LOC_5] // ...........................r........................... // @slothy:is_restore // @slothy:reads=stack_5 + ldr x27, [x23, w20, UXTW #3] // ...........................*........................... + add x20, x20, #1 // ............................*.......................... + cmp x20, #(KECCAK_F1600_ROUNDS-1) // ............................*.......................... + str x20, [sp, #STACK_OFFSET_COUNT] // .............................*......................... 
// @slothy:writes=STACK_OFFSET_COUNT + bic x20, x12, x13, ror #47 // .............................*......................... + eor x2, x20, x28, ror #39 // ..............................*........................ + str x2, [sp, #STACK_LOC_5] // ..............................s........................ // @slothy:is_spill // @slothy:writes=stack_5 + bic x20, x17, x12, ror #42 // ..............................*........................ + ldr x2, [sp, #STACK_LOC_5] // ..............................r........................ // @slothy:is_restore // @slothy:reads=stack_5 + eor x7, x20, x13, ror #25 // ...............................*....................... + str x7, [sp, #STACK_LOC_5] // ...............................s....................... // @slothy:is_spill // @slothy:writes=stack_5 + bic x20, x22, x17, ror #16 // ...............................*....................... + eor x12, x20, x12, ror #58 // ................................*...................... + str x12, [sp, #STACK_LOC_7] // ................................s...................... // @slothy:is_spill // @slothy:writes=stack_7 + bic x7, x28, x22, ror #31 // ................................*...................... + ldr x12, [sp, #STACK_LOC_7] // ................................r...................... // @slothy:is_restore // @slothy:reads=stack_7 + eor x17, x7, x17, ror #47 // .................................*..................... + ldr x7, [sp, #STACK_LOC_5] // .................................r..................... // @slothy:is_restore // @slothy:reads=stack_5 + str x17, [sp, #STACK_LOC_5] // .................................s..................... // @slothy:is_spill // @slothy:writes=stack_5 + bic x20, x13, x28, ror #56 // .................................*..................... + ldr x13, [sp, #STACK_LOC_4] // .................................r..................... // @slothy:is_restore // @slothy:reads=stack_4 + ldr x17, [sp, #STACK_LOC_5] // .................................r..................... 
// @slothy:is_restore // @slothy:reads=stack_5 + eor x22, x20, x22, ror #23 // ..................................*.................... + ldr x20, [sp, #STACK_LOC_6] // ..................................r.................... // @slothy:is_restore // @slothy:reads=stack_6 + str x22, [sp, #STACK_LOC_4] // ..................................s.................... // @slothy:is_spill // @slothy:writes=stack_4 + bic x23, x13, x9, ror #19 // ..................................*.................... + ldr x22, [sp, #STACK_LOC_4] // ..................................r.................... // @slothy:is_restore // @slothy:reads=stack_4 + eor x3, x23, x20, ror #24 // ...................................*................... + str x3, [sp, #STACK_LOC_4] // ...................................s................... // @slothy:is_spill // @slothy:writes=stack_4 + bic x23, x26, x13, ror #47 // ...................................*................... + ldr x3, [sp, #STACK_LOC_4] // ...................................r................... // @slothy:is_restore // @slothy:reads=stack_4 + eor x8, x23, x9, ror #2 // ....................................*.................. + str x8, [sp, #STACK_LOC_4] // ....................................s.................. // @slothy:is_spill // @slothy:writes=stack_4 + bic x23, x4, x26, ror #10 // ....................................*.................. + ldr x8, [sp, #STACK_LOC_4] // ....................................r.................. // @slothy:is_restore // @slothy:reads=stack_4 + eor x13, x23, x13, ror #57 // .....................................*................. + str x13, [sp, #STACK_LOC_4] // .....................................s................. // @slothy:is_spill // @slothy:writes=stack_4 + bic x23, x20, x4, ror #47 // .....................................*................. + ldr x13, [sp, #STACK_LOC_3] // .....................................r................. 
// @slothy:is_restore // @slothy:reads=stack_3 + eor x28, x23, x26, ror #57 // ......................................*................ + ldr x26, [sp, #STACK_LOC_1] // ......................................r................ // @slothy:is_restore // @slothy:reads=stack_1 + str x28, [sp, #STACK_LOC_1] // ......................................s................ // @slothy:is_spill // @slothy:writes=stack_1 + bic x20, x9, x20, ror #5 // ......................................*................ + ldr x28, [sp, #STACK_LOC_1] // ......................................r................ // @slothy:is_restore // @slothy:reads=stack_1 + eor x23, x20, x4, ror #52 // .......................................*............... + str x23, [sp, #STACK_LOC_1] // .......................................s............... // @slothy:is_spill // @slothy:writes=stack_1 + bic x20, x14, x24, ror #38 // .......................................*............... + ldr x23, [sp, #STACK_LOC_1] // .......................................r............... // @slothy:is_restore // @slothy:reads=stack_1 + eor x4, x20, x15, ror #47 // ........................................*.............. + str x4, [sp, #STACK_LOC_1] // ........................................s.............. // @slothy:is_spill // @slothy:writes=stack_1 + bic x20, x19, x14, ror #5 // ........................................*.............. + ldr x4, [sp, #STACK_LOC_1] // ........................................r.............. // @slothy:is_restore // @slothy:reads=stack_1 + eor x9, x20, x24, ror #43 // .........................................*............. + str x9, [sp, #STACK_LOC_1] // .........................................s............. // @slothy:is_spill // @slothy:writes=stack_1 + bic x20, x13, x19, ror #41 // .........................................*............. + ldr x9, [sp, #STACK_LOC_1] // .........................................r............. 
// @slothy:is_restore // @slothy:reads=stack_1 + eor x14, x20, x14, ror #46 // ..........................................*............ + str x14, [sp, #STACK_LOC_1] // ..........................................s............ // @slothy:is_spill // @slothy:writes=stack_1 + bic x20, x15, x13, ror #35 // ..........................................*............ + ldr x14, [sp, #STACK_LOC_1] // ..........................................r............ // @slothy:is_restore // @slothy:reads=stack_1 + eor x19, x20, x19, ror #12 // ...........................................*........... + str x19, [sp, #STACK_LOC_1] // ...........................................s........... // @slothy:is_spill // @slothy:writes=stack_1 + bic x20, x24, x15, ror #9 // ...........................................*........... + ldr x19, [sp, #STACK_LOC_1] // ...........................................r........... // @slothy:is_restore // @slothy:reads=stack_1 + eor x24, x20, x13, ror #44 // ............................................*.......... + ldr x13, [sp, #STACK_LOC_2] // ............................................r.......... // @slothy:is_restore // @slothy:reads=stack_2 + str x24, [sp, #STACK_LOC_1] // ............................................s.......... // @slothy:is_spill // @slothy:writes=stack_1 + bic x20, x16, x1, ror #48 // ............................................*.......... + ldr x24, [sp, #STACK_LOC_1] // ............................................r.......... // @slothy:is_restore // @slothy:reads=stack_1 + eor x5, x20, x26, ror #41 // .............................................*......... + str x5, [sp, #STACK_LOC_1] // .............................................s......... // @slothy:is_spill // @slothy:writes=stack_1 + bic x20, x11, x16, ror #2 // .............................................*......... + ldr x5, [sp, #STACK_LOC_1] // .............................................r......... 
// @slothy:is_restore // @slothy:reads=stack_1 + eor x10, x20, x1, ror #50 // ..............................................*........ + str x10, [sp, #STACK_LOC_1] // ..............................................s........ // @slothy:is_spill // @slothy:writes=stack_1 + bic x20, x6, x11, ror #25 // ..............................................*........ + ldr x10, [sp, #STACK_LOC_1] // ..............................................r........ // @slothy:is_restore // @slothy:reads=stack_1 + eor x15, x20, x16, ror #27 // ...............................................*....... + str x15, [sp, #STACK_LOC_1] // ...............................................s....... // @slothy:is_spill // @slothy:writes=stack_1 + bic x20, x26, x6, ror #60 // ...............................................*....... + ldr x15, [sp, #STACK_LOC_1] // ...............................................r....... // @slothy:is_restore // @slothy:reads=stack_1 + eor x20, x20, x11, ror #21 // ................................................*...... + str x20, [sp, #STACK_LOC_1] // ................................................s...... // @slothy:is_spill // @slothy:writes=stack_1 + bic x20, x1, x26, ror #57 // ................................................*...... + ldr x26, [sp, #STACK_LOC_0] // ................................................r...... // @slothy:is_restore // @slothy:reads=stack_0 + eor x25, x20, x6, ror #53 // .................................................*..... + str x25, [sp, #STACK_LOC_0] // .................................................s..... // @slothy:is_spill // @slothy:writes=stack_0 + bic x20, x13, x21, ror #63 // .................................................*..... + ldr x25, [sp, #STACK_LOC_0] // .................................................r..... // @slothy:is_restore // @slothy:reads=stack_0 + eor x20, x26, x20, ror #21 // ..................................................*.... 
+ str x20, [sp, #STACK_LOC_0] // ..................................................s.... // @slothy:is_spill // @slothy:writes=stack_0 + bic x20, x30, x13, ror #42 // ..................................................*.... + eor x6, x20, x21, ror #41 // ...................................................*... + str x6, [sp, #STACK_LOC_2] // ...................................................s... // @slothy:is_spill // @slothy:writes=stack_2 + bic x20, x29, x30, ror #57 // ...................................................*... + ldr x6, [sp, #STACK_LOC_2] // ...................................................r... // @slothy:is_restore // @slothy:reads=stack_2 + eor x11, x20, x13, ror #35 // ....................................................*.. + ldr x13, [sp, #STACK_LOC_4] // ....................................................r.. // @slothy:is_restore // @slothy:reads=stack_4 + str x11, [sp, #STACK_LOC_2] // ....................................................s.. // @slothy:is_spill // @slothy:writes=stack_2 + bic x20, x26, x29, ror #50 // ....................................................*.. + ldr x11, [sp, #STACK_LOC_2] // ....................................................r.. // @slothy:is_restore // @slothy:reads=stack_2 + eor x16, x20, x30, ror #43 // .....................................................*. + str x16, [sp, #STACK_LOC_2] // .....................................................s. // @slothy:is_spill // @slothy:writes=stack_2 + bic x20, x21, x26, ror #44 // .....................................................*. + ldr x16, [sp, #STACK_LOC_2] // .....................................................r. 
// @slothy:is_restore // @slothy:reads=stack_2 + eor x21, x20, x29, ror #30 // ......................................................* + ldr x20, [sp, #STACK_LOC_0] // ......................................................r // @slothy:is_restore // @slothy:reads=stack_0 + eor x1, x20, x27 // ......................................................* + ldr x20, [sp, #STACK_LOC_1] // ......................................................r // @slothy:is_restore // @slothy:reads=stack_1 + + // ------------------ cycle (expected) ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + // eor X, x1, x2, ror #61 // *...................................................... + // eor X, X, x4, ror #54 // *...................................................... + // eor X, X, x3, ror #39 // .*..................................................... + // eor X, X, x5, ror #25 // .*..................................................... + // eor X, x8, x9, ror #57 // ..*.................................................... + // eor X, X, x6, ror #51 // ..*.................................................... + // eor X, X, x10, ror #31 // ...*................................................... + // eor X, X, x7, ror #27 // ...*................................................... + // eor X, x15, x11, ror #52 // ....*.................................................. + // eor X, X, x13, ror #48 // ....*.................................................. + // eor X, X, x14, ror #10 // .....*................................................. + // eor X, X, x12, ror #5 // .....*................................................. + // eor X, x16, x28, ror #63 // ......*................................................ + // eor X, X, x19, ror #37 // ......*................................................ + // eor X, X, x17, ror #36 // .......*............................................... 
+ // eor X, X, x20, ror #2 // .......*............................................... + // eor X, x23, x22, ror #50 // ........*.............................................. + // eor X, X, x24, ror #34 // ........*.............................................. + // eor X, X, x21, ror #26 // .........*............................................. + // eor X, X, x25, ror #15 // .........*............................................. + // eor X, X, X, ror #61 // ..........*............................................ + // ror X, X, #62 // ..........*............................................ + // eor X, X, X, ror #57 // ...........*........................................... + // ror X, X, #58 // ...........*........................................... + // eor X, X, X, ror #55 // ............*.......................................... + // ror X, X, #56 // ............*.......................................... + // eor X, X, X, ror #63 // .............*......................................... + // eor X, X, X, ror #63 // .............*......................................... + // eor X, X, x1 // ..............*........................................ + // eor X, X, x11, ror #50 // ..............*........................................ + // eor X, X, x13, ror #46 // ...............*....................................... + // eor X, X, x28, ror #63 // ...............*....................................... + // eor X, X, x24, ror #28 // ................*...................................... + // eor X, X, x20, ror #2 // ................*...................................... + // eor X, X, x4, ror #54 // .................*..................................... + // eor X, X, x6, ror #43 // .................*..................................... + // eor X, X, x17, ror #36 // ..................*.................................... + // eor X, X, x9, ror #49 // ..................*.................................... 
+ // eor X, X, x12, ror #3 // ...................*................................... + // eor X, X, x3, ror #39 // ...................*................................... + // eor X, X, x16 // ....................*.................................. + // eor X, X, x19, ror #37 // ....................*.................................. + // eor X, X, x14, ror #8 // .....................*................................. + // eor X, X, x8, ror #56 // .....................*................................. + // eor X, X, x22, ror #44 // ......................*................................ + // eor X, X, x15, ror #62 // ......................*................................ + // eor X, X, x23, ror #58 // .......................*............................... + // eor X, X, x5, ror #25 // .......................*............................... + // eor X, X, x21, ror #20 // ........................*.............................. + // eor X, X, x25, ror #9 // ........................*.............................. + // eor X, X, x10, ror #23 // .........................*............................. + // eor X, X, x2, ror #61 // .........................*............................. + // eor X, X, x7, ror #19 // ..........................*............................ + // ldr X, [sp, #STACK_OFFSET_CONST] // ..........................*............................ + // ldr X, [sp, #STACK_OFFSET_COUNT] // ...........................*........................... + // ldr X, [X, W, UXTW #3] // ...........................*........................... + // add X, X, #1 // ............................*.......................... + // cmp X, #(KECCAK_F1600_ROUNDS-1) // ............................*.......................... + // str X, [sp, #STACK_OFFSET_COUNT] // .............................*......................... + // bic X, X, X, ror #47 // .............................*......................... 
+ // eor x2, X, X, ror #39 // ..............................*........................ + // bic X, X, X, ror #42 // ..............................*........................ + // eor x7, X, X, ror #25 // ...............................*....................... + // bic X, X, X, ror #16 // ...............................*....................... + // eor x12, X, X, ror #58 // ................................*...................... + // bic X, X, X, ror #31 // ................................*...................... + // eor x17, X, X, ror #47 // .................................*..................... + // bic X, X, X, ror #56 // .................................*..................... + // eor x22, X, X, ror #23 // ..................................*.................... + // bic X, X, X, ror #19 // ..................................*.................... + // eor x3, X, X, ror #24 // ...................................*................... + // bic X, X, X, ror #47 // ...................................*................... + // eor x8, X, X, ror #2 // ....................................*.................. + // bic X, X, X, ror #10 // ....................................*.................. + // eor x13, X, X, ror #57 // .....................................*................. + // bic X, X, X, ror #47 // .....................................*................. + // eor x28, X, X, ror #57 // ......................................*................ + // bic X, X, X, ror #5 // ......................................*................ + // eor x23, X, X, ror #52 // .......................................*............... + // bic X, X, X, ror #38 // .......................................*............... + // eor x4, X, X, ror #47 // ........................................*.............. + // bic X, X, X, ror #5 // ........................................*.............. + // eor x9, X, X, ror #43 // .........................................*............. 
+ // bic X, X, X, ror #41 // .........................................*............. + // eor x14, X, X, ror #46 // ..........................................*............ + // bic X, X, X, ror #35 // ..........................................*............ + // eor x19, X, X, ror #12 // ...........................................*........... + // bic X, X, X, ror #9 // ...........................................*........... + // eor x24, X, X, ror #44 // ............................................*.......... + // bic X, X, X, ror #48 // ............................................*.......... + // eor x5, X, X, ror #41 // .............................................*......... + // bic X, X, X, ror #2 // .............................................*......... + // eor x10, X, X, ror #50 // ..............................................*........ + // bic X, X, X, ror #25 // ..............................................*........ + // eor x15, X, X, ror #27 // ...............................................*....... + // bic X, X, X, ror #60 // ...............................................*....... + // eor x20, X, X, ror #21 // ................................................*...... + // bic X, X, X, ror #57 // ................................................*...... + // eor x25, X, X, ror #53 // .................................................*..... + // bic X, X, X, ror #63 // .................................................*..... + // eor x1, X, X, ror #21 // ..................................................*.... + // bic X, X, X, ror #42 // ..................................................*.... + // eor x6, X, X, ror #41 // ...................................................*... + // bic X, X, X, ror #57 // ...................................................*... + // eor x11, X, X, ror #35 // ....................................................*.. + // bic X, X, X, ror #50 // ....................................................*.. 
+ // eor x16, X, X, ror #43 // .....................................................*. + // bic X, X, X, ror #44 // .....................................................*. + // eor x21, X, X, ror #30 // ......................................................* + // eor x1, x1, X // ......................................................* + + end_loop: + + ble loop + +final: + final_rotate + ldr input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:reads=STACK_OFFSET_INPUT + store_state +end_final: + + restore_gprs + free_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/keccak_f1600_x1_scalar_slothy_a55_opt_a55.s b/examples/opt/aarch64/keccak_f1600_x1_scalar_slothy_a55_opt_a55.s index 3c053b8..0a1bf7d 100644 --- a/examples/opt/aarch64/keccak_f1600_x1_scalar_slothy_a55_opt_a55.s +++ b/examples/opt/aarch64/keccak_f1600_x1_scalar_slothy_a55_opt_a55.s @@ -136,7 +136,6 @@ round_constants: tmp .req x0 - tmp0 .req x0 tmp1 .req x29 @@ -174,41 +173,32 @@ round_constants: ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] .endm +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + + + +.macro addparity prty, dst0, src0, dst1, src1, dst2, src2, dst3, src3, dst4, src4 + eor \dst0, \src0, \prty + eor \dst1, \src1, \prty + eor \dst2, \src2, \prty + eor \dst3, \src3, \prty + eor \dst4, \src4, \prty +.endm + + + + .macro keccak_f1600_round_initial - ldp Aku, Ama, [input_addr, #(1*8*14)] - ldp Asa, Ase, [input_addr, #(1*8*20)] - eor C0, Ama, Asa - ldp Ame, Ami, [input_addr, #(1*8*16)] - eor C1, Ame, Ase - ldp Asi, Aso, [input_addr, #(1*8*22)] - eor C2, Ami, Asi - ldp Amo, Amu, [input_addr, #(1*8*18)] - eor C3, Amo, Aso - ldr Asu, [input_addr, #(1*8*24)] - eor C4, Amu, Asu - ldp Aka, Ake, [input_addr, #(1*8*10)] - eor C0, Aka, C0 - eor C1, Ake, C1 - ldp Aki, Ako, [input_addr, #(1*8*12)] - eor C2, Aki, C2 - ldp Abu, Aga, [input_addr, #(1*8*4)] - eor C3, Ako, C3 - eor C4, Aku, 
C4 - ldp Age, Agi, [input_addr, #(1*8*6)] - eor C0, Aga, C0 - ldp Ago, Agu, [input_addr, #(1*8*8)] - eor C1, Age, C1 - ldp Aba, Abe, [input_addr, #(1*8*0)] - eor C2, Agi, C2 - ldp Abi, Abo, [input_addr, #(1*8*2)] - eor C3, Ago, C3 - str input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:writes=STACK_OFFSET_INPUT - eor C4, Agu, C4 - eor C0, Aba, C0 - eor C1, Abe, C1 - eor C2, Abi, C2 - eor C3, Abo, C3 - eor C4, Abu, C4 + eor5 C0, Ama, Asa, Aba, Aga, Aka + eor5 C1, Ame, Ase, Abe, Age, Ake + eor5 C2, Ami, Asi, Abi, Agi, Aki + eor5 C3, Amo, Aso, Abo, Ago, Ako + eor5 C4, Amu, Asu, Abu, Agu, Aku eor E1, C0, C2, ror #63 eor E3, C2, C4, ror #63 @@ -307,167 +297,180 @@ round_constants: .endm +.macro eor5ror dst, src0, src1, rot1, src2, rot2, src3, rot3, src4, rot4 + eor \dst, \src0, \src1, ror \rot1 + eor \dst, \dst, \src2, ror \rot2 + eor \dst, \dst, \src3, ror \rot3 + eor \dst, \dst, \src4, ror \rot4 +.endm -.macro keccak_f1600_round_noninitial +.macro addparityror prty, dst0, src0, rot0, dst1, src1, rot1, dst2, src2, rot2, dst3, src3, rot3, dst4, src4, rot4 + eor \dst0, \prty, \src0, ror \rot0 + eor \dst1, \prty, \src1, ror \rot1 + eor \dst2, \prty, \src2, ror \rot2 + eor \dst3, \prty, \src3, ror \rot3 + eor \dst4, \prty, \src4, ror \rot4 +.endm - eor C2, Asi, Abi, ror #52 - eor C0, Aba, Aga, ror #61 - eor C4, Aku, Agu, ror #50 - eor C1, Ake, Ame, ror #57 - eor C3, Abo, Ako, ror #63 - eor C2, C2, Aki, ror #48 - eor C0, C0, Ama, ror #54 - eor C4, C4, Amu, ror #34 - eor C1, C1, Abe, ror #51 - eor C3, C3, Amo, ror #37 - eor C2, C2, Ami, ror #10 - eor C0, C0, Aka, ror #39 - eor C4, C4, Abu, ror #26 - eor C1, C1, Ase, ror #31 - eor C3, C3, Ago, ror #36 - eor C2, C2, Agi, ror #5 - eor C0, C0, Asa, ror #25 - eor C4, C4, Asu, ror #15 - eor C1, C1, Age, ror #27 - eor C3, C3, Aso, ror #2 - - eor E1, C0, C2, ror #61 - ror C2, C2, #62 - eor E3, C2, C4, ror #57 - ror C4, C4, #58 - eor E0, C4, C1, ror #55 - ror C1, C1, #56 - eor E2, C1, C3, ror #63 - eor E4, C3, C0, ror #63 +.macro 
chi_step_ror out, a, b, c, r1, r2 + bic X, \c\(), \b\(), ror #\r1 + eor \out\(), X, \a\(), ror #\r2 +.endm - eor Aba_, E0, Aba - eor Asa_, E2, Abi, ror #50 - eor Abi_, E2, Aki, ror #46 - eor Aki_, E3, Ako, ror #63 - eor Ako_, E4, Amu, ror #28 - eor Amu_, E3, Aso, ror #2 - eor Aso_, E0, Ama, ror #54 - eor Aka_, E1, Abe, ror #43 - eor Ase_, E3, Ago, ror #36 - eor Ago_, E1, Ame, ror #49 - eor Ake_, E2, Agi, ror #3 - eor Agi_, E0, Aka, ror #39 - eor Aga_, E3, Abo - eor Abo_, E3, Amo, ror #37 - eor Amo_, E2, Ami, ror #8 - eor Ami_, E1, Ake, ror #56 - eor Age_, E4, Agu, ror #44 - eor Agu_, E2, Asi, ror #62 - eor Asi_, E4, Aku, ror #58 - eor Aku_, E0, Asa, ror #25 - eor Ama_, E4, Abu, ror #20 - eor Abu_, E4, Asu, ror #9 - eor Asu_, E1, Ase, ror #23 - eor Ame_, E0, Aga, ror #61 - eor Abe_, E1, Age, ror #19 +.macro keccak_f1600_round_noninitial + + eor X, Aba, Aga, ror #61 + eor X, X, Ama, ror #54 + eor X, X, Aka, ror #39 + eor X, X, Asa, ror #25 + eor X, Ake, Ame, ror #57 + eor X, X, Abe, ror #51 + eor X, X, Ase, ror #31 + eor X, X, Age, ror #27 + eor X, Asi, Abi, ror #52 + eor X, X, Aki, ror #48 + eor X, X, Ami, ror #10 + eor X, X, Agi, ror #5 + eor X, Abo, Ako, ror #63 + eor X, X, Amo, ror #37 + eor X, X, Ago, ror #36 + eor X, X, Aso, ror #2 + eor X, Aku, Agu, ror #50 + eor X, X, Amu, ror #34 + eor X, X, Abu, ror #26 + eor X, X, Asu, ror #15 + + eor X, X, X, ror #61 + ror X, X, #62 + eor X, X, X, ror #57 + ror X, X, #58 + eor X, X, X, ror #55 + ror X, X, #56 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + + str Age, [sp, #16] // @slothy:writes=Age + str Aga, [sp, #24] // @slothy:writes=Aga + ldr Aga, [sp, #24] // @slothy:reads=Aga + ldr Age, [sp, #16] // @slothy:reads=Age + + eor Aba_, X, Aba + eor Asa_, X, Abi, ror #50 + eor Abi_, X, Aki, ror #46 + eor Aki_, X, Ako, ror #63 + eor Ako_, X, Amu, ror #28 + eor Amu_, X, Aso, ror #2 + eor Aso_, X, Ama, ror #54 + eor Aka_, X, Abe, ror #43 + eor Ase_, X, Ago, ror #36 + eor Ago_, X, Ame, ror #49 + eor Ake_, X, Agi, ror #3 + eor 
Agi_, X, Aka, ror #39 + eor Aga_, X, Abo + eor Abo_, X, Amo, ror #37 + eor Amo_, X, Ami, ror #8 + eor Ami_, X, Ake, ror #56 + eor Age_, X, Agu, ror #44 + eor Agu_, X, Asi, ror #62 + eor Asi_, X, Aku, ror #58 + eor Aku_, X, Asa, ror #25 + eor Ama_, X, Abu, ror #20 + eor Abu_, X, Asu, ror #9 + eor Asu_, X, Ase, ror #23 + eor Ame_, X, Aga, ror #61 + eor Abe_, X, Age, ror #19 load_constant_ptr_stack ldr count, [sp, #STACK_OFFSET_COUNT] // @slothy:reads=STACK_OFFSET_COUNT - - bic tmp0, Agi_, Age_, ror #47 - bic tmp1, Ago_, Agi_, ror #42 - eor Aga, tmp0, Aga_, ror #39 - bic tmp0, Agu_, Ago_, ror #16 - eor Age, tmp1, Age_, ror #25 - bic tmp1, Aga_, Agu_, ror #31 - eor Agi, tmp0, Agi_, ror #58 - bic tmp0, Age_, Aga_, ror #56 - eor Ago, tmp1, Ago_, ror #47 - bic tmp1, Aki_, Ake_, ror #19 - eor Agu, tmp0, Agu_, ror #23 - bic tmp0, Ako_, Aki_, ror #47 - eor Aka, tmp1, Aka_, ror #24 - bic tmp1, Aku_, Ako_, ror #10 - eor Ake, tmp0, Ake_, ror #2 - bic tmp0, Aka_, Aku_, ror #47 - eor Aki, tmp1, Aki_, ror #57 - bic tmp1, Ake_, Aka_, ror #5 - eor Ako, tmp0, Ako_, ror #57 - bic tmp0, Ami_, Ame_, ror #38 - eor Aku, tmp1, Aku_, ror #52 - bic tmp1, Amo_, Ami_, ror #5 - eor Ama, tmp0, Ama_, ror #47 - bic tmp0, Amu_, Amo_, ror #41 - eor Ame, tmp1, Ame_, ror #43 - bic tmp1, Ama_, Amu_, ror #35 - eor Ami, tmp0, Ami_, ror #46 - bic tmp0, Ame_, Ama_, ror #9 - ldr cur_const, [const_addr, count, UXTW #3] - - eor Amo, tmp1, Amo_, ror #12 - bic tmp1, Asi_, Ase_, ror #48 - eor Amu, tmp0, Amu_, ror #44 - bic tmp0, Aso_, Asi_, ror #2 - eor Asa, tmp1, Asa_, ror #41 - bic tmp1, Asu_, Aso_, ror #25 - eor Ase, tmp0, Ase_, ror #50 - bic tmp0, Asa_, Asu_, ror #60 - eor Asi, tmp1, Asi_, ror #27 - bic tmp1, Ase_, Asa_, ror #57 - eor Aso, tmp0, Aso_, ror #21 - bic tmp0, Abi_, Abe_, ror #63 add count, count, #1 str count, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT - eor Asu, tmp1, Asu_, ror #53 - bic tmp1, Abo_, Abi_, ror #42 - eor Aba, Aba_, tmp0, ror #21 - bic tmp0, Abu_, Abo_, ror #57 
- eor Abe, tmp1, Abe_, ror #41 - bic tmp1, Aba_, Abu_, ror #50 - eor Abi, tmp0, Abi_, ror #35 - bic tmp0, Abe_, Aba_, ror #44 - eor Abo, tmp1, Abo_, ror #43 - eor Abu, tmp0, Abu_, ror #30 - eor Aba, Aba, cur_const + chi_step_ror Aga, Aga_, Agi_, Age_, 47, 39 + chi_step_ror Age, Age_, Ago_, Agi_, 42, 25 + chi_step_ror Agi, Agi_, Agu_, Ago_, 16, 58 + chi_step_ror Ago, Ago_, Aga_, Agu_, 31, 47 + chi_step_ror Agu, Agu_, Age_, Aga_, 56, 23 + chi_step_ror Aka, Aka_, Aki_, Ake_, 19, 24 + chi_step_ror Ake, Ake_, Ako_, Aki_, 47, 2 + chi_step_ror Aki, Aki_, Aku_, Ako_, 10, 57 + chi_step_ror Ako, Ako_, Aka_, Aku_, 47, 57 + chi_step_ror Aku, Aku_, Ake_, Aka_, 5, 52 + chi_step_ror Ama, Ama_, Ami_, Ame_, 38, 47 + chi_step_ror Ame, Ame_, Amo_, Ami_, 5, 43 + chi_step_ror Ami, Ami_, Amu_, Amo_, 41, 46 + chi_step_ror Amo, Amo_, Ama_, Amu_, 35, 12 + chi_step_ror Amu, Amu_, Ame_, Ama_, 9, 44 + chi_step_ror Asa, Asa_, Asi_, Ase_, 48, 41 + chi_step_ror Ase, Ase_, Aso_, Asi_, 2, 50 + chi_step_ror Asi, Asi_, Asu_, Aso_, 25, 27 + chi_step_ror Aso, Aso_, Asa_, Asu_, 60, 21 + chi_step_ror Asu, Asu_, Ase_, Asa_, 57, 53 + chi_step_ror Aba, Aba_, Abi_, Abe_, 63, 21 + chi_step_ror Abe, Abe_, Abo_, Abi_, 42, 41 + chi_step_ror Abi, Abi_, Abu_, Abo_, 57, 35 + chi_step_ror Abo, Abo_, Aba_, Abu_, 50, 43 + chi_step_ror Abu, Abu_, Abe_, Aba_, 44, 30 + eor Aba, Aba, cur_const .endm -.macro final_rotate_store - ror Aga, Aga,#(64-3) - ldr input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:reads=STACK_OFFSET_INPUT +.macro load_state + ldp Aba, Abe, [input_addr, #(1*8*0)] + ldp Abi, Abo, [input_addr, #(1*8*2)] + ldp Abu, Aga, [input_addr, #(1*8*4)] + ldp Age, Agi, [input_addr, #(1*8*6)] + ldp Ago, Agu, [input_addr, #(1*8*8)] + ldp Aka, Ake, [input_addr, #(1*8*10)] + ldp Aki, Ako, [input_addr, #(1*8*12)] + ldp Aku, Ama, [input_addr, #(1*8*14)] + ldp Ame, Ami, [input_addr, #(1*8*16)] + ldp Amo, Amu, [input_addr, #(1*8*18)] + ldp Asa, Ase, [input_addr, #(1*8*20)] + ldp Asi, Aso, [input_addr, #(1*8*22)] + ldr 
Asu, [input_addr, #(1*8*24)] +.endm - ror Abu, Abu,#(64-44) - ror Aka, Aka,#(64-25) - ror Ake, Ake,#(64-8) +.macro store_state + stp Aba, Abe, [input_addr, #(1*8*0)] + stp Abi, Abo, [input_addr, #(1*8*2)] stp Abu, Aga, [input_addr, #(1*8*4)] - ror Ama, Ama,#(64-10) - ror Aku, Aku,#(64-6) + stp Age, Agi, [input_addr, #(1*8*6)] + stp Ago, Agu, [input_addr, #(1*8*8)] stp Aka, Ake, [input_addr, #(1*8*10)] - ror Asa, Asa,#(64-39) - ror Ase, Ase,#(64-41) + stp Aki, Ako, [input_addr, #(1*8*12)] stp Aku, Ama, [input_addr, #(1*8*14)] + stp Ame, Ami, [input_addr, #(1*8*16)] + stp Amo, Amu, [input_addr, #(1*8*18)] + stp Asa, Ase, [input_addr, #(1*8*20)] + stp Asi, Aso, [input_addr, #(1*8*22)] + str Asu, [input_addr, #(1*8*24)] +.endm + +.macro final_rotate ror Abe, Abe,#(64-21) + ror Abi, Abi,#(64-14) + ror Abu, Abu,#(64-44) + ror Aga, Aga,#(64-3) ror Age, Age,#(64-45) - stp Asa, Ase, [input_addr, #(1*8*20)] ror Agi, Agi,#(64-61) - stp Aba, Abe, [input_addr, #(1*8*0)] - ror Ame, Ame,#(64-15) - ror Ami, Ami,#(64-56) - stp Age, Agi, [input_addr, #(1*8*6)] - ror Abi, Abi,#(64-14) - ror Aki, Aki,#(64-18) - stp Ame, Ami, [input_addr, #(1*8*16)] - ror Ako, Ako,#(64-1) - stp Abi, Abo, [input_addr, #(1*8*2)] - ror Asi, Asi,#(64-2) - ror Aso, Aso,#(64-62) - stp Aki, Ako, [input_addr, #(1*8*12)] ror Ago, Ago,#(64-28) ror Agu, Agu,#(64-20) - stp Asi, Aso, [input_addr, #(1*8*22)] + ror Aka, Aka,#(64-25) + ror Ake, Ake,#(64-8) + ror Aki, Aki,#(64-18) + ror Ako, Ako,#(64-1) + ror Aku, Aku,#(64-6) + ror Ama, Ama,#(64-10) + ror Ame, Ame,#(64-15) + ror Ami, Ami,#(64-56) ror Amo, Amo,#(64-27) ror Amu, Amu,#(64-36) - stp Ago, Agu, [input_addr, #(1*8*8)] + ror Asa, Asa,#(64-39) + ror Ase, Ase,#(64-41) + ror Asi, Asi,#(64-2) + ror Aso, Aso,#(64-62) ror Asu, Asu,#(64-55) - stp Amo, Amu, [input_addr, #(1*8*18)] - str Asu, [input_addr, #(1*8*24)] .endm #define KECCAK_F1600_ROUNDS 24 @@ -485,244 +488,264 @@ _keccak_f1600_x1_scalar_slothy_opt_a55: alloc_stack save_gprs +initial: + load_state + str 
input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:writes=STACK_OFFSET_INPUT keccak_f1600_round_initial loop: - // Instructions: 109 - // Expected cycles: 54 - // Expected IPC: 2.02 + // Instructions: 113 + // Expected cycles: 57 + // Expected IPC: 1.98 // - // -------------------------------------------- original position ---------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|-------- - eor x30, x15, x11, ror #52 // *............................................................................................................ - eor x28, x1, x2, ror #61 // .*........................................................................................................... - eor x29, x23, x22, ror #50 // ..*.......................................................................................................... - eor x0, x8, x9, ror #57 // ...*......................................................................................................... - eor x27, x16, x18, ror #63 // ....*........................................................................................................ - eor x30, x30, x13, ror #48 // .....*....................................................................................................... - eor x26, x29, x24, ror #34 // .......*..................................................................................................... - eor x0, x0, x6, ror #51 // ........*.................................................................................................... - eor x29, x27, x19, ror #37 // .........*................................................................................................... - eor x27, x30, x14, ror #10 // ..........*.................................................................................................. 
- eor x26, x26, x21, ror #26 // ............*................................................................................................ - eor x0, x0, x10, ror #31 // .............*............................................................................................... - eor x30, x29, x17, ror #36 // ..............*.............................................................................................. - eor x28, x28, x4, ror #54 // ......*...................................................................................................... - eor x27, x27, x12, ror #5 // ...............*............................................................................................. - eor x26, x26, x25, ror #15 // .................*........................................................................................... - eor x29, x0, x7, ror #27 // ..................*.......................................................................................... - eor x0, x28, x3, ror #39 // ...........*................................................................................................. - ror x28, x27, #62 // .....................*....................................................................................... - eor x30, x30, x20, ror #2 // ...................*......................................................................................... - eor x28, x28, x26, ror #57 // ......................*...................................................................................... - eor x0, x0, x5, ror #25 // ................*............................................................................................ - ror x26, x26, #58 // .......................*..................................................................................... - eor x26, x26, x29, ror #55 // ........................*.................................................................................... 
- ror x29, x29, #56 // .........................*................................................................................... - eor x27, x0, x27, ror #61 // ....................*........................................................................................ - eor x29, x29, x30, ror #63 // ..........................*.................................................................................. - eor x0, x30, x0, ror #63 // ...........................*................................................................................. - eor x30, x26, x1 // ............................*................................................................................ - eor x1, x29, x11, ror #50 // .............................*............................................................................... - eor x11, x29, x13, ror #46 // ..............................*.............................................................................. - eor x13, x28, x18, ror #63 // ...............................*............................................................................. - eor x18, x0, x24, ror #28 // ................................*............................................................................ - eor x24, x28, x20, ror #2 // .................................*........................................................................... - eor x20, x26, x4, ror #54 // ..................................*.......................................................................... - eor x4, x27, x6, ror #43 // ...................................*......................................................................... - eor x6, x28, x17, ror #36 // ....................................*........................................................................ - eor x17, x27, x9, ror #49 // .....................................*....................................................................... 
- eor x9, x29, x12, ror #3 // ......................................*...................................................................... - eor x12, x26, x3, ror #39 // .......................................*..................................................................... - eor x3, x28, x16 // ........................................*.................................................................... - eor x16, x28, x19, ror #37 // .........................................*................................................................... - eor x19, x29, x14, ror #8 // ..........................................*.................................................................. - eor x14, x27, x8, ror #56 // ...........................................*................................................................. - eor x8, x0, x22, ror #44 // ............................................*................................................................ - eor x22, x29, x15, ror #62 // .............................................*............................................................... - eor x15, x0, x23, ror #58 // ..............................................*.............................................................. - eor x23, x26, x5, ror #25 // ...............................................*............................................................. - eor x29, x0, x21, ror #20 // ................................................*............................................................ - eor x21, x0, x25, ror #9 // .................................................*........................................................... - ldr x5, [sp, #STACK_OFFSET_CONST] // .....................................................*....................................................... - eor x25, x27, x10, ror #23 // ..................................................*.......................................................... 
- eor x26, x26, x2, ror #61 // ...................................................*......................................................... - eor x28, x27, x7, ror #19 // ....................................................*........................................................ - bic x7, x12, x8, ror #47 // .......................................................*..................................................... - bic x10, x22, x17, ror #16 // ..........................................................*.................................................. - bic x0, x17, x12, ror #42 // ........................................................*.................................................... - ldr w27, [sp, #STACK_OFFSET_COUNT] // ......................................................*...................................................... // @slothy:reads=STACK_OFFSET_COUNT - eor x2, x7, x3, ror #39 // .........................................................*................................................... - eor x7, x0, x8, ror #25 // ...........................................................*................................................. - bic x0, x3, x22, ror #31 // ............................................................*................................................ - bic x8, x8, x3, ror #56 // ..............................................................*.............................................. - eor x12, x10, x12, ror #58 // .............................................................*............................................... - eor x17, x0, x17, ror #47 // ...............................................................*............................................. - bic x3, x13, x9, ror #19 // ................................................................*............................................ 
- eor x22, x8, x22, ror #23 // .................................................................*........................................... - bic x8, x18, x13, ror #47 // ..................................................................*.......................................... - bic x0, x23, x18, ror #10 // ....................................................................*........................................ - bic x10, x4, x23, ror #47 // ......................................................................*...................................... - eor x3, x3, x4, ror #24 // ...................................................................*......................................... - eor x8, x8, x9, ror #2 // .....................................................................*....................................... - eor x13, x0, x13, ror #57 // .......................................................................*..................................... - bic x0, x9, x4, ror #5 // ........................................................................*.................................... - eor x18, x10, x18, ror #57 // .........................................................................*................................... - bic x4, x14, x26, ror #38 // ..........................................................................*.................................. - eor x23, x0, x23, ror #52 // ...........................................................................*................................. - bic x9, x19, x14, ror #5 // ............................................................................*................................ - eor x4, x4, x29, ror #47 // .............................................................................*............................... - bic x0, x24, x19, ror #41 // ..............................................................................*.............................. 
- eor x9, x9, x26, ror #43 // ...............................................................................*............................. - bic x10, x29, x24, ror #35 // ................................................................................*............................ - eor x14, x0, x14, ror #46 // .................................................................................*........................... - bic x0, x26, x29, ror #9 // ..................................................................................*.......................... - ldr x26, [x5, w27, UXTW #3] // ...................................................................................*......................... - bic x29, x25, x20, ror #25 // .........................................................................................*................... - bic x5, x15, x6, ror #48 // .....................................................................................*....................... - eor x19, x10, x19, ror #12 // ....................................................................................*........................ - eor x24, x0, x24, ror #44 // ......................................................................................*...................... - bic x10, x20, x15, ror #2 // .......................................................................................*..................... - bic x0, x1, x25, ror #60 // ...........................................................................................*................. - eor x5, x5, x1, ror #41 // ........................................................................................*.................... - eor x10, x10, x6, ror #50 // ..........................................................................................*.................. - eor x15, x29, x15, ror #27 // ............................................................................................*................ 
- bic x29, x6, x1, ror #57 // .............................................................................................*............... - eor x20, x0, x20, ror #21 // ..............................................................................................*.............. - bic x0, x11, x28, ror #63 // ...............................................................................................*............. - add w27, w27, #1 // ................................................................................................*............ - str w27, [sp, #STACK_OFFSET_COUNT] // .................................................................................................*........... // @slothy:writes=STACK_OFFSET_COUNT - eor x25, x29, x25, ror #53 // ..................................................................................................*.......... - bic x6, x16, x11, ror #42 // ...................................................................................................*......... - eor x1, x30, x0, ror #21 // ....................................................................................................*........ - bic x29, x21, x16, ror #57 // .....................................................................................................*....... - bic x0, x30, x21, ror #50 // .......................................................................................................*..... - bic x30, x28, x30, ror #44 // .........................................................................................................*... - eor x6, x6, x28, ror #41 // ......................................................................................................*...... - eor x16, x0, x16, ror #43 // ..........................................................................................................*.. 
- eor x11, x29, x11, ror #35 // ........................................................................................................*.... - eor x21, x30, x21, ror #30 // ...........................................................................................................*. - eor x1, x1, x26 // ............................................................................................................* - - // ----------------------------------------------- new position -----------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|-------- - // eor x27, x15, x11, ror #52 // *............................................................................................................ - // eor x30, x1, x2, ror #61 // .*........................................................................................................... - // eor x29, x23, x22, ror #50 // ..*.......................................................................................................... - // eor x26, x8, x9, ror #57 // ...*......................................................................................................... - // eor x28, x16, x18, ror #63 // ....*........................................................................................................ - // eor x27, x27, x13, ror #48 // .....*....................................................................................................... - // eor x30, x30, x4, ror #54 // .............*............................................................................................... - // eor x29, x29, x24, ror #34 // ......*...................................................................................................... - // eor x26, x26, x6, ror #51 // .......*..................................................................................................... 
- // eor x28, x28, x19, ror #37 // ........*.................................................................................................... - // eor x27, x27, x14, ror #10 // .........*................................................................................................... - // eor x30, x30, x3, ror #39 // .................*........................................................................................... - // eor x29, x29, x21, ror #26 // ..........*.................................................................................................. - // eor x26, x26, x10, ror #31 // ...........*................................................................................................. - // eor x28, x28, x17, ror #36 // ............*................................................................................................ - // eor x27, x27, x12, ror #5 // ..............*.............................................................................................. - // eor x30, x30, x5, ror #25 // .....................*....................................................................................... - // eor x29, x29, x25, ror #15 // ...............*............................................................................................. - // eor x26, x26, x7, ror #27 // ................*............................................................................................ - // eor x28, x28, x20, ror #2 // ...................*......................................................................................... - // eor x0, x30, x27, ror #61 // .........................*................................................................................... - // ror x27, x27, #62 // ..................*.......................................................................................... 
- // eor x27, x27, x29, ror #57 // ....................*........................................................................................ - // ror x29, x29, #58 // ......................*...................................................................................... - // eor x29, x29, x26, ror #55 // .......................*..................................................................................... - // ror x26, x26, #56 // ........................*.................................................................................... - // eor x26, x26, x28, ror #63 // ..........................*.................................................................................. - // eor x28, x28, x30, ror #63 // ...........................*................................................................................. - // eor x30, x29, x1 // ............................*................................................................................ - // eor x1, x26, x11, ror #50 // .............................*............................................................................... - // eor x11, x26, x13, ror #46 // ..............................*.............................................................................. - // eor x13, x27, x18, ror #63 // ...............................*............................................................................. - // eor x18, x28, x24, ror #28 // ................................*............................................................................ - // eor x24, x27, x20, ror #2 // .................................*........................................................................... - // eor x20, x29, x4, ror #54 // ..................................*.......................................................................... - // eor x4, x0, x6, ror #43 // ...................................*......................................................................... 
- // eor x6, x27, x17, ror #36 // ....................................*........................................................................ - // eor x17, x0, x9, ror #49 // .....................................*....................................................................... - // eor x9, x26, x12, ror #3 // ......................................*...................................................................... - // eor x12, x29, x3, ror #39 // .......................................*..................................................................... - // eor x3, x27, x16 // ........................................*.................................................................... - // eor x16, x27, x19, ror #37 // .........................................*................................................................... - // eor x19, x26, x14, ror #8 // ..........................................*.................................................................. - // eor x14, x0, x8, ror #56 // ...........................................*................................................................. - // eor x8, x28, x22, ror #44 // ............................................*................................................................ - // eor x22, x26, x15, ror #62 // .............................................*............................................................... - // eor x15, x28, x23, ror #58 // ..............................................*.............................................................. - // eor x23, x29, x5, ror #25 // ...............................................*............................................................. - // eor x5, x28, x21, ror #20 // ................................................*............................................................ 
- // eor x21, x28, x25, ror #9 // .................................................*........................................................... - // eor x25, x0, x10, ror #23 // ...................................................*......................................................... - // eor x10, x29, x2, ror #61 // ....................................................*........................................................ - // eor x28, x0, x7, ror #19 // .....................................................*....................................................... - // ldr x26, [sp, #(STACK_OFFSET_CONST)] // ..................................................*.......................................................... - // ldr w27, [sp, #STACK_OFFSET_COUNT] // .........................................................*................................................... - // bic x0, x12, x8, ror #47 // ......................................................*...................................................... - // bic x29, x17, x12, ror #42 // ........................................................*.................................................... - // eor x2, x0, x3, ror #39 // ..........................................................*.................................................. - // bic x0, x22, x17, ror #16 // .......................................................*..................................................... - // eor x7, x29, x8, ror #25 // ...........................................................*................................................. - // bic x29, x3, x22, ror #31 // ............................................................*................................................ - // eor x12, x0, x12, ror #58 // ..............................................................*.............................................. 
- // bic x0, x8, x3, ror #56 // .............................................................*............................................... - // eor x17, x29, x17, ror #47 // ...............................................................*............................................. - // bic x29, x13, x9, ror #19 // ................................................................*............................................ - // eor x22, x0, x22, ror #23 // .................................................................*........................................... - // bic x0, x18, x13, ror #47 // ..................................................................*.......................................... - // eor x3, x29, x4, ror #24 // .....................................................................*....................................... - // bic x29, x23, x18, ror #10 // ...................................................................*......................................... - // eor x8, x0, x9, ror #2 // ......................................................................*...................................... - // bic x0, x4, x23, ror #47 // ....................................................................*........................................ - // eor x13, x29, x13, ror #57 // .......................................................................*..................................... - // bic x29, x9, x4, ror #5 // ........................................................................*.................................... - // eor x18, x0, x18, ror #57 // .........................................................................*................................... - // bic x0, x14, x10, ror #38 // ..........................................................................*.................................. 
- // eor x23, x29, x23, ror #52 // ...........................................................................*................................. - // bic x29, x19, x14, ror #5 // ............................................................................*................................ - // eor x4, x0, x5, ror #47 // .............................................................................*............................... - // bic x0, x24, x19, ror #41 // ..............................................................................*.............................. - // eor x9, x29, x10, ror #43 // ...............................................................................*............................. - // bic x29, x5, x24, ror #35 // ................................................................................*............................ - // eor x14, x0, x14, ror #46 // .................................................................................*........................... - // bic x0, x10, x5, ror #9 // ..................................................................................*.......................... - // ldr x26, [x26, w27, UXTW #3] // ...................................................................................*......................... - // eor x19, x29, x19, ror #12 // ......................................................................................*...................... - // bic x29, x15, x6, ror #48 // .....................................................................................*....................... - // eor x24, x0, x24, ror #44 // .......................................................................................*..................... - // bic x0, x20, x15, ror #2 // ........................................................................................*.................... 
- // eor x5, x29, x1, ror #41 // ..........................................................................................*.................. - // bic x29, x25, x20, ror #25 // ....................................................................................*........................ - // eor x10, x0, x6, ror #50 // ...........................................................................................*................. - // bic x0, x1, x25, ror #60 // .........................................................................................*................... - // eor x15, x29, x15, ror #27 // ............................................................................................*................ - // bic x29, x6, x1, ror #57 // .............................................................................................*............... - // eor x20, x0, x20, ror #21 // ..............................................................................................*.............. - // bic x0, x11, x28, ror #63 // ...............................................................................................*............. - // add w27, w27, #1 // ................................................................................................*............ - // str w27, [sp, #STACK_OFFSET_COUNT] // .................................................................................................*........... - // eor x25, x29, x25, ror #53 // ..................................................................................................*.......... - // bic x29, x16, x11, ror #42 // ...................................................................................................*......... - // eor x1, x30, x0, ror #21 // ....................................................................................................*........ 
- // bic x0, x21, x16, ror #57 // .....................................................................................................*....... - // eor x6, x29, x28, ror #41 // ........................................................................................................*.... - // bic x29, x30, x21, ror #50 // ......................................................................................................*...... - // eor x11, x0, x11, ror #35 // ..........................................................................................................*.. - // bic x0, x28, x30, ror #44 // .......................................................................................................*..... - // eor x16, x29, x16, ror #43 // .........................................................................................................*... - // eor x21, x0, x21, ror #30 // ...........................................................................................................*. - // eor x1, x1, x26 // ............................................................................................................* + // Cycle bound: 57.0 + // IPC bound: 1.98 + // + // Wall time: 15.98s + // User time: 15.98s + // + // ------------------- cycle (expected) -------------------> + // 0 25 50 + // |------------------------|------------------------|------ + eor x27, x1, x2, ror #61 // *........................................................ + eor x28, x27, x4, ror #54 // *........................................................ + eor x28, x28, x3, ror #39 // .*....................................................... + eor x0, x28, x5, ror #25 // .*....................................................... + eor x28, x8, x9, ror #57 // ..*...................................................... + eor x26, x28, x6, ror #51 // ...*..................................................... 
+ eor x28, x26, x10, ror #31 // ...*..................................................... + eor x28, x28, x7, ror #27 // ....*.................................................... + eor x26, x15, x11, ror #52 // ....*.................................................... + eor x26, x26, x13, ror #48 // .....*................................................... + eor x26, x26, x14, ror #10 // .....*................................................... + eor x30, x26, x12, ror #5 // ......*.................................................. + eor x26, x16, x18, ror #63 // ......*.................................................. + eor x26, x26, x19, ror #37 // .......*................................................. + eor x26, x26, x17, ror #36 // .......*................................................. + eor x29, x26, x20, ror #2 // ........*................................................ + eor x26, x23, x22, ror #50 // ........*................................................ + eor x26, x26, x24, ror #34 // .........*............................................... + eor x26, x26, x21, ror #26 // .........*............................................... + eor x27, x26, x25, ror #15 // ..........*.............................................. + eor x26, x0, x30, ror #61 // ..........*.............................................. + ror x30, x30, #62 // ...........*............................................. + eor x30, x30, x27, ror #57 // ...........*............................................. + ror x27, x27, #58 // ............*............................................ + eor x27, x27, x28, ror #55 // ............*............................................ + ror x28, x28, #56 // .............*........................................... + eor x28, x28, x29, ror #63 // .............*........................................... + eor x0, x29, x0, ror #63 // ..............*.......................................... 
+ str x7, [sp, #16] // ..............*.......................................... // @slothy:writes=Age + str x2, [sp, #24] // ...............*......................................... // @slothy:writes=Aga + ldr x2, [sp, #24] // ...............*......................................... // @slothy:reads=Aga + ldr x7, [sp, #16] // ................*........................................ // @slothy:reads=Age + eor x1, x27, x1 // ................*........................................ + eor x11, x28, x11, ror #50 // .................*....................................... + eor x29, x28, x13, ror #46 // .................*....................................... + eor x13, x30, x18, ror #63 // ..................*...................................... + eor x18, x0, x24, ror #28 // ..................*...................................... + eor x24, x30, x20, ror #2 // ...................*..................................... + eor x20, x27, x4, ror #54 // ...................*..................................... + eor x4, x26, x6, ror #43 // ....................*.................................... + eor x6, x30, x17, ror #36 // ....................*.................................... + eor x17, x26, x9, ror #49 // .....................*................................... + eor x9, x28, x12, ror #3 // .....................*................................... + eor x12, x27, x3, ror #39 // ......................*.................................. + eor x3, x30, x16 // ......................*.................................. + eor x16, x30, x19, ror #37 // .......................*................................. + eor x19, x28, x14, ror #8 // .......................*................................. + eor x14, x26, x8, ror #56 // ........................*................................ + eor x8, x0, x22, ror #44 // ........................*................................ + eor x28, x28, x15, ror #62 // .........................*............................... 
+ eor x15, x0, x23, ror #58 // .........................*............................... + eor x23, x27, x5, ror #25 // ..........................*.............................. + eor x21, x0, x21, ror #20 // ..........................*.............................. + eor x30, x0, x25, ror #9 // ...........................*............................. + eor x25, x26, x10, ror #23 // ...........................*............................. + eor x10, x27, x2, ror #61 // ............................*............................ + eor x26, x26, x7, ror #19 // ............................*............................ + ldr x7, [sp, #STACK_OFFSET_CONST] // .............................*........................... + ldr w5, [sp, #STACK_OFFSET_COUNT] // .............................*........................... // @slothy:reads=STACK_OFFSET_COUNT + ldr x0, [x7, w5, UXTW #3] // ..............................*.......................... + add w27, w5, #1 // ..............................*.......................... + str w27, [sp, #STACK_OFFSET_COUNT] // ...............................*......................... // @slothy:writes=STACK_OFFSET_COUNT + bic x5, x8, x12, ror #47 // ...............................*......................... + eor x2, x5, x3, ror #39 // ................................*........................ + bic x5, x12, x17, ror #42 // ................................*........................ + eor x7, x5, x8, ror #25 // .................................*....................... + bic x5, x17, x28, ror #16 // .................................*....................... + eor x12, x5, x12, ror #58 // ..................................*...................... + bic x5, x28, x3, ror #31 // ..................................*...................... + eor x17, x5, x17, ror #47 // ...................................*..................... + bic x5, x3, x8, ror #56 // ...................................*..................... 
+ eor x22, x5, x28, ror #23 // ....................................*.................... + bic x28, x9, x13, ror #19 // ....................................*.................... + eor x3, x28, x4, ror #24 // .....................................*................... + bic x5, x13, x18, ror #47 // .....................................*................... + eor x8, x5, x9, ror #2 // ......................................*.................. + bic x5, x18, x23, ror #10 // ......................................*.................. + eor x13, x5, x13, ror #57 // .......................................*................. + bic x5, x23, x4, ror #47 // .......................................*................. + eor x18, x5, x18, ror #57 // ........................................*................ + bic x5, x4, x9, ror #5 // ........................................*................ + eor x23, x5, x23, ror #52 // .........................................*............... + bic x5, x10, x14, ror #38 // .........................................*............... + eor x4, x5, x21, ror #47 // ..........................................*.............. + bic x5, x14, x19, ror #5 // ..........................................*.............. + eor x9, x5, x10, ror #43 // ...........................................*............. + bic x5, x19, x24, ror #41 // ...........................................*............. + eor x14, x5, x14, ror #46 // ............................................*............ + bic x5, x24, x21, ror #35 // ............................................*............ + eor x19, x5, x19, ror #12 // .............................................*........... + bic x5, x21, x10, ror #9 // .............................................*........... + eor x24, x5, x24, ror #44 // ..............................................*.......... + bic x5, x6, x15, ror #48 // ..............................................*.......... 
+ eor x5, x5, x11, ror #41 // ...............................................*......... + bic x28, x15, x20, ror #2 // ...............................................*......... + eor x10, x28, x6, ror #50 // ................................................*........ + bic x28, x20, x25, ror #25 // ................................................*........ + eor x15, x28, x15, ror #27 // .................................................*....... + bic x28, x25, x11, ror #60 // .................................................*....... + eor x20, x28, x20, ror #21 // ..................................................*...... + bic x28, x11, x6, ror #57 // ..................................................*...... + eor x25, x28, x25, ror #53 // ...................................................*..... + bic x28, x26, x29, ror #63 // ...................................................*..... + eor x21, x28, x1, ror #21 // ....................................................*.... + bic x28, x29, x16, ror #42 // ....................................................*.... + eor x6, x28, x26, ror #41 // .....................................................*... + bic x11, x16, x30, ror #57 // .....................................................*... + bic x28, x30, x1, ror #50 // ......................................................*.. + eor x11, x11, x29, ror #35 // ......................................................*.. + eor x16, x28, x16, ror #43 // .......................................................*. + bic x28, x1, x26, ror #44 // .......................................................*. 
+ eor x1, x21, x0 // ........................................................* + eor x21, x28, x30, ror #30 // ........................................................* + + // ------------------- cycle (expected) -------------------> + // 0 25 50 + // |------------------------|------------------------|------ + // eor X, x1, x2, ror #61 // *........................................................ + // eor X, X, x4, ror #54 // *........................................................ + // eor X, X, x3, ror #39 // .*....................................................... + // eor X, X, x5, ror #25 // .*....................................................... + // eor X, x8, x9, ror #57 // ..*...................................................... + // eor X, X, x6, ror #51 // ...*..................................................... + // eor X, X, x10, ror #31 // ...*..................................................... + // eor X, X, x7, ror #27 // ....*.................................................... + // eor X, x15, x11, ror #52 // ....*.................................................... + // eor X, X, x13, ror #48 // .....*................................................... + // eor X, X, x14, ror #10 // .....*................................................... + // eor X, X, x12, ror #5 // ......*.................................................. + // eor X, x16, x18, ror #63 // ......*.................................................. + // eor X, X, x19, ror #37 // .......*................................................. + // eor X, X, x17, ror #36 // .......*................................................. + // eor X, X, x20, ror #2 // ........*................................................ + // eor X, x23, x22, ror #50 // ........*................................................ + // eor X, X, x24, ror #34 // .........*............................................... + // eor X, X, x21, ror #26 // .........*............................................... 
+ // eor X, X, x25, ror #15 // ..........*.............................................. + // eor X, X, X, ror #61 // ..........*.............................................. + // ror X, X, #62 // ...........*............................................. + // eor X, X, X, ror #57 // ...........*............................................. + // ror X, X, #58 // ............*............................................ + // eor X, X, X, ror #55 // ............*............................................ + // ror X, X, #56 // .............*........................................... + // eor X, X, X, ror #63 // .............*........................................... + // eor X, X, X, ror #63 // ..............*.......................................... + // str x7, [sp, #16] // ..............*.......................................... + // str x2, [sp, #24] // ...............*......................................... + // ldr x2, [sp, #24] // ...............*......................................... + // ldr x7, [sp, #16] // ................*........................................ + // eor x30, X, x1 // ................*........................................ + // eor x1, X, x11, ror #50 // .................*....................................... + // eor x11, X, x13, ror #46 // .................*....................................... + // eor x13, X, x18, ror #63 // ..................*...................................... + // eor x18, X, x24, ror #28 // ..................*...................................... + // eor x24, X, x20, ror #2 // ...................*..................................... + // eor x20, X, x4, ror #54 // ...................*..................................... + // eor x4, X, x6, ror #43 // ....................*.................................... + // eor x6, X, x17, ror #36 // ....................*.................................... + // eor x17, X, x9, ror #49 // .....................*................................... 
+ // eor x9, X, x12, ror #3 // .....................*................................... + // eor x12, X, x3, ror #39 // ......................*.................................. + // eor x3, X, x16 // ......................*.................................. + // eor x16, X, x19, ror #37 // .......................*................................. + // eor x19, X, x14, ror #8 // .......................*................................. + // eor x14, X, x8, ror #56 // ........................*................................ + // eor x8, X, x22, ror #44 // ........................*................................ + // eor x22, X, x15, ror #62 // .........................*............................... + // eor x15, X, x23, ror #58 // .........................*............................... + // eor x23, X, x5, ror #25 // ..........................*.............................. + // eor x5, X, x21, ror #20 // ..........................*.............................. + // eor x21, X, x25, ror #9 // ...........................*............................. + // eor x25, X, x10, ror #23 // ...........................*............................. + // eor x10, X, x2, ror #61 // ............................*............................ + // eor x28, X, x7, ror #19 // ............................*............................ + // ldr x26, [sp, #(STACK_OFFSET_CONST)] // .............................*........................... + // ldr w27, [sp, #STACK_OFFSET_COUNT] // .............................*........................... + // ldr x26, [x26, w27, UXTW #3] // ..............................*.......................... + // add w27, w27, #1 // ..............................*.......................... + // str w27, [sp, #STACK_OFFSET_COUNT] // ...............................*......................... + // bic X, x8, x12, ror #47 // ...............................*......................... 
+ // eor x2, X, x3, ror #39 // ................................*........................ + // bic X, x12, x17, ror #42 // ................................*........................ + // eor x7, X, x8, ror #25 // .................................*....................... + // bic X, x17, x22, ror #16 // .................................*....................... + // eor x12, X, x12, ror #58 // ..................................*...................... + // bic X, x22, x3, ror #31 // ..................................*...................... + // eor x17, X, x17, ror #47 // ...................................*..................... + // bic X, x3, x8, ror #56 // ...................................*..................... + // eor x22, X, x22, ror #23 // ....................................*.................... + // bic X, x9, x13, ror #19 // ....................................*.................... + // eor x3, X, x4, ror #24 // .....................................*................... + // bic X, x13, x18, ror #47 // .....................................*................... + // eor x8, X, x9, ror #2 // ......................................*.................. + // bic X, x18, x23, ror #10 // ......................................*.................. + // eor x13, X, x13, ror #57 // .......................................*................. + // bic X, x23, x4, ror #47 // .......................................*................. + // eor x18, X, x18, ror #57 // ........................................*................ + // bic X, x4, x9, ror #5 // ........................................*................ + // eor x23, X, x23, ror #52 // .........................................*............... + // bic X, x10, x14, ror #38 // .........................................*............... + // eor x4, X, x5, ror #47 // ..........................................*.............. + // bic X, x14, x19, ror #5 // ..........................................*.............. 
+ // eor x9, X, x10, ror #43 // ...........................................*............. + // bic X, x19, x24, ror #41 // ...........................................*............. + // eor x14, X, x14, ror #46 // ............................................*............ + // bic X, x24, x5, ror #35 // ............................................*............ + // eor x19, X, x19, ror #12 // .............................................*........... + // bic X, x5, x10, ror #9 // .............................................*........... + // eor x24, X, x24, ror #44 // ..............................................*.......... + // bic X, x6, x15, ror #48 // ..............................................*.......... + // eor x5, X, x1, ror #41 // ...............................................*......... + // bic X, x15, x20, ror #2 // ...............................................*......... + // eor x10, X, x6, ror #50 // ................................................*........ + // bic X, x20, x25, ror #25 // ................................................*........ + // eor x15, X, x15, ror #27 // .................................................*....... + // bic X, x25, x1, ror #60 // .................................................*....... + // eor x20, X, x20, ror #21 // ..................................................*...... + // bic X, x1, x6, ror #57 // ..................................................*...... + // eor x25, X, x25, ror #53 // ...................................................*..... + // bic X, x28, x11, ror #63 // ...................................................*..... + // eor x1, X, x30, ror #21 // ....................................................*.... + // bic X, x11, x16, ror #42 // ....................................................*.... + // eor x6, X, x28, ror #41 // .....................................................*... + // bic X, x16, x21, ror #57 // .....................................................*... 
+ // eor x11, X, x11, ror #35 // ......................................................*.. + // bic X, x21, x30, ror #50 // ......................................................*.. + // eor x16, X, x16, ror #43 // .......................................................*. + // bic X, x30, x28, ror #44 // .......................................................*. + // eor x21, X, x21, ror #30 // ........................................................* + // eor x1, x1, x26 // ........................................................* end_loop: cmp count, #(KECCAK_F1600_ROUNDS-1) ble loop - - final_rotate_store +final: + final_rotate + ldr input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:reads=STACK_OFFSET_INPUT + store_state +end_final: restore_gprs free_stack ret \ No newline at end of file