Skip to content

Commit

Permalink
Slightly faster still
Browse files Browse the repository at this point in the history
  • Loading branch information
hectorchu committed Mar 3, 2025
1 parent 93a4e24 commit eba1cca
Showing 1 changed file with 120 additions and 106 deletions.
226 changes: 120 additions & 106 deletions ltcutil/scrypt/scrypt_arm64.s
Original file line number Diff line number Diff line change
@@ -1,119 +1,133 @@
#include "textflag.h"

// EORP(n, Ra1, Ra2, Rb1, Rb2, Rc1, Rc2): XOR one register pair of memory into
// another, in place.
//   loads the pair at n(Ra1) into (Rb1, Rb2),
//   loads the pair at n(Ra2) into (Rc1, Rc2),
//   XORs so that (Rc1, Rc2) ^= (Rb1, Rb2),
//   stores (Rc1, Rc2) back to n(Ra1).
// On exit (Rc1, Rc2) still hold the XOR result; Rb1/Rb2 are clobbered.
// NOTE(review): this listing also contains a three-parameter macro named EORP
// further down; presumably the two belong to different revisions of the
// file - confirm which definition is current.
#define EORP(n, Ra1, Ra2, Rb1, Rb2, Rc1, Rc2) \
LDP n(Ra1), (Rb1, Rb2) \
LDP n(Ra2), (Rc1, Rc2) \
EOR Rb1, Rc1, Rc1 \
EOR Rb2, Rc2, Rc2 \
STP (Rc1, Rc2), n(Ra1)

// ADDPW(n, Ra, _, Rb1, Rb2, Rc1, Rc2): 32-bit add-and-store of a word pair.
//   loads the word pair at n(Ra) into (Rb1, Rb2),
//   adds them into (Rc1, Rc2) with 32-bit adds,
//   stores (Rc1, Rc2) back to n(Ra) as a word pair.
// The third parameter is never referenced in the body; it presumably exists
// only so the parameter list lines up with the seven-parameter EORP when both
// are passed as OP to BLK.
// Clobbers Rb1, Rb2.
#define ADDPW(n, Ra, _, Rb1, Rb2, Rc1, Rc2) \
LDPW n(Ra), (Rb1, Rb2) \
ADDW Rb1, Rc1, Rc1 \
ADDW Rb2, Rc2, Rc2 \
STPW (Rc1, Rc2), n(Ra)

// BLK(OP, w, n, Ra..Rh): expands the seven-parameter macro OP four times, at
// byte offsets n, w+n, 2*w+n and 3*w+n.
// Every expansion uses R16 and R17 as the two base registers; the temporaries
// are fixed pairs (R19,R20), (R21,R22), (R23,R24), (R25,R26), and the
// caller-supplied pairs (Ra,Rb), (Rc,Rd), (Re,Rf), (Rg,Rh) receive the results.
// w is the byte stride between consecutive expansions.
#define BLK(OP, w, n, Ra, Rb, Rc, Rd, Re, Rf, Rg, Rh) \
OP(0*w+n, R16, R17, R19, R20, Ra, Rb) \
OP(1*w+n, R16, R17, R21, R22, Rc, Rd) \
OP(2*w+n, R16, R17, R23, R24, Re, Rf) \
OP(3*w+n, R16, R17, R25, R26, Rg, Rh)

// ADDEORW(Ra, Rb, Rc, n, Rd): one 32-bit add-rotate-xor step.
//   Rc = Ra + Rb          (32-bit add; Rc is a scratch register)
//   Rd ^= Rc rotated right by n bits (the @> shifted-register operand)
#define ADDEORW(Ra, Rb, Rc, n, Rd) \
ADDW Ra, Rb, Rc \
EORW Rc@>n, Rd, Rd

// QRTRND(Ra, Rb, Rc, ..., Rl, n): four independent ADDEORW steps that share the
// rotate amount n. The arguments are consumed in groups of three
// (addend, addend, xor target): (Ra,Rb,Rc), (Rd,Re,Rf), (Rg,Rh,Ri), (Rj,Rk,Rl).
// Each step gets its own scratch register (R19, R20, R21, R22), so the four
// add/xor chains have no register dependency on each other.
// Clobbers R19-R22.
#define QRTRND(Ra, Rb, Rc, Rd, Re, Rf, Rg, Rh, Ri, Rj, Rk, Rl, n) \
ADDEORW(Ra, Rb, R19, n, Rc) \
ADDEORW(Rd, Re, R20, n, Rf) \
ADDEORW(Rg, Rh, R21, n, Ri) \
ADDEORW(Rj, Rk, R22, n, Rl)
// BLK_COPY: copies 128 bytes from the address in R16 to the address in R17.
// The copy is done as four 32-byte load/store pairs (offsets 0, 32, 64, 96)
// through the vector register pairs (F0,F1), (F2,F3), (F4,F5), (F6,F7).
// R16 and R17 are left unchanged; F0-F7 are clobbered.
#define BLK_COPY \
FLDPQ 0(R16), (F0, F1) \
FSTPQ (F0, F1), 0(R17) \
FLDPQ 32(R16), (F2, F3) \
FSTPQ (F2, F3), 32(R17) \
FLDPQ 64(R16), (F4, F5) \
FSTPQ (F4, F5), 64(R17) \
FLDPQ 96(R16), (F6, F7) \
FSTPQ (F6, F7), 96(R17)

// BLK_MIX: runs ·eorSalsa8 twice over the two 64-byte halves of a 128-byte
// block.
//   1st call: R16 = value on entry, R17 = R16 + 64
//   2nd call: R17 = X (reloaded from the X+0(FP) argument), R16 = X + 64
// Both users in this file set R16 = X before expanding BLK_MIX, so the first
// call sees (X, X+64) and the second sees (X+64, X).
// Must be expanded inside a TEXT block whose frame declares X+0(FP).
// Clobbers R16, R17 and whatever ·eorSalsa8 clobbers.
#define BLK_MIX \
ADD $64, R16, R17 \
CALL ·eorSalsa8(SB) \
MOVD X+0(FP), R17 \
ADD $64, R17, R16 \
CALL ·eorSalsa8(SB)

// MIX_LOOP1: one step of the fill loop.
//   R16 = X
//   spill R17 to the local slot p-8(SP), because BLK_MIX overwrites R17
//   copy the 128 bytes at X to the address in R17 (BLK_COPY)
//   mix X in place (BLK_MIX)
//   reload R17 and advance it by 128 bytes
// In: R17 = destination for the copy. Out: R17 = R17 + 128.
// Must be expanded inside a TEXT block that declares X+0(FP) and has an
// 8-byte local frame.
#define MIX_LOOP1 \
MOVD X+0(FP), R16 \
MOVD R17, p-8(SP) \
BLK_COPY \
BLK_MIX \
MOVD p-8(SP), R17 \
ADD $128, R17, R17

// EORP(n, Ra, Rb): (Ra, Rb) = pair at n(R16) XOR pair at n(R17).
// The pair from n(R16) is loaded into the fixed temporaries R19/R20 and the
// pair from n(R17) into (Ra, Rb) before the XOR. Nothing is stored back to
// memory. Clobbers R19 and R20.
// NOTE(review): this listing also contains an earlier seven-parameter macro
// named EORP; presumably the two belong to different revisions of the file -
// confirm which definition is current.
#define EORP(n, Ra, Rb) \
LDP n(R16), (R19, R20) \
LDP n(R17), (Ra, Rb) \
EOR R19, Ra, Ra \
EOR R20, Rb, Rb

// EORP_STP(n): XORs the register pair at n(R17) into the pair at n(R16), in
// place. Expands EORP with R0/R1 as the result registers and then stores
// them to n(R16). Clobbers R0, R1 and whatever EORP clobbers.
#define EORP_STP(n) \
EORP (n, R0, R1) \
STP (R0, R1), n(R16)

// MIX_LOOP2: one step of the data-dependent mixing loop.
//   R16 = X, R17 = V
//   R0  = 32-bit word at X+64, masked with 1023 (index 0..1023)
//   R17 = V + R0*128              (R0<<7)
//   XOR the 128 bytes at R17 into X, 16 bytes at a time (eight EORP_STP)
//   mix X in place (BLK_MIX)
// Must be expanded inside a TEXT block that declares X+0(FP) and V+8(FP).
// Clobbers R0, R1, R16, R17 and whatever EORP_STP and BLK_MIX clobber.
#define MIX_LOOP2 \
MOVD X+0(FP), R16 \
MOVD V+8(FP), R17 \
MOVWU 64(R16), R0 \
AND $1023, R0, R0 \
ADD R0<<7, R17, R17 \
EORP_STP(0); EORP_STP(16) \
EORP_STP(32); EORP_STP(48) \
EORP_STP(64); EORP_STP(80) \
EORP_STP(96); EORP_STP(112) \
BLK_MIX

// ·scrypt: frame $8-16, i.e. 8 bytes of locals and 16 bytes of arguments,
// which the body reads as X+0(FP) and V+8(FP).
// NOTE(review): this text comes from a commit diff view and appears to
// interleave removed and added lines: labels L1 and L2 are each defined
// twice, and each loop shows two bodies (one calling eor_salsa8 with R0/R1,
// one built from the MIX_LOOP1/MIX_LOOP2 macros). The comments below describe
// each sequence separately; confirm against the real file before relying on
// the instruction order shown here.
TEXT ·scrypt(SB), NOSPLIT, $8-16
MOVD V+8(FP), R1

// Fill loop, R0/R1 sequence: copy the 128 bytes at X to the slot at R1.
L1: MOVD X+0(FP), R0

FLDPQ 0(R0), (F0, F1)
FSTPQ (F0, F1), 0(R1)
FLDPQ 32(R0), (F2, F3)
FSTPQ (F2, F3), 32(R1)
FLDPQ 64(R0), (F4, F5)
FSTPQ (F4, F5), 64(R1)
FLDPQ 96(R0), (F6, F7)
FSTPQ (F6, F7), 96(R1)

// Spill the slot pointer, then call eor_salsa8 as (X, X+64) and (X+64, X).
MOVD R1, 8(RSP)
ADD $64, R0, R1
CALL eor_salsa8(SB)
MOVD X+0(FP), R1
ADD $64, R1, R0
CALL eor_salsa8(SB)

// Advance the slot pointer by 128 and compare with V + 0x20000 (1024 * 128).
MOVD 8(RSP), R1
ADD $128, R1, R1
MOVD V+8(FP), R2
ADD $0x20000, R2, R2
CMP R1, R2
// Fill loop, macro sequence: R17 is the slot pointer, starting at V.
MOVD V+8(FP), R17

// Four MIX_LOOP1 steps per pass; each advances R17 by 128 bytes.
L1: MIX_LOOP1
MIX_LOOP1
MIX_LOOP1
MIX_LOOP1

// Repeat until R17 reaches V + 0x20000 (1024 slots of 128 bytes).
MOVD V+8(FP), R0
ADD $0x20000, R0, R0
CMP R17, R0
BNE L1

// Mixing loop counters: R1 counts up from 0 in the R0/R1 sequence; the macro
// sequence keeps a down-counter starting at 1024 in the local slot i-8(SP).
MOVD $0, R1
MOVD $1024, R0
MOVD R0, i-8(SP)

// Mixing loop, R0/R1 sequence: spill the counter, then select the slot
// V + ((32-bit word at X+64) & 1023) * 128.
L2: MOVD R1, 8(RSP)
MOVD X+0(FP), R16
MOVD V+8(FP), R17
MOVWU 64(R16), R0
AND $1023, R0, R0
ADD R0<<7, R17, R17

// XOR the selected 128-byte slot into X using the seven-parameter EORP.
BLK(EORP, 16, 0, R0, R1, R2, R3, R4, R5, R6, R7)
BLK(EORP, 16, 64, R0, R1, R2, R3, R4, R5, R6, R7)

// Call eor_salsa8 as (X, X+64) and then (X+64, X).
MOVD R16, R0
ADD $64, R0, R1
CALL eor_salsa8(SB)
MOVD X+0(FP), R1
ADD $64, R1, R0
CALL eor_salsa8(SB)

// Reload the counter, increment, and compare with 1024.
MOVD 8(RSP), R1
ADD $1, R1, R1
CMP $1024, R1
// Mixing loop, macro sequence: four MIX_LOOP2 steps per pass.
L2: MIX_LOOP2
MIX_LOOP2
MIX_LOOP2
MIX_LOOP2

// Subtract 4 from the stored counter (one per MIX_LOOP2 above); SUBS sets the
// flags tested by BNE, and the MOVD store in between does not change them.
MOVD i-8(SP), R0
SUBS $4, R0, R0
MOVD R0, i-8(SP)
BNE L2

RET

// eor_salsa8: takes its two pointers in registers, R0 and R1, and reads no
// stack arguments (frame $0).
//   R0 -> 64-byte block that is updated in place
//   R1 -> 64-byte block that is XORed into it first
// Clobbers R0-R17 and R19-R26.
// NOTE(review): the BLK(EORP, ...) expansion below needs the seven-parameter
// EORP, while the listing also defines a three-parameter EORP; this routine
// presumably belongs to the older revision shown in the diff. No RET appears
// between here and the next #define in this listing - confirm against the
// real file.
TEXT eor_salsa8(SB), NOSPLIT, $0
MOVD R0, R16
MOVD R1, R17

// XOR the 64 bytes at R17 into the 64 bytes at R16 (stored back to memory);
// the eight 64-bit results stay in the even registers R0, R2, ..., R14.
BLK(EORP, 16, 0, R0, R2, R4, R6, R8, R10, R12, R14)

// Split each 64-bit value: the odd register gets the high 32-bit word; the
// even register is used through 32-bit (W) operations from here on.
LSR $32, R0, R1
LSR $32, R2, R3
LSR $32, R4, R5
LSR $32, R6, R7
LSR $32, R8, R9
LSR $32, R10, R11
LSR $32, R12, R13
LSR $32, R14, R15

// R17 is reused as the loop counter (the source pointer is no longer needed).
MOVD $0, R17

// Each pass applies eight QRTRND groups with rotate-right amounts
// 25, 23, 19, 14 (twice); the loop runs 4 passes.
L: QRTRND(R0, R12, R4, R5, R1, R9, R10, R6, R14, R15, R11, R3, 25)
QRTRND(R4, R0, R8, R9, R5, R13, R14, R10, R2, R3, R15, R7, 23)
QRTRND(R8, R4, R12, R13, R9, R1, R2, R14, R6, R7, R3, R11, 19)
QRTRND(R12, R8, R0, R1, R13, R5, R6, R2, R10, R11, R7, R15, 14)
QRTRND(R0, R3, R1, R5, R4, R6, R10, R9, R11, R15, R14, R12, 25)
QRTRND(R1, R0, R2, R6, R5, R7, R11, R10, R8, R12, R15, R13, 23)
QRTRND(R2, R1, R3, R7, R6, R4, R8, R11, R9, R13, R12, R14, 19)
QRTRND(R3, R2, R0, R4, R7, R5, R9, R8, R10, R14, R13, R15, 14)

ADD $1, R17, R17
CMP $4, R17
BNE L

// Add the 16 working words R0..R15 to the words stored at R16 (the XORed
// input written above) and store the sums back, 8 bytes per ADDPW.
BLK(ADDPW, 8, 0, R0, R1, R2, R3, R4, R5, R6, R7)
BLK(ADDPW, 8, 32, R8, R9, R10, R11, R12, R13, R14, R15)
// EORP_LSR(n, Ra, Rb, Rc, Rd): XOR-load a register pair and split it into
// 32-bit words.
//   (Ra, Rc) = pair at n(R16) XOR pair at n(R17)   (three-parameter EORP)
//   Rb = Ra >> 32, Rd = Rc >> 32
// Ra and Rc keep the full 64-bit values; their low words are what later
// 32-bit (W) instructions operate on. Clobbers whatever EORP clobbers.
#define EORP_LSR(n, Ra, Rb, Rc, Rd) \
EORP (n, Ra, Rc) \
LSR $32, Ra, Rb \
LSR $32, Rc, Rd

// EORP_LSR_MOV(n, Ra, Rb, Rc, Rd, Re, Rf): EORP_LSR followed by saving the two
// packed 64-bit XOR results: Re = Ra and Rf = Rc.
// The copies are taken after the EORP expansion, so Re/Rf may be R19/R20 even
// though EORP uses those registers as temporaries.
#define EORP_LSR_MOV(n, Ra, Rb, Rc, Rd, Re, Rf) \
EORP_LSR(n, Ra, Rb, Rc, Rd) \
MOVD Ra, Re \
MOVD Rc, Rf

// ADD_ROR_EOR(Ra, Rb, n, Rc): one 32-bit add-rotate-xor step.
//   R17 = Ra + Rb         (32-bit add; R17 is the fixed scratch register)
//   Rc ^= R17 rotated right by n bits (the @> shifted-register operand)
// Clobbers R17.
#define ADD_ROR_EOR(Ra, Rb, n, Rc) \
ADDW Ra, Rb, R17 \
EORW R17@>n, Rc, Rc

// QR(Ra, Rb, Rc, ..., Rl, n): four ADD_ROR_EOR steps that share the rotate
// amount n. The arguments are consumed in groups of three
// (addend, addend, xor target): (Ra,Rb,Rc), (Rd,Re,Rf), (Rg,Rh,Ri), (Rj,Rk,Rl).
// All four steps go through the same scratch register R17.
#define QR(Ra, Rb, Rc, Rd, Re, Rf, Rg, Rh, Ri, Rj, Rk, Rl, n) \
ADD_ROR_EOR(Ra, Rb, n, Rc) \
ADD_ROR_EOR(Rd, Re, n, Rf) \
ADD_ROR_EOR(Rg, Rh, n, Ri) \
ADD_ROR_EOR(Rj, Rk, n, Rl)

// DBL_ROUND: eight QR groups over the 16 working words held in R0..R15, using
// rotate-right amounts 25, 23, 19, 14 and then 25, 23, 19, 14 again. On 32-bit
// words these equal left rotations by 7, 9, 13 and 18.
// The first four groups and the last four use different register
// permutations; presumably these are the column and row halves of a Salsa20
// double round - verify against the reference algorithm.
// Clobbers R17 (via ADD_ROR_EOR).
#define DBL_ROUND \
QR(R0, R12, R4, R5, R1, R9, R10, R6, R14, R15, R11, R3, 25) \
QR(R4, R0, R8, R9, R5, R13, R14, R10, R2, R3, R15, R7, 23) \
QR(R8, R4, R12, R13, R9, R1, R2, R14, R6, R7, R3, R11, 19) \
QR(R12, R8, R0, R1, R13, R5, R6, R2, R10, R11, R7, R15, 14) \
QR(R0, R3, R1, R5, R4, R6, R10, R9, R11, R15, R14, R12, 25) \
QR(R1, R0, R2, R6, R5, R7, R11, R10, R8, R12, R15, R13, 23) \
QR(R2, R1, R3, R7, R6, R4, R8, R11, R9, R13, R12, R14, 19) \
QR(R3, R2, R0, R4, R7, R5, R9, R8, R10, R14, R13, R15, 14)

// ADD_STP(n, Ra, Rb, Rc): add a packed pair of saved words to two working
// words and store the result.
//   Ra holds two 32-bit words packed in one 64-bit register
//   Rb += low word of Ra      (32-bit add)
//   Rc += Ra >> 32            (64-bit add; only the low word is stored)
//   store the low 32 bits of Rb and Rc as a word pair at n(R16)
#define ADD_STP(n, Ra, Rb, Rc) \
ADDW Ra, Rb, Rb \
ADD Ra>>32, Rc, Rc \
STPW (Rb, Rc), n(R16)

// ·eorSalsa8: takes its two pointers in registers, R16 and R17, and reads no
// stack arguments (frame $0).
//   R16 -> 64-byte block; read as input and overwritten with the result
//   R17 -> 64-byte block that is XORed with the block at R16 first
// Steps: XOR the two blocks into R0..R15 (one 32-bit word per register),
// keep a packed copy of the XORed input in R19-R26, run four DBL_ROUND
// expansions, then add the saved copy word by word and store the 64-byte
// sum at R16.
// R16 is preserved. Clobbers R0-R15, R17 and R19-R26.
TEXT ·eorSalsa8(SB), NOSPLIT, $0
// Load/XOR 16 bytes per line. Even registers hold the packed 64-bit value
// (low word used), odd registers the high word; the last two arguments
// receive the packed copies for the final addition.
// R19/R20 are taken as copy targets last because EORP uses them as
// temporaries in every expansion.
EORP_LSR_MOV(0, R0, R1, R2, R3, R21, R22)
EORP_LSR_MOV(16, R4, R5, R6, R7, R23, R24)
EORP_LSR_MOV(32, R8, R9, R10, R11, R25, R26)
EORP_LSR_MOV(48, R12, R13, R14, R15, R19, R20)

// Four fully unrolled DBL_ROUND expansions (no loop counter).
DBL_ROUND
DBL_ROUND
DBL_ROUND
DBL_ROUND

// Add the saved input words to the working words and store, 8 bytes per
// line, at offsets 0..56 from R16.
ADD_STP(0, R21, R0, R1)
ADD_STP(8, R22, R2, R3)
ADD_STP(16, R23, R4, R5)
ADD_STP(24, R24, R6, R7)
ADD_STP(32, R25, R8, R9)
ADD_STP(40, R26, R10, R11)
ADD_STP(48, R19, R12, R13)
ADD_STP(56, R20, R14, R15)

RET

0 comments on commit eba1cca

Please sign in to comment.