From 9cfc373bb339d9840970c23b29e17b5164be05fe Mon Sep 17 00:00:00 2001
From: Hector Chu
Date: Fri, 28 Feb 2025 18:28:35 +0000
Subject: [PATCH] Scrypt ARM64 assembly 20% faster than C on mobile

---
Review notes (placed after the three-dash separator, so they are not part
of the commit message):

* Line structure has been restored to one diff record per line. The
  result matches the hunk headers (91 lines removed from scrypt.c,
  16 -> 10 lines in scrypt.go, 24 and 122 lines added) and the diffstat
  (151 insertions, 102 deletions).
* Leading indentation inside the hunks was re-created with tabs and runs
  of spaces may have been normalised; check the hunks against the blobs
  named on the "index" lines before applying.
* The two include operands in the removed scrypt.c were missing. They are
  restored as <stdint.h> (for uint32_t) and <string.h> (for memcpy); the
  order is assumed, please confirm.
* scrypt.go is now built only when the target is not arm64 and calls
  Key from golang.org/x/crypto/scrypt with N=1024, r=1, p=1 and a 32 byte
  result, passing the input as both password and salt. The returned error
  is discarded.
* scrypt_arm64.go keeps the PBKDF2 -> mix -> PBKDF2 shape of the removed
  cgo version. The [1024][32]uint32 scratch area that scrypt_aux declared
  as a local array now comes from a sync.Pool.
* scrypt_arm64.s: the scrypt routine follows the two 1024-iteration loops
  of the removed scrypt_aux (copy X into V and mix; then XOR the V entry
  selected by X[16] & 1023 into X and mix). eor_salsa8 follows the
  removed xor_salsa8; the left rotations by 7/9/13/18 are written as
  right rotations by 25/23/19/14.

 ltcutil/scrypt/scrypt.c        |  91 ------------------------
 ltcutil/scrypt/scrypt.go       |  16 ++---
 ltcutil/scrypt/scrypt_arm64.go |  24 +++++++
 ltcutil/scrypt/scrypt_arm64.s  | 122 +++++++++++++++++++++++++++++++++
 4 files changed, 151 insertions(+), 102 deletions(-)
 delete mode 100644 ltcutil/scrypt/scrypt.c
 create mode 100644 ltcutil/scrypt/scrypt_arm64.go
 create mode 100644 ltcutil/scrypt/scrypt_arm64.s

diff --git a/ltcutil/scrypt/scrypt.c b/ltcutil/scrypt/scrypt.c
deleted file mode 100644
index 16a011bbf7..0000000000
--- a/ltcutil/scrypt/scrypt.c
+++ /dev/null
@@ -1,91 +0,0 @@
-#include <stdint.h>
-#include <string.h>
-
-#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
-
-static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
-{
-	uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
-	int i;
-
-	x00 = (B[ 0] ^= Bx[ 0]);
-	x01 = (B[ 1] ^= Bx[ 1]);
-	x02 = (B[ 2] ^= Bx[ 2]);
-	x03 = (B[ 3] ^= Bx[ 3]);
-	x04 = (B[ 4] ^= Bx[ 4]);
-	x05 = (B[ 5] ^= Bx[ 5]);
-	x06 = (B[ 6] ^= Bx[ 6]);
-	x07 = (B[ 7] ^= Bx[ 7]);
-	x08 = (B[ 8] ^= Bx[ 8]);
-	x09 = (B[ 9] ^= Bx[ 9]);
-	x10 = (B[10] ^= Bx[10]);
-	x11 = (B[11] ^= Bx[11]);
-	x12 = (B[12] ^= Bx[12]);
-	x13 = (B[13] ^= Bx[13]);
-	x14 = (B[14] ^= Bx[14]);
-	x15 = (B[15] ^= Bx[15]);
-
-	for (i = 0; i < 8; i += 2) {
-		/* Operate on columns. */
-		x04 ^= ROTL(x00 + x12, 7); x09 ^= ROTL(x05 + x01, 7);
-		x14 ^= ROTL(x10 + x06, 7); x03 ^= ROTL(x15 + x11, 7);
-
-		x08 ^= ROTL(x04 + x00, 9); x13 ^= ROTL(x09 + x05, 9);
-		x02 ^= ROTL(x14 + x10, 9); x07 ^= ROTL(x03 + x15, 9);
-
-		x12 ^= ROTL(x08 + x04, 13); x01 ^= ROTL(x13 + x09, 13);
-		x06 ^= ROTL(x02 + x14, 13); x11 ^= ROTL(x07 + x03, 13);
-
-		x00 ^= ROTL(x12 + x08, 18); x05 ^= ROTL(x01 + x13, 18);
-		x10 ^= ROTL(x06 + x02, 18); x15 ^= ROTL(x11 + x07, 18);
-
-		/* Operate on rows. */
-		x01 ^= ROTL(x00 + x03, 7); x06 ^= ROTL(x05 + x04, 7);
-		x11 ^= ROTL(x10 + x09, 7); x12 ^= ROTL(x15 + x14, 7);
-
-		x02 ^= ROTL(x01 + x00, 9); x07 ^= ROTL(x06 + x05, 9);
-		x08 ^= ROTL(x11 + x10, 9); x13 ^= ROTL(x12 + x15, 9);
-
-		x03 ^= ROTL(x02 + x01, 13); x04 ^= ROTL(x07 + x06, 13);
-		x09 ^= ROTL(x08 + x11, 13); x14 ^= ROTL(x13 + x12, 13);
-
-		x00 ^= ROTL(x03 + x02, 18); x05 ^= ROTL(x04 + x07, 18);
-		x10 ^= ROTL(x09 + x08, 18); x15 ^= ROTL(x14 + x13, 18);
-	}
-
-	B[ 0] += x00;
-	B[ 1] += x01;
-	B[ 2] += x02;
-	B[ 3] += x03;
-	B[ 4] += x04;
-	B[ 5] += x05;
-	B[ 6] += x06;
-	B[ 7] += x07;
-	B[ 8] += x08;
-	B[ 9] += x09;
-	B[10] += x10;
-	B[11] += x11;
-	B[12] += x12;
-	B[13] += x13;
-	B[14] += x14;
-	B[15] += x15;
-}
-
-void scrypt_aux(uint32_t X[32])
-{
-	uint32_t V[32768], i, j, k;
-
-	for (i = 0; i < 1024; i++) {
-		memcpy(&V[i * 32], X, 128);
-		xor_salsa8(&X[0], &X[16]);
-		xor_salsa8(&X[16], &X[0]);
-	}
-
-	for (i = 0; i < 1024; i++) {
-		j = 32 * (X[16] & 1023);
-		for (k = 0; k < 32; k++)
-			X[k] ^= V[j + k];
-		xor_salsa8(&X[0], &X[16]);
-		xor_salsa8(&X[16], &X[0]);
-	}
-}
diff --git a/ltcutil/scrypt/scrypt.go b/ltcutil/scrypt/scrypt.go
index f2e780a798..6c3e0634fb 100644
--- a/ltcutil/scrypt/scrypt.go
+++ b/ltcutil/scrypt/scrypt.go
@@ -1,16 +1,10 @@
-package scrypt
-
-// void scrypt_aux(unsigned char*);
-import "C"
+//go:build !arm64
 
-import (
-	"crypto/sha256"
+package scrypt
 
-	"golang.org/x/crypto/pbkdf2"
-)
+import scrypt2 "golang.org/x/crypto/scrypt"
 
 func Scrypt(x []byte) []byte {
-	X := pbkdf2.Key(x, x, 1, 128, sha256.New)
-	C.scrypt_aux((*C.uchar)(&X[0]))
-	return pbkdf2.Key(x, X, 1, 32, sha256.New)
+	x, _ = scrypt2.Key(x, x, 1024, 1, 1, 32)
+	return x
 }
diff --git a/ltcutil/scrypt/scrypt_arm64.go b/ltcutil/scrypt/scrypt_arm64.go
new file mode 100644
index 0000000000..9c7869bf1f
--- /dev/null
+++ b/ltcutil/scrypt/scrypt_arm64.go
@@ -0,0 +1,24 @@
+package scrypt
+
+import (
+	"crypto/sha256"
+	"sync"
+
+	"golang.org/x/crypto/pbkdf2"
+)
+
+type scratch [1024][32]uint32
+
+var pool = sync.Pool{New: func() interface{} {
+	return &scratch{}
+}}
+
+func Scrypt(x []byte) []byte {
+	X := pbkdf2.Key(x, x, 1, 128, sha256.New)
+	V := pool.Get().(*scratch)
+	scrypt(&X[0], V)
+	pool.Put(V)
+	return pbkdf2.Key(x, X, 1, 32, sha256.New)
+}
+
+func scrypt(X *byte, V *scratch)
diff --git a/ltcutil/scrypt/scrypt_arm64.s b/ltcutil/scrypt/scrypt_arm64.s
new file mode 100644
index 0000000000..a3773f13d5
--- /dev/null
+++ b/ltcutil/scrypt/scrypt_arm64.s
@@ -0,0 +1,122 @@
+#include "textflag.h"
+
+#define EORP(n, Ra1, Ra2, Rb1, Rb2, Rc1, Rc2) \
+	LDP n(Ra1), (Rb1, Rb2) \
+	LDP n(Ra2), (Rc1, Rc2) \
+	EOR Rb1, Rc1, Rc1 \
+	EOR Rb2, Rc2, Rc2 \
+	STP (Rc1, Rc2), n(Ra1)
+
+#define ADDPW(n, Ra, _, Rb1, Rb2, Rc1, Rc2) \
+	LDPW n(Ra), (Rb1, Rb2) \
+	ADDW Rb1, Rc1, Rc1 \
+	ADDW Rb2, Rc2, Rc2 \
+	STPW (Rc1, Rc2), n(Ra)
+
+#define BLK(OP, w, n, Ra, Rb, Rc, Rd, Re, Rf, Rg, Rh) \
+	OP(0*w+n, R16, R17, R19, R20, Ra, Rb) \
+	OP(1*w+n, R16, R17, R21, R22, Rc, Rd) \
+	OP(2*w+n, R16, R17, R23, R24, Re, Rf) \
+	OP(3*w+n, R16, R17, R25, R26, Rg, Rh)
+
+#define ADDEORW(Ra, Rb, Rc, n, Rd) \
+	ADDW Ra, Rb, Rc \
+	EORW Rc@>n, Rd, Rd
+
+#define QRTRND(Ra1, Ra2, Ra3, Rb1, Rb2, Rb3, Rc1, Rc2, Rc3, Rd1, Rd2, Rd3, n) \
+	ADDEORW(Ra1, Ra2, R19, n, Ra3) \
+	ADDEORW(Rb1, Rb2, R20, n, Rb3) \
+	ADDEORW(Rc1, Rc2, R21, n, Rc3) \
+	ADDEORW(Rd1, Rd2, R22, n, Rd3)
+
+TEXT ·scrypt(SB), NOSPLIT, $8-16
+	MOVD V+8(FP), R1
+
+loop1:
+	MOVD X+0(FP), R0
+
+	FLDPQ 0(R0), (F0, F1)
+	FSTPQ (F0, F1), 0(R1)
+	FLDPQ 32(R0), (F2, F3)
+	FSTPQ (F2, F3), 32(R1)
+	FLDPQ 64(R0), (F4, F5)
+	FSTPQ (F4, F5), 64(R1)
+	FLDPQ 96(R0), (F6, F7)
+	FSTPQ (F6, F7), 96(R1)
+
+	MOVD R1, 8(RSP)
+	ADD $64, R0, R1
+	CALL eor_salsa8(SB)
+	MOVD X+0(FP), R1
+	ADD $64, R1, R0
+	CALL eor_salsa8(SB)
+
+	MOVD 8(RSP), R1
+	ADD $128, R1, R1
+	MOVD V+8(FP), R2
+	ADD $0x20000, R2, R2
+	CMP R1, R2
+	BNE loop1
+
+	MOVD $0, R1
+
+loop2:
+	MOVD R1, 8(RSP)
+	MOVD X+0(FP), R16
+	MOVD V+8(FP), R17
+	MOVWU 64(R16), R0
+	AND $1023, R0, R0
+	ADD R0<<7, R17, R17
+
+	BLK(EORP, 16, 0, R0, R1, R2, R3, R4, R5, R6, R7)
+	BLK(EORP, 16, 64, R0, R1, R2, R3, R4, R5, R6, R7)
+
+	MOVD R16, R0
+	ADD $64, R0, R1
+	CALL eor_salsa8(SB)
+	MOVD X+0(FP), R1
+	ADD $64, R1, R0
+	CALL eor_salsa8(SB)
+
+	MOVD 8(RSP), R1
+	ADD $1, R1, R1
+	CMP $1024, R1
+	BNE loop2
+
+	RET
+
+TEXT eor_salsa8(SB), NOSPLIT, $0
+	MOVD R0, R16
+	MOVD R1, R17
+
+	BLK(EORP, 16, 0, R0, R2, R4, R6, R8, R10, R12, R14)
+
+	LSR $32, R0, R1
+	LSR $32, R2, R3
+	LSR $32, R4, R5
+	LSR $32, R6, R7
+	LSR $32, R8, R9
+	LSR $32, R10, R11
+	LSR $32, R12, R13
+	LSR $32, R14, R15
+
+	MOVD $0, R17
+
+loop:
+	QRTRND(R0, R12, R4, R5, R1, R9, R10, R6, R14, R15, R11, R3, 25)
+	QRTRND(R4, R0, R8, R9, R5, R13, R14, R10, R2, R3, R15, R7, 23)
+	QRTRND(R8, R4, R12, R13, R9, R1, R2, R14, R6, R7, R3, R11, 19)
+	QRTRND(R12, R8, R0, R1, R13, R5, R6, R2, R10, R11, R7, R15, 14)
+	QRTRND(R0, R3, R1, R5, R4, R6, R10, R9, R11, R15, R14, R12, 25)
+	QRTRND(R1, R0, R2, R6, R5, R7, R11, R10, R8, R12, R15, R13, 23)
+	QRTRND(R2, R1, R3, R7, R6, R4, R8, R11, R9, R13, R12, R14, 19)
+	QRTRND(R3, R2, R0, R4, R7, R5, R9, R8, R10, R14, R13, R15, 14)
+
+	ADD $1, R17, R17
+	CMP $4, R17
+	BNE loop
+
+	BLK(ADDPW, 8, 0, R0, R1, R2, R3, R4, R5, R6, R7)
+	BLK(ADDPW, 8, 32, R8, R9, R10, R11, R12, R13, R14, R15)
+
+	RET