Skip to content

Commit

Permalink
Scrypt ARM64 assembly
Browse files Browse the repository at this point in the history
20% faster than C on mobile
  • Loading branch information
hectorchu committed Feb 28, 2025
1 parent 79182db commit c6da236
Show file tree
Hide file tree
Showing 4 changed files with 152 additions and 102 deletions.
91 changes: 0 additions & 91 deletions ltcutil/scrypt/scrypt.c

This file was deleted.

16 changes: 5 additions & 11 deletions ltcutil/scrypt/scrypt.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,10 @@
package scrypt

// void scrypt_aux(unsigned char*);
import "C"
//go:build !arm64

import (
"crypto/sha256"
package scrypt

"golang.org/x/crypto/pbkdf2"
)
import scrypt2 "golang.org/x/crypto/scrypt"

// Scrypt returns a 32-byte digest of x, using x as both the password and the
// salt.
// NOTE(review): this span is a rendered diff, not a single function body. The
// pbkdf2.Key / C.scrypt_aux lines are the removed cgo implementation and the
// scrypt2.Key lines are the replacement.
func Scrypt(x []byte) []byte {
X := pbkdf2.Key(x, x, 1, 128, sha256.New)
C.scrypt_aux((*C.uchar)(&X[0]))
return pbkdf2.Key(x, X, 1, 32, sha256.New)
// The error from Key is discarded. Presumably it can only be non-nil for
// invalid cost parameters, which are constants here (1024, 1, 1) — TODO confirm.
x, _ = scrypt2.Key(x, x, 1024, 1, 1, 32)
return x
}
24 changes: 24 additions & 0 deletions ltcutil/scrypt/scrypt_arm64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package scrypt

import (
"crypto/sha256"
"sync"

"golang.org/x/crypto/pbkdf2"
)

// scratch is the working memory handed to the assembly routine: 1024 entries
// of 32 uint32 words (128 bytes each, 0x20000 bytes in total).
type scratch [1024][32]uint32

// pool recycles scratch buffers between Scrypt calls so that each call does
// not have to allocate a fresh 128 KiB array.
var pool = sync.Pool{
	New: func() interface{} { return new(scratch) },
}

// Scrypt hashes x, using x as both the PBKDF2 password and salt.
//
// A single PBKDF2-HMAC-SHA256 iteration expands x into a 128-byte block, the
// assembly routine scrypt mixes that block in place using a pooled scratch
// buffer, and a second single-iteration PBKDF2 keyed by x and salted with the
// mixed block produces the 32-byte result.
func Scrypt(x []byte) []byte {
	block := pbkdf2.Key(x, x, 1, 128, sha256.New)

	// Borrow the scratch memory only for the duration of the mixing step.
	buf := pool.Get().(*scratch)
	scrypt(&block[0], buf)
	pool.Put(buf)

	return pbkdf2.Key(x, block, 1, 32, sha256.New)
}

// scrypt is implemented in assembly (TEXT ·scrypt in scrypt_arm64.s). It mixes
// the 128-byte block starting at X in place, using V as working memory. The
// routine reads and writes 128 bytes from X and overwrites all of V, so X must
// point at a buffer of at least 128 bytes.
func scrypt(X *byte, V *scratch)
123 changes: 123 additions & 0 deletions ltcutil/scrypt/scrypt_arm64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#include "textflag.h"

// EORPW XORs the pair of 32-bit words at offset n from Ra2 into the pair at
// offset n from Ra1. The Ra1 pair is loaded into Rb1/Rb2 and the Ra2 pair into
// Rc1/Rc2; the XOR is computed into Rc1/Rc2 and stored back to n(Ra1).
// The XORed words remain in Rc1/Rc2 afterwards; Rb1/Rb2 are scratch.
#define EORPW(n, Ra1, Ra2, Rb1, Rb2, Rc1, Rc2) \
LDPW n(Ra1), (Rb1, Rb2) \
LDPW n(Ra2), (Rc1, Rc2) \
EORW Rb1, Rc1, Rc1 \
EORW Rb2, Rc2, Rc2 \
STPW (Rc1, Rc2), n(Ra1)

// EORBLK XORs the 64-byte block at n(R17) into the 64-byte block at n(R16),
// eight bytes per EORPW. On exit R0-R15 hold the sixteen resulting 32-bit
// words in memory order, and R19-R26 have been overwritten as scratch.
#define EORBLK(n) \
EORPW(0x00+n, R16, R17, R19, R20, R0, R1) \
EORPW(0x08+n, R16, R17, R21, R22, R2, R3) \
EORPW(0x10+n, R16, R17, R23, R24, R4, R5) \
EORPW(0x18+n, R16, R17, R25, R26, R6, R7) \
EORPW(0x20+n, R16, R17, R19, R20, R8, R9) \
EORPW(0x28+n, R16, R17, R21, R22, R10, R11) \
EORPW(0x30+n, R16, R17, R23, R24, R12, R13) \
EORPW(0x38+n, R16, R17, R25, R26, R14, R15)

// ADDPW loads the pair of 32-bit words at n(Ra) into Rb1/Rb2, adds them to
// Rc1/Rc2 (32-bit addition), and stores the sums back to n(Ra). The sums
// remain in Rc1/Rc2; Rb1/Rb2 are scratch.
#define ADDPW(n, Ra, Rb1, Rb2, Rc1, Rc2) \
LDPW n(Ra), (Rb1, Rb2) \
ADDW Rb1, Rc1, Rc1 \
ADDW Rb2, Rc2, Rc2 \
STPW (Rc1, Rc2), n(Ra)

// ADDEORW computes Rd ^= rotate_right32(Ra + Rb, n), using Rc as the temporary
// for the sum. A 32-bit rotate right by n is the same as a rotate left by 32-n.
#define ADDEORW(Ra, Rb, Rc, n, Rd) \
ADDW Ra, Rb, Rc \
EORW Rc@>n, Rd, Rd

// QUARTRND performs four ADDEORW steps that share the rotation amount n, one
// per operand triple (x1, x2, x3): x3 ^= rotate_right32(x1 + x2, n).
// R19-R22 are used as the four temporaries, so they must not hold live state.
#define QUARTRND(Ra1, Ra2, Ra3, Rb1, Rb2, Rb3, Rc1, Rc2, Rc3, Rd1, Rd2, Rd3, n) \
ADDEORW(Ra1, Ra2, R19, n, Ra3) \
ADDEORW(Rb1, Rb2, R20, n, Rb3) \
ADDEORW(Rc1, Rc2, R21, n, Rc3) \
ADDEORW(Rd1, Rd2, R22, n, Rd3)

// func scrypt(X *byte, V *scratch)
//
// Mixes the 128-byte block at X in place, using V as working memory.
// Phase 1 (loop1) stores the current block into successive 128-byte entries
// of V, mixing X after each store, until V+0x20000 is reached (1024 entries).
// Phase 2 (loop2) runs 1024 times: it picks entry j = (32-bit word at X+64)
// & 1023, XORs V[j] into X, and mixes X again.
// "Mixing" is two eor_salsa8 calls: first with destination X and source X+64,
// then with destination X+64 and source X.
// The stack slot 8(RSP) keeps the V cursor (loop1) or the iteration counter
// (loop2) across the calls, since eor_salsa8 overwrites R0-R17 and R19-R26.
TEXT ·scrypt(SB), NOSPLIT, $8-16
// R1 is the write cursor into V.
MOVD V+8(FP), R1

loop1:
MOVD X+0(FP), R0

// Copy the 128-byte block at X to the current V entry, 32 bytes per pair.
FLDPQ 0(R0), (F0, F1)
FSTPQ (F0, F1), 0(R1)
FLDPQ 32(R0), (F2, F3)
FSTPQ (F2, F3), 32(R1)
FLDPQ 64(R0), (F4, F5)
FSTPQ (F4, F5), 64(R1)
FLDPQ 96(R0), (F6, F7)
FSTPQ (F6, F7), 96(R1)

// Spill the V cursor, then mix: destination X (R0), source X+64 (R1).
MOVD R1, 8(RSP)
ADD $64, R0, R1
CALL eor_salsa8(SB)
// Second half: destination X+64 (R0), source X (R1).
MOVD X+0(FP), R1
ADD $64, R1, R0
CALL eor_salsa8(SB)

// Advance the cursor by one 128-byte entry and stop at V+0x20000.
MOVD 8(RSP), R1
ADD $128, R1, R1
MOVD V+8(FP), R2
ADD $0x20000, R2, R2
CMP R1, R2
BNE loop1

// R1 becomes the iteration counter for the second phase.
MOVD $0, R1

loop2:
MOVD R1, 8(RSP)
MOVD X+0(FP), R16
MOVD V+8(FP), R17
// Select entry j from the first word of the second 64-byte half of X:
// j = word & 1023, R17 = V + j*128.
MOVWU 64(R16), R0
AND $1023, R0, R0
ADD R0<<7, R17, R17

// X ^= V[j], done as two 64-byte halves.
EORBLK(0)
EORBLK(64)

// Mix X with the same two calls as in loop1.
MOVD R16, R0
ADD $64, R0, R1
CALL eor_salsa8(SB)
MOVD X+0(FP), R1
ADD $64, R1, R0
CALL eor_salsa8(SB)

// Reload and bump the counter; run 1024 iterations in total.
MOVD 8(RSP), R1
ADD $1, R1, R1
CMP $1024, R1
BNE loop2

RET

// eor_salsa8 is an internal helper called with register arguments:
//   R0 = address of the 64-byte destination block
//   R1 = address of the 64-byte source block
// It XORs the source into the destination, runs the eight QUARTRND steps below
// four times over the sixteen resulting words held in R0-R15, and then adds
// the final register state word-wise into the destination block in memory.
// The rotations 25/23/19/14 to the right equal rotations 7/9/13/18 to the left.
// Overwrites R0-R17 and R19-R26. It declares no arguments or frame, so it is
// presumably intended to be called only from assembly in this file.
TEXT eor_salsa8(SB), NOSPLIT, $0
// Free R0/R1 for state words: R16 = destination, R17 = source.
MOVD R0, R16
MOVD R1, R17

// Destination ^= source; R0-R15 now hold the sixteen state words.
EORBLK(0)

// The source pointer is no longer needed; reuse R17 as the loop counter.
MOVD $0, R17

loop:
// First four steps work down the columns of the 4x4 word matrix.
QUARTRND(R0, R12, R4, R5, R1, R9, R10, R6, R14, R15, R11, R3, 25)
QUARTRND(R4, R0, R8, R9, R5, R13, R14, R10, R2, R3, R15, R7, 23)
QUARTRND(R8, R4, R12, R13, R9, R1, R2, R14, R6, R7, R3, R11, 19)
QUARTRND(R12, R8, R0, R1, R13, R5, R6, R2, R10, R11, R7, R15, 14)
// Last four steps work along the rows.
QUARTRND(R0, R3, R1, R5, R4, R6, R10, R9, R11, R15, R14, R12, 25)
QUARTRND(R1, R0, R2, R6, R5, R7, R11, R10, R8, R12, R15, R13, 23)
QUARTRND(R2, R1, R3, R7, R6, R4, R8, R11, R9, R13, R12, R14, 19)
QUARTRND(R3, R2, R0, R4, R7, R5, R9, R8, R10, R14, R13, R15, 14)

// Four passes of the eight steps above.
ADD $1, R17, R17
CMP $4, R17
BNE loop

// Feed-forward: destination[i] += state[i] for all sixteen words. The
// destination still holds the XORed input written by EORBLK.
ADDPW(0x00, R16, R19, R20, R0, R1)
ADDPW(0x08, R16, R21, R22, R2, R3)
ADDPW(0x10, R16, R23, R24, R4, R5)
ADDPW(0x18, R16, R25, R26, R6, R7)
ADDPW(0x20, R16, R19, R20, R8, R9)
ADDPW(0x28, R16, R21, R22, R10, R11)
ADDPW(0x30, R16, R23, R24, R12, R13)
ADDPW(0x38, R16, R25, R26, R14, R15)

RET

0 comments on commit c6da236

Please sign in to comment.