From a1dc90b0d78a62340c4bbfed04fc169c1c12ebdf Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy <mamy_github@numforge.co>
Date: Wed, 25 Dec 2024 19:09:42 +0100
Subject: [PATCH] keccak: BMI1 optimizations

---
 benchmarks/bench_h_keccak.nim                 |   1 +
 constantine/hashes/h_keccak.nim               | 210 ++++++++++--------
 constantine/hashes/keccak/keccak_generic.nim  |  65 +++++-
 constantine/hashes/keccak/keccak_x86_bmi1.nim |  60 +++++
 4 files changed, 239 insertions(+), 97 deletions(-)
 create mode 100644 constantine/hashes/keccak/keccak_x86_bmi1.nim

diff --git a/benchmarks/bench_h_keccak.nim b/benchmarks/bench_h_keccak.nim
index d0a0d466..d1d8aea6 100644
--- a/benchmarks/bench_h_keccak.nim
+++ b/benchmarks/bench_h_keccak.nim
@@ -84,5 +84,6 @@ when isMainModule:
       let iters = int(target_cycles div (s.int64 * worst_cycles_per_bytes))
       benchKeccak256_constantine(msg, $s & "B", iters)
       benchSHA3_256_openssl(msg, $s & "B", iters)
+      echo "----"
 
   main()
diff --git a/constantine/hashes/h_keccak.nim b/constantine/hashes/h_keccak.nim
index 6f90576f..a5f780da 100644
--- a/constantine/hashes/h_keccak.nim
+++ b/constantine/hashes/h_keccak.nim
@@ -10,6 +10,9 @@ import
   constantine/platforms/[abstractions, views],
   ./keccak/keccak_generic
 
+when UseASM_X86_32:
+  import ./keccak/keccak_x86_bmi1
+
 # Keccak, the hash function underlying SHA3
 # --------------------------------------------------------------------------------
 #
@@ -125,8 +128,110 @@ func init*(ctx: var KeccakContext) {.inline.} =
   ## Initialize or reinitialize a Keccak context
   ctx.reset()
 
-# debug
-import constantine/serialization/codecs
+template genAbsorb(isaFeatures: untyped) =
+  func `absorb _ isaFeatures`*(ctx: var KeccakContext, message: openArray[byte]) =
+    ## Absorb a message in the Keccak sponge state
+    ##
+    ## Security note: the tail of your message might be stored
+    ## in an internal buffer.
+    ## If sensitive content is used, ensure that
+    ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
+    ## Additionally ensure that the message(s) passed were stored
+    ## in memory considered secure for your threat model.
+
+    var pos = int ctx.absorb_offset # offset in Keccak state
+    var cur = 0                     # offset in message
+    var bytesLeft = message.len
+
+    # We follow the "absorb-permute-squeeze" approach
+    # originally defined by the Keccak team.
+    # It is compatible with the SHA-3 hash spec.
+    # See https://eprint.iacr.org/2022/1340.pdf
+    #
+    # There is no transition/permutation between squeezing -> absorbing.
+    # Within this `absorb` function,
+    #    the state pos == ctx.rate()
+    # is always followed by a permute and setting `pos = 0`
+
+    if (pos mod ctx.rate()) != 0 and pos+bytesLeft >= ctx.rate():
+      # A previous call left a partial block: top it up to a full rate, then permute
+      let free = ctx.rate() - pos
+      ctx.H.`xorInPartial _ isaFeatures`(pos, message.toOpenArray(0, free-1))
+      ctx.H.`permute _ isaFeatures`(NumRounds = 24)
+      pos = 0
+      cur = free
+      bytesLeft -= free
+
+    if bytesLeft >= ctx.rate():
+      # Process as many full rate-sized blocks as possible in one call
+      let numBlocks = bytesLeft div ctx.rate()
+      ctx.H.`hashMessageBlocks _ isaFeatures`(message.asUnchecked() +% cur, numBlocks)
+      cur += numBlocks * ctx.rate()
+      bytesLeft -= numBlocks * ctx.rate()
+
+    if bytesLeft != 0:
+      # Add the remaining tail (shorter than the rate) to the state, without permuting
+      ctx.H.`xorInPartial _ isaFeatures`(pos, message.toOpenArray(cur, cur+bytesLeft-1))
+
+    # Epilogue
+    ctx.absorb_offset = int32(pos+bytesLeft)
+    # Signal that the next squeeze transition needs a permute
+    ctx.squeeze_offset = int32 ctx.rate()
+
+genAbsorb(generic) # defines `absorb_generic`
+when UseASM_X86_32:
+  genAbsorb(x86_bmi1) # defines `absorb_x86_bmi1`
+
+template genSqueeze(isaFeatures: untyped) =
+  func `squeeze _ isaFeatures`*(ctx: var KeccakContext, digest: var openArray[byte]) =
+    ## Squeeze `digest.len` bytes out of the Keccak sponge state into `digest`.
+    ## May be called repeatedly: a partially read block resumes at `squeeze_offset`.
+    var pos = ctx.squeeze_offset # offset in Keccak state
+    var cur = 0                  # offset in digest
+    var bytesLeft = digest.len
+
+    if pos == ctx.rate():
+      # Transition from absorbing to squeezing.
+      #   This state can only come from the `absorb` function: within `squeeze`,
+      #   pos == ctx.rate() is always followed by a permute and pos = 0
+      ctx.H.pad(ctx.absorb_offset, ctx.delimiter, ctx.rate())
+      ctx.H.`permute _ isaFeatures`(NumRounds = 24)
+      pos = 0
+      ctx.absorb_offset = 0
+
+    if (pos mod ctx.rate()) != 0 and pos+bytesLeft >= ctx.rate():
+      # Previous partial squeeze, fill up to rate and do one permutation
+      let free = ctx.rate() - pos
+      ctx.H.`copyOutPartial _ isaFeatures`(hByteOffset = pos, digest.toOpenArray(0, free-1))
+      ctx.H.`permute _ isaFeatures`(NumRounds = 24)
+      pos = 0
+      ctx.absorb_offset = 0
+      cur = free
+      bytesLeft -= free
+
+    if bytesLeft >= ctx.rate():
+      # Process multiple blocks
+      let numBlocks = bytesLeft div ctx.rate()
+      ctx.H.`squeezeDigestBlocks _ isaFeatures`(digest.asUnchecked() +% cur, numBlocks)
+      ctx.absorb_offset = 0
+      cur += numBlocks * ctx.rate()
+      bytesLeft -= numBlocks * ctx.rate()
+
+    if bytesLeft != 0:
+      # Output the tail: the slice is [cur, cur+bytesLeft), same indexing as in `absorb`
+      ctx.H.`copyOutPartial _ isaFeatures`(hByteOffset = pos, digest.toOpenArray(cur, cur+bytesLeft-1))
+
+    # Epilogue: resume point in the state; pos != 0 after an earlier short squeeze
+    ctx.squeeze_offset = int32(pos+bytesLeft)
+    # We don't signal absorb_offset to permute the state if called next, as per
+    #   - original keccak spec that uses "absorb-permute-squeeze" protocol
+    #   - https://eprint.iacr.org/2022/1340.pdf
+    #   - https://eprint.iacr.org/2023/522.pdf
+    #     https://hackmd.io/@7dpNYqjKQGeYC7wMlPxHtQ/ByIbpfX9c#2-SAFE-definition
+
+genSqueeze(generic)
+when UseASM_X86_32:
+  genSqueeze(x86_bmi1)
 
 func absorb*(ctx: var KeccakContext, message: openArray[byte]) =
   ## Absorb a message in the Keccak sponge state
@@ -137,91 +242,22 @@ func absorb*(ctx: var KeccakContext, message: openArray[byte]) =
   ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
   ## Additionally ensure that the message(s) passed were stored
   ## in memory considered secure for your threat model.
-
-  var pos = int ctx.absorb_offset # offset in Keccak state
-  var cur = 0                     # offset in message
-  var bytesLeft = message.len
-
-  # We follow the "absorb-permute-squeeze" approach
-  # originally defined by the Keccak team.
-  # It is compatible with SHA-3 hash spec.
-  # See https://eprint.iacr.org/2022/1340.pdf
-  #
-  # There are no transition/permutation between squeezing -> absorbing
-  # And within this `absorb` function
-  #    the state pos == ctx.rate()
-  # is always followed by a permute and setting `pos = 0`
-
-  if (pos mod ctx.rate()) != 0 and pos+bytesLeft >= ctx.rate():
-    # Previous partial update, fill the state and do one permutation
-    let free = ctx.rate() - pos
-    ctx.H.xorInPartial(pos, message.toOpenArray(0, free-1))
-    ctx.H.permute_generic(NumRounds = 24)
-    pos = 0
-    cur = free
-    bytesLeft -= free
-
-  if bytesLeft >= ctx.rate():
-    # Process multiple blocks
-    let numBlocks = bytesLeft div ctx.rate()
-    ctx.H.hashMessageBlocks_generic(message.asUnchecked() +% cur, numBlocks)
-    cur += numBlocks * ctx.rate()
-    bytesLeft -= numBlocks * ctx.rate()
-
-  if bytesLeft != 0:
-    # Store the tail in buffer
-    ctx.H.xorInPartial(pos, message.toOpenArray(cur, cur+bytesLeft-1))
-
-  # Epilogue
-  ctx.absorb_offset = int32(pos+bytesLeft)
-  # Signal that the next squeeze transition needs a permute
-  ctx.squeeze_offset = int32 ctx.rate()
-
-func squeeze*(ctx: var KeccakContext, digest: var openArray[byte]) =
-  var pos = ctx.squeeze_offset # offset in Keccak state
-  var cur = 0                  # offset in message
-  var bytesLeft = digest.len
-
-  if pos == ctx.rate():
-    # Transition from absorbing to squeezing
-    #   This state can only come from `absorb` function
-    #   as within `squeeze`, pos == ctx.rate() is always followed
-    #   by a permute and pos = 0
-    ctx.H.pad(ctx.absorb_offset, ctx.delimiter, ctx.rate())
-    ctx.H.permute_generic(NumRounds = 24)
-    pos = 0
-    ctx.absorb_offset = 0
-
-  if (pos mod ctx.rate()) != 0 and pos+bytesLeft >= ctx.rate():
-    # Previous partial squeeze, fill up to rate and do one permutation
-    let free = ctx.rate() - pos
-    ctx.H.copyOutPartial(hByteOffset = pos, digest.toOpenArray(0, free-1))
-    ctx.H.permute_generic(NumRounds = 24)
-    pos = 0
-    ctx.absorb_offset = 0
-    cur = free
-    bytesLeft -= free
-
-  if bytesLeft >= ctx.rate():
-    # Process multiple blocks
-    let numBlocks = bytesLeft div ctx.rate()
-    ctx.H.squeezeDigestBlocks_generic(digest.asUnchecked() +% cur, numBlocks)
-    ctx.absorb_offset = 0
-    cur += numBlocks * ctx.rate()
-    bytesLeft -= numBlocks * ctx.rate()
-
-  if bytesLeft != 0:
-    # Output the tail
-    ctx.H.copyOutPartial(hByteOffset = pos, digest.toOpenArray(cur, bytesLeft-1))
-
-  # Epilogue
-  ctx.squeeze_offset = int32 bytesLeft
-  # We don't signal absorb_offset to permute the state if called next
-  # as per
-  #   - original keccak spec that uses "absorb-permute-squeeze" protocol
-  #   - https://eprint.iacr.org/2022/1340.pdf
-  #   - https://eprint.iacr.org/2023/522.pdf
-  #     https://hackmd.io/@7dpNYqjKQGeYC7wMlPxHtQ/ByIbpfX9c#2-SAFE-definition
+  when UseASM_X86_32: # the BMI1 variant only exists when UseASM_X86_32 is set at compile time
+    if ({.noSideEffect.}: hasBmi1()): # hasBmi1 is defined outside this view; presumably a runtime CPU check
+      ctx.absorb_x86_bmi1(message)
+    else:
+      ctx.absorb_generic(message)
+  else:
+    ctx.absorb_generic(message) # only the generic variant is compiled in
+
+func squeeze*(ctx: var KeccakContext, digest: var openArray[byte]) =
+  ## Squeeze `digest.len` bytes out of the Keccak sponge state into `digest`,
+  ## using the BMI1 build when it is compiled in and `hasBmi1()` is true.
+  when UseASM_X86_32:
+    if ({.noSideEffect.}: hasBmi1()):
+      ctx.squeeze_x86_bmi1(digest)
+      return
+  ctx.squeeze_generic(digest)
 
 func update*(ctx: var KeccakContext, message: openArray[byte]) =
   ## Append a message to a Keccak context
diff --git a/constantine/hashes/keccak/keccak_generic.nim b/constantine/hashes/keccak/keccak_generic.nim
index 2d60cc01..f36ac0dd 100644
--- a/constantine/hashes/keccak/keccak_generic.nim
+++ b/constantine/hashes/keccak/keccak_generic.nim
@@ -178,7 +178,12 @@ func genRho(): array[5*5, int] =
 func rotl(x: uint64, k: static int): uint64 {.inline.} =
   return (x shl k) or (x shr (64 - k))
 
-func permute_generic*(A: var KeccakState, NumRounds: static int) =
+func permute_impl*(A: var KeccakState, NumRounds: static int) {.inline.} =
+  ## Implementation of Keccak permutation
+  ## Tagged inline so it's copied in:
+  ## - keccak_generic.nim
+  ## - keccak_x86_bmi1.nim
+  ## and uses CPU features such as SIMD or andnot instructions
   # We use algorithm 4 in https://keccak.team/files/Keccak-implementation-3.2.pdf
   const Rho = genRho()
 
@@ -219,6 +224,9 @@ func permute_generic*(A: var KeccakState, NumRounds: static int) =
       # ι step: break symmetries
       A[0, 0] = A[0, 0] xor KRC[i+j]
 
+func permute_generic*(A: var KeccakState, NumRounds: static int) =
+  A.permute_impl(NumRounds) # non-inline entry point over the shared inline implementation
+
 template `^=`(accum: var SomeInteger, b: SomeInteger) =
   accum = accum xor b
 
@@ -235,7 +243,7 @@ func xorInSingle(H: var KeccakState, hByteOffset: int, val: byte) {.inline.} =
   let lane = uint64(val) shl slot # All bits but the one set in `val` are 0, and 0 is neutral element of xor
   H.state[hByteOffset shr 3] ^= lane
 
-func xorInBlock_generic(H: var KeccakState, msg: array[200 - 2*32, byte]) {.inline.} =
+func xorInBlock(H: var KeccakState, msg: array[200 - 2*32, byte]) {.inline.} =
   ## Add new data into the Keccak state
   # This can benefit from vectorized instructions
   for i in 0 ..< msg.len div 8:
@@ -275,7 +283,7 @@ func copyOutPartialWord(
     dst[i] = toByte(lane)
     lane = lane shr sizeof(T)
 
-func xorInPartial*(H: var KeccakState, hByteOffset: int, msg: openArray[byte]) =
+func xorInPartial_impl*(H: var KeccakState, hByteOffset: int, msg: openArray[byte]) {.inline.} =
   ## Add multiple bytes to the state
   ## The hByteOffset+length MUST be less than the state length.
   debug: doAssert hByteOffset + msg.len <= sizeof(H.state)
@@ -317,7 +325,12 @@ func xorInPartial*(H: var KeccakState, hByteOffset: int, msg: openArray[byte]) =
     # Store the tail in buffer
     H.xorInPartialWord(pos, msg.toOpenArray(cur, cur+bytesLeft-1))
 
-func copyOutPartial*(
+func xorInPartial_generic*(H: var KeccakState, hByteOffset: int, msg: openArray[byte]) =
+  ## Add the bytes of `msg` to the state, starting at state byte `hByteOffset`.
+  ## `hByteOffset + msg.len` MUST NOT exceed the state size.
+  H.xorInPartial_impl(hByteOffset, msg)
+
+func copyOutPartial_impl*(
       H: KeccakState,
       hByteOffset: int,
       dst: var openArray[byte]) {.inline.} =
@@ -364,15 +377,25 @@ func copyOutPartial*(
     # Store the tail in buffer
     H.copyOutPartialWord(pos, dst.toOpenArray(cur, cur+bytesLeft-1))
 
+func copyOutPartial_generic*(
+      H: KeccakState,
+      hByteOffset: int,
+      dst: var openArray[byte]) =
+  ## Copy `dst.len` bytes of the Keccak state into `dst`,
+  ## reading from state byte offset `hByteOffset` onwards.
+  ## Precondition: hByteOffset + dst.len
+  ## MUST NOT exceed the Keccak rate.
+  H.copyOutPartial_impl(hByteOffset, dst)
+
 func pad*(H: var KeccakState, hByteOffset: int, delim: static byte, rate: static int) {.inline.} =
   debug: doAssert hByteOffset < rate
   H.xorInSingle(hByteOffset, delim)
   H.xorInSingle(hByteOffset = rate-1, 0x80)
 
-func hashMessageBlocks_generic*(
+func hashMessageBlocks_impl*(
       H: var KeccakState,
       message: ptr UncheckedArray[byte],
-      numBlocks: int) =
+      numBlocks: int) {.inline.} =
   ## Hash a message block by block
   ## Keccak block size is the rate: 64
   ## The state MUST be absorb ready
@@ -384,11 +407,22 @@ func hashMessageBlocks_generic*(
   const numRounds = 24    # TODO: auto derive number of rounds
   for _ in 0 ..< numBlocks:
     let msg = cast[ptr array[rate, byte]](message)
-    H.xorInBlock_generic(msg[])
-    H.permute_generic(numRounds)
+    H.xorInBlock(msg[])
+    H.permute_impl(numRounds)
     message +%= rate
 
-func squeezeDigestBlocks_generic*(
+func hashMessageBlocks_generic*(
+      H: var KeccakState,
+      message: ptr UncheckedArray[byte],
+      numBlocks: int) =
+  ## Absorb `numBlocks` consecutive full blocks read from `message`;
+  ## each block is added into the state, then the state is permuted.
+  ## A block is `rate` bytes long.
+  ## The state MUST be absorb-ready: the previous operation cannot be
+  ## a squeeze unless a permutation happened in-between.
+  H.hashMessageBlocks_impl(message, numBlocks)
+
+func squeezeDigestBlocks_impl*(
       H: var KeccakState,
       digest: ptr UncheckedArray[byte],
-      numBlocks: int) =
+      numBlocks: int) {.inline.} =
@@ -404,4 +438,15 @@ func squeezeDigestBlocks_generic*(
     let msg = cast[ptr array[rate, byte]](digest)
     H.copyOutWords(msg[])
-    H.permute_generic(numRounds)
-    digest +%= rate
\ No newline at end of file
+    H.permute_impl(numRounds) # inline impl, so each ISA-specific wrapper module compiles its own copy
+    digest +%= rate
+
+func squeezeDigestBlocks_generic*(
+      H: var KeccakState,
+      digest: ptr UncheckedArray[byte],
+      numBlocks: int) =
+  ## Squeeze `numBlocks` consecutive full blocks into `digest`;
+  ## each block is copied out of the state, then the state is permuted.
+  ## A block is `rate` bytes long.
+  ## The state MUST be squeeze-ready: the previous operation cannot be
+  ## an absorb unless a permutation happened in-between.
+  H.squeezeDigestBlocks_impl(digest, numBlocks)
\ No newline at end of file
diff --git a/constantine/hashes/keccak/keccak_x86_bmi1.nim b/constantine/hashes/keccak/keccak_x86_bmi1.nim
new file mode 100644
index 00000000..3140a8df
--- /dev/null
+++ b/constantine/hashes/keccak/keccak_x86_bmi1.nim
@@ -0,0 +1,60 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import ./keccak_generic
+
+# Notes:
+# - AVX2 makes things **slower**
+# - BMI2 makes the compiler use RORX everywhere
+#   but
+#   - hardware already has instruction-level parallelism (ILP)
+#     when modified flags are not consumed by next instructions
+#   - compiler generates RORX everywhere even when self-rotating a register
+#     and the instruction is 2x bigger than ROL/ROR so it hurts instruction cache.
+#   - benchmarks appear to be the same
+{.localpassC:"-mbmi".}
+
+func permute_x86_bmi1*(A: var KeccakState, NumRounds: static int) =
+  A.permute_impl(NumRounds) # inline impl is copied into this module, which is built with -mbmi
+
+func xorInPartial_x86_bmi1*(H: var KeccakState, hByteOffset: int, msg: openArray[byte]) =
+  ## Add the bytes of `msg` to the state, starting at state byte `hByteOffset`.
+  ## `hByteOffset + msg.len` MUST NOT exceed the state size.
+  H.xorInPartial_impl(hByteOffset, msg)
+
+func copyOutPartial_x86_bmi1*(
+      H: KeccakState,
+      hByteOffset: int,
+      dst: var openArray[byte]) =
+  ## Copy `dst.len` bytes of the Keccak state into `dst`,
+  ## reading from state byte offset `hByteOffset` onwards.
+  ## hByteOffset + dst.len MUST NOT exceed the Keccak rate.
+  ## Not `{.inline.}`, like the other wrappers of this `-mbmi` module.
+  H.copyOutPartial_impl(hByteOffset, dst)
+
+func hashMessageBlocks_x86_bmi1*(
+      H: var KeccakState,
+      message: ptr UncheckedArray[byte],
+      numBlocks: int) =
+  ## Absorb `numBlocks` consecutive full blocks read from `message`,
+  ## through the shared implementation from `keccak_generic`.
+  ## A block is one Keccak rate long.
+  ## The state MUST be absorb-ready: the previous operation cannot be
+  ## a squeeze unless a permutation happened in-between.
+  H.hashMessageBlocks_impl(message, numBlocks)
+
+func squeezeDigestBlocks_x86_bmi1*(
+      H: var KeccakState,
+      digest: ptr UncheckedArray[byte],
+      numBlocks: int) =
+  ## Squeeze `numBlocks` consecutive full blocks into `digest`,
+  ## through the shared implementation from `keccak_generic`.
+  ## A block is one Keccak rate long.
+  ## The state MUST be squeeze-ready: the previous operation cannot be
+  ## an absorb unless a permutation happened in-between.
+  H.squeezeDigestBlocks_impl(digest, numBlocks)
\ No newline at end of file