From 06bca112a2577f5bb6dd25234e2e20127527b5b4 Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy <mamy_github@numforge.co>
Date: Sun, 22 Dec 2024 14:46:50 +0100
Subject: [PATCH] keccak: initial implementation of keccak256 and sha3-256
 [skip ci]

---
 constantine/ciphers/chacha20.nim             |  10 +-
 constantine/hashes/h_keccak.nim              | 275 ++++++++++++++++
 constantine/hashes/h_sha256.nim              |   4 +-
 constantine/hashes/keccak/keccak_generic.nim | 330 +++++++++++++++++++
 constantine/hashes/sha256/sha256_generic.nim |  21 +-
 constantine/serialization/endians.nim        |  89 ++---
 6 files changed, 668 insertions(+), 61 deletions(-)
 create mode 100644 constantine/hashes/h_keccak.nim
 create mode 100644 constantine/hashes/keccak/keccak_generic.nim

diff --git a/constantine/ciphers/chacha20.nim b/constantine/ciphers/chacha20.nim
index 42600799b..e8d487da7 100644
--- a/constantine/ciphers/chacha20.nim
+++ b/constantine/ciphers/chacha20.nim
@@ -106,12 +106,14 @@ func chacha20_cipher*(
   var keyU{.noInit.}: array[8, uint32]
   var nonceU{.noInit.}: array[3, uint32]
 
-  var pos = 0'u
+  var pos = 0
   for i in 0 ..< 8:
-    keyU[i].parseFromBlob(key, pos, littleEndian)
-  pos = 0'u
+    keyU[i] = uint32.fromBytes(key, pos, littleEndian)
+    pos += sizeof(uint32)
+  pos = 0
   for i in 0 ..< 3:
-    nonceU[i].parseFromBlob(nonce, pos, littleEndian)
+    nonceU[i] = uint32.fromBytes(nonce, pos, littleEndian)
+    pos += sizeof(uint32)
 
   var counter = counter
   var eaten = 0
diff --git a/constantine/hashes/h_keccak.nim b/constantine/hashes/h_keccak.nim
new file mode 100644
index 000000000..3c3312cc4
--- /dev/null
+++ b/constantine/hashes/h_keccak.nim
@@ -0,0 +1,275 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import ../zoo_exports
+
+import
+  ../platforms/[abstractions, views],
+  ./keccak/keccak_generic
+
+# Keccak, the hash function underlying SHA3
+# --------------------------------------------------------------------------------
+#
+# References:
+# - https://keccak.team/keccak_specs_summary.html
+# - https://keccak.team/files/Keccak-reference-3.0.pdf
+# - https://keccak.team/files/Keccak-implementation-3.2.pdf
+# - SHA3 (different padding): https://csrc.nist.gov/publications/detail/fips/202/final
+#   - https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+
+# Sponge API
+# --------------------------------------------------------------------------------
+#
+# References:
+# - https://keccak.team/keccak_specs_summary.html
+# - https://keccak.team/files/SpongeFunctions.pdf
+# - https://keccak.team/files/CSF-0.1.pdf
+#
+# Keccak[r,c](Mbytes || Mbits) {
+#   # Padding
+#   d = 2^|Mbits| + sum for i=0..|Mbits|-1 of 2^i*Mbits[i]
+#   P = Mbytes || d || 0x00 || … || 0x00
+#   P = P xor (0x00 || … || 0x00 || 0x80)
+#
+#   # Initialization
+#   S[x,y] = 0,                               for (x,y) in (0…4,0…4)
+#
+#   # Absorbing phase
+#   for each block Pi in P
+#     S[x,y] = S[x,y] xor Pi[x+5*y],          for (x,y) such that x+5*y < r/w
+#     S = Keccak-f[r+c](S)
+#
+#   # Squeezing phase
+#   Z = empty string
+#   while output is requested
+#     Z = Z || S[x,y],                        for (x,y) such that x+5*y < r/w
+#     S = Keccak-f[r+c](S)
+#
+#   return Z
+# }
+
+# Duplex construction
+# --------------------------------------------------------
+# - https://keccak.team/sponge_duplex.html
+#   - https://keccak.team/files/SpongeDuplex.pdf
+#   - https://eprint.iacr.org/2011/499.pdf: Duplexing the Sponge
+# - https://eprint.iacr.org/2023/522.pdf: SAFE - Sponge API for Field Element
+#   - https://hackmd.io/@7dpNYqjKQGeYC7wMlPxHtQ/ByIbpfX9c
+#
+# The original duplex construction described by the Keccak team
+# is "absorb-permute-squeeze"
+# Paper https://eprint.iacr.org/2022/1340.pdf
+# goes over other approaches.
+#
+# We follow the original intent:
+# - permute required when transitioning between absorb->squeeze
+# - no permute required when transitioning between squeeze->absorb
+# This may change depending on protocol requirement.
+# This is in line with the SAFE (Sponge API for Field Elements) approach
+
+# Types and constants
+# ----------------------------------------------------------------
+
+type
+  KeccakContext*[bits: static int, delimiter: static byte] = object
+
+    # Context description
+    # - `H` is the permutation state, it is updated only
+    #   prior to a permutation
+    # - `buf` buffers one partial message block, hence it is `rate` = 200 - 2*(bits div 8) bytes long
+    # - `absorb_offset` tracks how filled the message buffer is
+    # - `squeeze_offset` tracks how many bytes of the current state block were already squeezed out
+    #
+    # Subtleties:
+    #   Duplex construction requires a state permutation when
+    #   transitioning from the absorbing to the squeezing phase.
+    #   After an absorb, `squeeze_offset` is set to the sponge `rate`.
+    #   This signals the need for a pad + permutation before the next squeeze.
+    #   No permutation is signaled for the squeeze -> absorb transition,
+    #   `absorb_offset` is simply reset to 0 by `squeeze`
+    #   (see the SAFE references above).
+
+    H {.align: 64.}: KeccakState
+    buf {.align: 64.}: array[200 - 2*(bits div 8), byte]
+    absorb_offset: int32
+    squeeze_offset: int32
+
+  keccak256* = KeccakContext[256, 0x01]
+  sha3_256* = KeccakContext[256, 0x06]
+
+template rate(ctx: KeccakContext): int =
+  200 - 2*(ctx.bits div 8)
+
+# Internals
+# ----------------------------------------------------------------
+
+# No exceptions allowed in core cryptographic operations
+{.push raises: [].}
+{.push checks: off.}
+
+func absorbBuffer(ctx: var KeccakContext) {.inline.} =
+  ctx.H.hashMessageBlocks_generic(ctx.buf.asUnchecked(), numBlocks = 1)
+  ctx.buf.setZero()
+  # Note: `absorb_offset` is deliberately not reset to 0 here, the caller updates it.
+  # In certain cases like authenticated encryption
+  # we might want to absorb at the same position that has just been squeezed.
+  # The buf is zeroed, and 0 is the neutral element for xor.
+
+# Public API
+# ----------------------------------------------------------------
+
+template digestSize*(H: type KeccakContext): int =
+  ## Returns the output size in bytes
+  KeccakContext.bits shr 3
+
+template internalBlockSize*(H: type KeccakContext): int =
+  ## Returns the byte size of the blocks ingested by the hash function, i.e. the sponge rate: 200 state bytes minus 2*digestSize capacity bytes
+  200 - 2 * (KeccakContext.bits shr 3)
+
+func init*(ctx: var KeccakContext) {.inline.} =
+  ## Initialize or reinitialize a Keccak context
+  ctx.reset()
+
+func absorb*(ctx: var KeccakContext, message: openArray[byte]) =
+  ## Absorb a message in the Keccak sponge state
+  ##
+  ## Security note: the tail of your message might be stored
+  ## in an internal buffer.
+  ## if sensitive content is used, ensure that
+  ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
+  ## Additionally ensure that the message(s) passed were stored
+  ## in memory considered secure for your threat model.
+
+  if message.len == 0:
+    return
+
+  var pos = int ctx.absorb_offset
+  var cur = 0
+  var bytesLeft = message.len
+
+  # We follow the "absorb-permute-squeeze" approach
+  # originally defined by the Keccak team.
+  # It is compatible with SHA-3 hash spec.
+  # See https://eprint.iacr.org/2022/1340.pdf
+  #
+  # There are no transition/permutation between squeezing -> absorbing
+  # And within this `absorb` function
+  #    the state pos == ctx.rate()
+  # is always followed by a permute and setting `pos = 0`
+
+  if (pos mod ctx.rate()) != 0 and pos+bytesLeft >= ctx.rate():
+    # Previous partial update, fill the buffer up to the rate and absorb it
+    let free = ctx.rate() - pos
+    ctx.buf.rawCopy(dStart = pos, message, sStart = 0, len = free)
+    ctx.absorbBuffer()
+    pos = 0
+    cur = free
+    bytesLeft -= free
+
+  if bytesLeft >= ctx.rate():
+    # Process multiple blocks, straight from the message (here pos == 0)
+    let numBlocks = bytesLeft div ctx.rate()
+    ctx.H.hashMessageBlocks_generic(message.asUnchecked() +% cur, numBlocks)
+    cur += numBlocks * ctx.rate()
+    bytesLeft -= numBlocks * ctx.rate()
+
+  if bytesLeft != 0:
+    # Store the tail in buffer, after any bytes kept from a previous call
+    ctx.buf.rawCopy(dStart = pos, message, sStart = cur, len = bytesLeft)
+
+  # Epilogue: the buffer holds the `pos` bytes of earlier calls plus this tail
+  ctx.absorb_offset = int32(pos + bytesLeft)
+  # Signal that the next squeeze transition needs a permute
+  ctx.squeeze_offset = int32 ctx.rate()
+
+func squeeze*(ctx: var KeccakContext, digest: var openArray[byte]) =
+  ## Squeeze `digest.len` bytes out of the sponge into `digest`, padding and permuting first if the previous operation was an absorb
+  if digest.len == 0: return
+
+  var pos = int ctx.squeeze_offset
+  var cur = 0
+  var bytesLeft = digest.len
+
+  if pos == ctx.rate():
+    # Transition from absorbing to squeezing
+    #   This state can only come from `absorb` function
+    #   as within `squeeze`, pos == ctx.rate() is always followed
+    #   by a permute and pos = 0
+    ctx.H.xorInPartial(ctx.buf.toOpenArray(0, ctx.absorb_offset-1))
+    ctx.H.pad(ctx.absorb_offset, ctx.delimiter, ctx.rate())
+    ctx.H.permute_generic(NumRounds = 24)
+    pos = 0
+    ctx.absorb_offset = 0
+
+  if (pos mod ctx.rate()) != 0 and pos+bytesLeft >= ctx.rate():
+    # Previous partial squeeze, output up to the rate and do one permutation
+    let free = ctx.rate() - pos
+    ctx.H.copyOutPartial(hByteOffset = pos, digest.toOpenArray(0, free-1))
+    ctx.H.permute_generic(NumRounds = 24)
+    pos = 0
+    ctx.absorb_offset = 0
+    cur = free
+    bytesLeft -= free
+
+  if bytesLeft >= ctx.rate():
+    # Process multiple blocks (here pos == 0)
+    let numBlocks = bytesLeft div ctx.rate()
+    ctx.H.squeezeDigestBlocks_generic(digest.asUnchecked() +% cur, numBlocks)
+    ctx.absorb_offset = 0
+    cur += numBlocks * ctx.rate()
+    bytesLeft -= numBlocks * ctx.rate()
+
+  if bytesLeft != 0:
+    # Output the tail: `bytesLeft` bytes written at digest[cur ..< cur+bytesLeft]
+    ctx.H.copyOutPartial(hByteOffset = pos, digest.toOpenArray(cur, cur+bytesLeft-1))
+
+  # Epilogue: `pos` bytes of the current block were consumed by earlier calls
+  ctx.squeeze_offset = int32(pos + bytesLeft)
+  # We don't signal absorb_offset to permute the state if called next
+  # as per https://eprint.iacr.org/2023/522.pdf
+  #   https://hackmd.io/@7dpNYqjKQGeYC7wMlPxHtQ/ByIbpfX9c#2-SAFE-definition
+
+func update*(ctx: var KeccakContext, message: openArray[byte]) =
+  ## Append a message to a Keccak context
+  ## for incremental Keccak computation
+  ##
+  ## Security note: the tail of your message might be stored
+  ## in an internal buffer.
+  ## if sensitive content is used, ensure that
+  ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
+  ## Additionally ensure that the message(s) passed was(were) stored
+  ## in memory considered secure for your threat model.
+  ctx.absorb(message)
+
+func finish*[N: static int](ctx: var KeccakContext, digest: var array[N, byte]) =
+  ## Finalize a Keccak computation and output the
+  ## message digest to the `digest` buffer
+  ##
+  ## Security note: this does not clear the internal buffer.
+  ## if sensitive content is used, use "ctx.clear()"
+  ## and also make sure that the message(s) passed were stored
+  ## in memory considered secure for your threat model.
+  ctx.squeeze(digest)
+
+func clear*(ctx: var KeccakContext) =
+  ## Clear the context internal buffers
+  # TODO: ensure compiler cannot optimize the code away
+  ctx.reset()
+
+when isMainModule:
+  import constantine/serialization/codecs
+
+  var msg: array[32, byte]
+  var digest: array[32, byte]
+  var ctx: keccak256
+
+  ctx.init()
+  ctx.update(msg)
+  ctx.finish(digest)
+
+  echo digest.toHex()
\ No newline at end of file
diff --git a/constantine/hashes/h_sha256.nim b/constantine/hashes/h_sha256.nim
index 0f8b85807..ccd46da86 100644
--- a/constantine/hashes/h_sha256.nim
+++ b/constantine/hashes/h_sha256.nim
@@ -33,7 +33,7 @@ when UseASM_X86_32:
 
 type
   Sha256Context* = object
-    ## Align to 64 for cache line and SIMD friendliness
+    # Align to 64 for cache line and SIMD friendliness
     s{.align: 64}: Sha256_state
     buf{.align: 64}: array[BlockSize, byte]
     msgLen: uint64
@@ -130,7 +130,7 @@ func update*(ctx: var Sha256Context, message: openarray[byte]) {.libPrefix: pref
   ## in an internal buffer.
   ## if sensitive content is used, ensure that
   ## `ctx.finish(...)` and `ctx.clear()` are called as soon as possible.
-  ## Additionally ensure that the message(s) passed were stored
+  ## Additionally ensure that the message(s) passed was(were) stored
   ## in memory considered secure for your threat model.
   ##
   ## For passwords and secret keys, you MUST NOT use raw SHA-256
diff --git a/constantine/hashes/keccak/keccak_generic.nim b/constantine/hashes/keccak/keccak_generic.nim
new file mode 100644
index 000000000..1835d5bd3
--- /dev/null
+++ b/constantine/hashes/keccak/keccak_generic.nim
@@ -0,0 +1,330 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  constantine/platforms/primitives,
+  constantine/serialization/endians
+
+# Keccak
+# --------------------------------------------------------------------------------
+#
+# References:
+# - https://keccak.team/keccak_specs_summary.html
+# - https://keccak.team/files/Keccak-reference-3.0.pdf
+# - https://keccak.team/files/Keccak-implementation-3.2.pdf
+# - SHA3 (different padding): https://csrc.nist.gov/publications/detail/fips/202/final
+#
+# Pseudo-code
+# ~~~~~~~~~~~
+# Keccak-f[b](A) {
+#   for i in 0…n-1
+#     A = Round[b](A, RC[i])
+#   return A
+# }
+#
+# Round[b](A,RC) {
+#   # θ step
+#   C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4],   for x in 0…4
+#   D[x] = C[x-1] xor rot(C[x+1],1),                             for x in 0…4
+#   A[x,y] = A[x,y] xor D[x],                           for (x,y) in (0…4,0…4)
+#
+#   # ρ and π steps
+#   B[y,2*x+3*y] = rot(A[x,y], r[x,y]),                 for (x,y) in (0…4,0…4)
+#
+#   # χ step
+#   A[x,y] = B[x,y] xor ((not B[x+1,y]) and B[x+2,y]),  for (x,y) in (0…4,0…4)
+#
+#   # ι step
+#   A[0,0] = A[0,0] xor RC
+#
+#   return A
+# }
+
+# No exceptions allowed in core cryptographic operations
+{.push raises: [].}
+{.push checks: off.}
+
+# Hardware acceleration considerations
+# ------------------------------------------------
+#
+# 1. The χ step uses "and not"; the Keccak implementation guide suggests a "lane-complementing technique"
+#    to reduce the number of `not` from 5 to 1.
+#    However, the BMI1 instruction set, available since AMD Piledriver (2012) and Intel Haswell (2013),
+#    provides `andn`, and ARM has the BIC instruction (Bit Clear) for ANDNOT.
+
+# Types & Constants
+# ------------------------------------------------
+
+type KeccakState* = object
+  ## A Keccak state matrix: 5*5*uint64 = 1600 bits, in column major order
+  ##              ┌─┬─┬─┬─┬─┐
+  ##             ┌─┬─┬─┬─┬─┐┤
+  ##            ┌─┬─┬─┬─┬─┐┤┤
+  ##           ┌─┬─┬─┬─┬─┐┤┤┤
+  ##          ┌─┬─┬─┬─┬─┐┤┤┤┤
+  ##         ┌─┬─┬─┬─┬─┐┤┤┤┤┘
+  ##        ┌─┬─┬─┬─┬─┐┤┤┤┤┘
+  ##       ┌─┬─┬─┬─┬─┐┤┤┤┤┘
+  ##       ├─┼─┼─┼─┼─┤┤┤┤┘
+  ##       ├─┼─┼─┼─┼─┤┤┤┘
+  ##       ├─┼─┼─┼─┼─┤┤┘        ┌─┐ bit
+  ##       ├─┼─┼─┼─┼─┤┘         └─┘         ┌─┐
+  ##       └─┴─┴─┴─┴─┘                     ┌─┐┘
+  ##          state                       ┌─┐┘
+  ##                                     ┌─┐┘
+  ##                    ┌─┐             ┌─┐┘
+  ##                    ├─┤ column     ┌─┐┘
+  ##      row           ├─┤           ┌─┐┘
+  ##  ┌─┬─┬─┬─┬─┐       ├─┤          ┌─┐┘  lane
+  ##  └─┴─┴─┴─┴─┘       ├─┤          └─┘
+  ##                    └─┘
+  ##
+  ##  plane = row * lane
+  ##  slice = row * column
+  ##  sheet = column * lane
+  ##
+  ## Credit: https://github.com/tecosaur/KangarooTwelve.jl
+  state {.align: 64.}: array[5*5, uint64]
+
+func lin_idx(x, y: int): int {.inline.} =
+  x + 5*y # linear index into the 5x5 lane array: x moves within a group of 5 lanes, y selects the group
+
+func `[]`(A: KeccakState, x, y: int): uint64 {.inline.} =
+  A.state[lin_idx(x, y)]
+
+func `[]=`(A: var KeccakState, x, y: int, val: uint64) {.inline.} =
+  A.state[lin_idx(x, y)] = val
+
+func N(exponent: static int, x, y: int): int {.inline.} =
+  # We use algorithm 4 in https://keccak.team/files/Keccak-implementation-3.2.pdf
+  # We have a coordinate displacement matrix N = [1 0]
+  #                                              [1 2]
+  # to store data without overwriting it
+  const exponent = exponent and 3 # exponent mod 4 as N has order 4
+  when exponent == 0:
+    # N⁰ = [1 0]
+    #      [0 1]
+    lin_idx(x, y)
+  elif exponent == 1:
+    # N¹ = [1 0]
+    #      [1 2]
+    lin_idx(x, (x+2*y) mod 5)
+  elif exponent == 2:
+    # N² = [1 0]
+    #      [3 4]
+    lin_idx(x, (3*x+4*y) mod 5)
+  elif exponent == 3:
+    # N³ = [1 0]
+    #      [2 3]
+    lin_idx(x, (2*x+3*y) mod 5)
+  else:
+    {.error: "unreachable".}
+
+func N(A: KeccakState, i: static int, x, y: int): uint64 {.inline.} =
+  A.state[N(i, x, y)]
+
+func N(A: var KeccakState, i: static int, x, y: int): var uint64 {.inline.} =
+  A.state[N(i, x, y)]
+
+# Keccak round constants
+#   are iteratively computed via a linear feedback shift register
+#   rc[t] = (xᵗ mod x⁸ + x⁶ + x⁵ + x⁴ + 1) mod x in GF(2)[x]
+const KRC: array[24, uint64] = [
+    0x0000000000000001'u64,
+    0x0000000000008082'u64,
+    0x800000000000808a'u64,
+    0x8000000080008000'u64,
+    0x000000000000808b'u64,
+    0x0000000080000001'u64,
+    0x8000000080008081'u64,
+    0x8000000000008009'u64,
+    0x000000000000008a'u64,
+    0x0000000000000088'u64,
+    0x0000000080008009'u64,
+    0x000000008000000a'u64,
+    0x000000008000808b'u64,
+    0x800000000000008b'u64,
+    0x8000000000008089'u64,
+    0x8000000000008003'u64,
+    0x8000000000008002'u64,
+    0x8000000000000080'u64,
+    0x000000000000800a'u64,
+    0x800000008000000a'u64,
+    0x8000000080008081'u64,
+    0x8000000000008080'u64,
+    0x0000000080000001'u64,
+    0x8000000080008008'u64,
+]
+
+func genRho(): array[5*5, int] =
+  result[lin_idx(0, 0)] = 0
+  var (x, y) = (1, 0)
+
+  for t in 0 ..< result.len-1: # skip 0
+    # rotation constant r = i(i+1)/2, skipping (0, 0) hence (t+1)(t+2)/2
+    result[lin_idx(x, y)] =
+        (((t+1) * (t+2)) shr 1) and (64-1)
+
+    let Y = (2*x + 3*y) mod 5
+    let X = y
+    x = X
+    y = Y
+
+func rotl(x: uint64, k: static int): uint64 {.inline.} = # rotate `x` left by `k` bits, 0 <= k < 64
+  return (x shl k) or (x shr ((64 - k) and 63)) # mask so that k == 0 (Rho[0,0]) does not shift by 64, which is undefined in C
+
+func permute_generic*(A: var KeccakState, NumRounds: static int) =
+  ## Apply `NumRounds` rounds of Keccak-f[1600] to `A` in-place, using algorithm 4 of https://keccak.team/files/Keccak-implementation-3.2.pdf
+  const Rho = genRho()
+
+  var C {.noinit.}: array[5, uint64]
+  var D {.noinit.}: array[5, uint64]
+  template B: array[5, uint64] = C # Reuse the C buffer for B: C is no longer needed once D is computed
+
+  # We unroll the loop by 4 to:
+  # - reuse memory locations as N is cyclic of order 4
+  # - minimize code size vs unrolling by 24
+  static: doAssert((NumRounds and 3) == 0, "The number of rounds must be a multiple of 4")
+  for j in countup(0, NumRounds-1, 4):
+    staticFor i, 0, 4:
+      # θ₁: Column-parity via sum reduction in GF(2) (i.e. addition is xor)
+      staticFor x, 0, 5:
+        C[x] = A.N(i, x, 0) xor
+                A.N(i, x, 1) xor
+                A.N(i, x, 2) xor
+                A.N(i, x, 3) xor
+                A.N(i, x, 4)
+
+      # θ₂: Sum adjacent column parities
+      staticFor x, 0, 5:
+        D[x] = C[(x+4) mod 5] xor rotl(C[(x+1) mod 5], 1)
+
+      # Keccak state matrix is column major
+      # so y should be the outer loop for cache-friendliness
+      staticFor y, 0, 5:
+        staticFor x, 0, 5:
+          # θ₃: Diffusion
+          # ρ: inter-slice diffusion
+          # π: long-term diffusion
+          B[(x + 2*y) mod 5] = rotl(A.N(i+1, x, y) xor D[x], Rho[N(1, x, y)])
+        staticFor x, 0, 5:
+          # χ: non-linearity
+          A.N(i+1, x, y) = B[x] xor (not(B[(x+1) mod 5]) and B[(x+2) mod 5])
+
+      # ι step: break symmetries by xoring in the constant of round i+j
+      A[0, 0] = A[0, 0] xor KRC[i+j]
+
+template `^=`(accum: var SomeInteger, b: SomeInteger) =
+  accum = accum xor b
+
+func xorInSingle(H: var KeccakState, val: byte, offset: int) {.inline.} =
+  ## Xor a single byte into the Keccak state at state byte position `offset`
+
+  # `offset shr 3`         = offset div sizeof(uint64): index of the 64-bit lane that holds the byte
+  # `(offset and 7) shl 3` = (offset mod sizeof(uint64)) * 8: bit position of the byte within that lane (little-endian)
+  let slot = (offset and 7) shl 3
+  let lane = uint64(val) shl slot # All bits outside of the shifted `val` are 0, and 0 is the neutral element of xor
+  H.state[offset shr 3] ^= lane
+
+func xorInBlock_generic(H: var KeccakState, msg: array[64, byte]) {.inline.} =
+  ## Add new data into the Keccak state
+  # This can benefit from vectorized instructions
+  for i in 0 ..< 8:
+    H.state[i] ^= uint64.fromBytes(msg, i*8, littleEndian)
+
+func xorInPartial*(H: var KeccakState, msg: openArray[byte]) =
+  ## Xor multiple bytes into the state, starting at state byte 0
+  ## The length MUST NOT exceed the state length (200 bytes).
+  debug: doAssert msg.len <= sizeof(H.state)
+
+  # Implementation detail:
+  #   We could avoid an intermediate variable but
+  #   dealing with non-multiple of size(T) length
+  #   would be verbose, and require less than size(T)
+  #   endianness handling.
+  #   Furthermore 2 copies without the "multiple-of"
+  #   tracking overhead might be faster, especially
+  #   if the compiler vectorizes the second one
+  #   or is able to fuse the 2 together.
+  #   Lastly, this is only called when transitioning
+  #   between absorbing and squeezing, for hashing
+  #   this means once, however long a message to hash is.
+  var blck: array[5*5*sizeof(uint64), byte] # zero-init, as large as the state so that a partial block of any rate fits
+  rawCopy(blck, 0, msg, 0, msg.len)
+  for w in 0 ..< H.state.len: H.state[w] = H.state[w] xor uint64.fromBytes(blck, w*8, littleEndian) # the zero padding is neutral for xor
+
+func copyOutWords[W: static int](
+      H: KeccakState,
+      dst: var array[W*8, byte]) {.inline.} =
+  ## Read data from the Keccak state
+  ## and write it into `dst`
+  debug: doAssert dst.len <= sizeof(H.state)
+
+  for w in 0 ..< W:
+    let word = H.state[w]
+    for i in 0 ..< 8:
+      dst[w*8+i] = toByte(word shr (i*8))
+
+func copyOutPartial*(
+      H: KeccakState,
+      hByteOffset: int,
+      dst: var openArray[byte]) {.inline.} =
+  ## Read data from the Keccak state
+  ## and write it into `dst`
+  ## starting from the state byte offset `hByteOffset`
+  ## hByteOffset + dst length MUST NOT exceed the Keccak rate
+  debug: doAssert dst.len + hByteOffset <= sizeof(H.state)
+
+  # Implementation details:
+  #   we could avoid a temporary block
+  #   see `xorInPartial` for rationale
+  var blck {.noInit.}: array[5*5*sizeof(uint64), byte] # fully overwritten below; as large as the state so that any offset below the rate is valid
+  for i in 0 ..< blck.len: blck[i] = toByte(H.state[i shr 3] shr ((i and 7) shl 3)) # serialize every lane in little-endian order
+  rawCopy(dst, 0, blck, hByteOffset, dst.len)
+
+func pad*(H: var KeccakState, hByteOffset: int, delim: static byte, rate: static int) {.inline.} =
+  debug: doAssert hByteOffset < rate
+  H.xorInSingle(delim, hByteOffset)
+  H.xorInSingle(0x80, rate-1)
+
+func hashMessageBlocks_generic*(
+      H: var KeccakState,
+      message: ptr UncheckedArray[byte],
+      numBlocks: int) =
+  ## Hash a message block by block
+  ## Keccak block size is the rate: 136 bytes for the 256-bit variants (200 - 2*32)
+  ## The state MUST be absorb ready
+  ## i.e. previous operation cannot be a squeeze
+  ##      a permutation is needed in-between
+
+  var message = message
+  const rate = 136     # Keccak-256 / SHA3-256 rate. TODO: make a generic Keccak state with auto-derived rate
+  const numRounds = 24 # TODO: auto derive number of rounds
+  for _ in 0 ..< numBlocks:
+    for w in 0 ..< rate div 8: # xor the block into the first rate/8 lanes, read as little-endian words
+      H.state[w] = H.state[w] xor uint64.fromBytes(message, w*8, littleEndian)
+    H.permute_generic(numRounds)
+    message +%= rate
+
+func squeezeDigestBlocks_generic*(
+      H: var KeccakState,
+      digest: ptr UncheckedArray[byte],
+      numBlocks: int) =
+  ## Squeeze a digest block by block
+  ## Keccak digest block size is the rate: 136 bytes for the 256-bit variants (200 - 2*32)
+  ## The state MUST be squeeze ready
+  ## i.e. previous operation cannot be an absorb
+  ##      a permutation is needed in-between
+  var digest = digest
+  const rate = 136     # Keccak-256 / SHA3-256 rate. TODO: make a generic Keccak state with auto-derived rate
+  const numRounds = 24 # TODO: auto derive number of rounds
+  for _ in 0 ..< numBlocks:
+    for i in 0 ..< rate: # serialize the first `rate` bytes of the state, lanes in little-endian order
+      digest[i] = toByte(H.state[i shr 3] shr ((i and 7) shl 3))
+    H.permute_generic(numRounds)
+    digest +%= rate
\ No newline at end of file
diff --git a/constantine/hashes/sha256/sha256_generic.nim b/constantine/hashes/sha256/sha256_generic.nim
index b89291077..fd52b6013 100644
--- a/constantine/hashes/sha256/sha256_generic.nim
+++ b/constantine/hashes/sha256/sha256_generic.nim
@@ -7,7 +7,8 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import
-  constantine/platforms/primitives
+  constantine/platforms/primitives,
+  constantine/serialization/endians
 
 # SHA256, a hash function from the SHA2 family
 # --------------------------------------------------------------------------------
@@ -90,22 +91,6 @@ template s1(x: uint32): uint32 =
   # σ₁
   rotr(x, 17) xor rotr(x, 19) xor (x shr 10)
 
-# Message schedule
-# ------------------------------------------------
-
-template u32BE(blob: array[4, byte]): uint32 =
-  ## Interpret a data blob as a big-endian uint32
-  when nimvm:
-    (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
-  else:
-    when cpuEndian == littleEndian:
-      (blob[0].uint32 shl 24) or (blob[1].uint32 shl 16) or (blob[2].uint32 shl 8) or blob[3].uint32
-    else:
-      cast[uint32](blob)
-
-template getU32at(msg: ptr UncheckedArray[byte], pos: SomeInteger): uint32 =
-  u32BE(cast[ptr array[4, byte]](msg[pos].addr)[])
-
 # State updates
 # ------------------------------------------------
 
@@ -147,7 +132,7 @@ func sha256_rounds_0_15(
        ms: var Sha256_MessageSchedule,
        message: ptr UncheckedArray[byte]) {.inline.} =
   staticFor t, 0, 16:
-    ms.w[t] = message.getU32at(t * sizeof(Word))
+    ms.w[t] = uint32.fromBytes(message, t * sizeof(Word), bigEndian)
     sha256_round(s, ms.w[t], K256[t])
 
 func sha256_rounds_16_63(
diff --git a/constantine/serialization/endians.nim b/constantine/serialization/endians.nim
index fd7f0f0c6..25ef9e41a 100644
--- a/constantine/serialization/endians.nim
+++ b/constantine/serialization/endians.nim
@@ -22,8 +22,9 @@ template toByte*(x: SomeUnsignedInt): byte =
   else:
     byte(x)
 
-template blobFrom*(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int, endian: static Endianness) =
+func blobFrom*(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int, endian: static Endianness) {.inline.} =
   ## Write an integer into a raw binary blob
+  ## The whole binary blob is interpreted as big-endian/little-endian
   ## Swapping endianness if needed
   ## startidx is the first written array item if littleEndian is requested
   ## or the last if bigEndian is requested
@@ -34,42 +35,12 @@ template blobFrom*(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int
     for i in 0 ..< sizeof(src):
       dst[startIdx+sizeof(src)-1-i] = toByte(src shr (i * 8))
 
-func parseFromBlob*(
-           dst: var SomeUnsignedInt,
-           src: openArray[byte],
-           cursor: var uint, endian: static Endianness) {.inline.} =
-  ## Read an unsigned integer from a raw binary blob.
-  ## The `cursor` represents the current index in the array and is updated
-  ## by N bytes where N is the size of `dst` type in bytes.
-  ## The binary blob is interpreted as:
-  ## - an array of words traversed from 0 ..< len (little-endian), via an incremented `cursor`
-  ## - with each word being of `endian` ordering for deserialization purpose.
-  debug:
-    doAssert 0 <= cursor and cursor < src.len.uint
-    doAssert cursor + sizeof(dst).uint <= src.len.uint,
-      "cursor (" & $cursor & ") + sizeof(dst) (" & $sizeof(dst) &
-      ") <= src.len (" & $src.len & ")"
-
-  type U = typeof(dst)
-  const L = sizeof(dst)
-
-  var accum: U = 0
-  when endian == littleEndian:
-    for i in 0'u ..< L:
-      accum = accum or (U(src[cursor+i]) shl (i * 8))
-  else:
-    for i in 0'u ..< L:
-      accum = accum or (U(src[cursor+i]) shl ((L - 1 - i) * 8))
-  dst = accum
-  cursor.inc(L)
-
 func dumpRawInt*(
            dst: var openArray[byte],
            src: SomeUnsignedInt,
            cursor: uint, endian: static Endianness) {.inline.} =
   ## Dump an integer into raw binary form
-  ## The `cursor` represents the current index in the array and is updated
-  ## by N bytes where N is the size of `src` type in bytes.
+  ## The `cursor` represents the current index in the array
   ## The binary blob is interpreted as:
   ## - an array of words traversed from 0 ..< len (little-endian), via an incremented `cursor`
   ## - with each word being of `endian` ordering for deserialization purpose.
@@ -99,15 +70,59 @@ func toBytes*(num: SomeUnsignedInt, endianness: static Endianness): array[sizeof
     for i in 0 ..< L:
       result[i] = toByte(num shr (i * 8))
 
-func fromBytes*(T: type SomeUnsignedInt, bytes: openArray[byte], endianness: static Endianness): T {.inline.} =
+func fromBytes*(T: type SomeUnsignedInt, bytes: array[sizeof(T), byte], endianness: static Endianness): T {.inline.} =
   const L = sizeof(T)
-  debug:
-    doAssert bytes.len == L
-
   # Note: result is zero-init
   when endianness == cpuEndian:
     for i in 0 ..< L:
       result = result or (T(bytes[i]) shl (i*8))
   else:
     for i in 0 ..< L:
-      result = result or (T(bytes[i]) shl ((L-1-i) * 8))
\ No newline at end of file
+      result = result or (T(bytes[i]) shl ((L-1-i) * 8))
+
+template fromBytesImpl(
+      r: var SomeUnsignedInt,
+      bytes: openArray[byte] or ptr UncheckedArray[byte],
+      offset: int,
+      endianness: static Endianness) =
+  # Assemble `r` from the sizeof(r) bytes of `bytes` starting at `offset`, read in `endianness` byte order.
+  # This is a template as with a function array[N, byte] doesn't match "openArray[byte] or something": https://github.com/nim-lang/Nim/issues/7432
+  type T = typeof(r)
+  const L = sizeof(r)
+  r.reset()
+  when endianness == littleEndian: # shifts do not depend on the CPU: branch on the requested byte order, not on `cpuEndian`
+    for i in 0 ..< L:
+      r = r or (T(bytes[i+offset]) shl (i*8))
+  else:
+    for i in 0 ..< L:
+      r = r or (T(bytes[i+offset]) shl ((L-1-i) * 8))
+
+func fromBytes*(
+      T: type SomeUnsignedInt,
+      bytes: openArray[byte],
+      offset: int,
+      endianness: static Endianness): T {.inline.} =
+  ## Read an unsigned integer from a raw binary blob.
+  ## The `offset` is the index of the first byte read, sizeof(T) bytes are read
+  ## The binary blob is interpreted as:
+  ## - an array of words traversed from 0 ..< len (little-endian)
+  ## - with each word being of `endianness` ordering for deserialization purpose.
+  debug:
+    doAssert 0 <= offset and offset < bytes.len
+    doAssert offset + sizeof(T) <= bytes.len,
+      "offset (" & $offset & ") + sizeof(T) (" & $sizeof(T) &
+      ") <= bytes.len (" & $bytes.len & ")"
+
+  result.fromBytesImpl(bytes, offset, endianness)
+
+func fromBytes*(
+      T: type SomeUnsignedInt,
+      bytes: ptr UncheckedArray[byte],
+      offset: int,
+      endianness: static Endianness): T {.inline.} =
+  ## Read an unsigned integer from a raw binary blob.
+  ## The `offset` is the index of the first byte read, sizeof(T) bytes are read
+  ## The binary blob is interpreted as:
+  ## - an array of words traversed from index 0 upwards (little-endian), its length is not known hence not checked here
+  ## - with each word being of `endianness` ordering for deserialization purpose.
+  result.fromBytesImpl(bytes, offset, endianness)