From c6a68f616e186e25791c3b6d7bcac065e5aa22e6 Mon Sep 17 00:00:00 2001 From: Manuel Barbosa Date: Tue, 19 Dec 2023 16:22:52 +0000 Subject: [PATCH] avx2 mlkem --- code/jasmin/mlkem_avx2/Makefile | 141 + code/jasmin/mlkem_avx2/cbd.c | 128 + code/jasmin/mlkem_avx2/cbd.h | 9 + code/jasmin/mlkem_avx2/compile.bench | 147 + code/jasmin/mlkem_avx2/compile.bench.old | 147 + code/jasmin/mlkem_avx2/consts.c | 153 + code/jasmin/mlkem_avx2/consts.h | 40 + code/jasmin/mlkem_avx2/consts.jinc | 104 + .../jasmin/mlkem_avx2/extraction/Array1088.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array128.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array136.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array16.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array168.ec | 3 + .../jasmin/mlkem_avx2/extraction/Array2304.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array24.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array25.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array256.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array32.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array33.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array34.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array4.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array400.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array5.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array64.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array768.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array8.ec | 3 + code/jasmin/mlkem_avx2/extraction/Array960.ec | 3 + code/jasmin/mlkem_avx2/extraction/Makefile | 19 + .../mlkem_avx2/extraction/WArray1088.ec | 3 + .../jasmin/mlkem_avx2/extraction/WArray128.ec | 3 + .../jasmin/mlkem_avx2/extraction/WArray136.ec | 3 + .../mlkem_avx2/extraction/WArray1536.ec | 3 + code/jasmin/mlkem_avx2/extraction/WArray16.ec | 3 + .../jasmin/mlkem_avx2/extraction/WArray168.ec | 3 + .../jasmin/mlkem_avx2/extraction/WArray192.ec | 3 + .../jasmin/mlkem_avx2/extraction/WArray200.ec | 3 + .../jasmin/mlkem_avx2/extraction/WArray256.ec | 3 + code/jasmin/mlkem_avx2/extraction/WArray32.ec | 3 + code/jasmin/mlkem_avx2/extraction/WArray33.ec | 3 + code/jasmin/mlkem_avx2/extraction/WArray34.ec | 3 + code/jasmin/mlkem_avx2/extraction/WArray40.ec | 3 + .../mlkem_avx2/extraction/WArray4608.ec | 3 + .../jasmin/mlkem_avx2/extraction/WArray512.ec | 3 + code/jasmin/mlkem_avx2/extraction/WArray64.ec | 3 + .../jasmin/mlkem_avx2/extraction/WArray768.ec | 3 + .../jasmin/mlkem_avx2/extraction/WArray800.ec | 3 + .../jasmin/mlkem_avx2/extraction/WArray960.ec | 3 + .../jasmin/mlkem_avx2/extraction/jkem_avx2.ec | 5263 +++++++++++++++++ code/jasmin/mlkem_avx2/fips202.c | 549 ++ code/jasmin/mlkem_avx2/fips202.h | 28 + code/jasmin/mlkem_avx2/fips202.jinc | 647 ++ code/jasmin/mlkem_avx2/fips202_4x.jinc | 1434 +++++ code/jasmin/mlkem_avx2/fips202_common.jinc | 6 + code/jasmin/mlkem_avx2/fq.S | 129 + code/jasmin/mlkem_avx2/fq.inc | 26 + code/jasmin/mlkem_avx2/gen_matrix.jazz | 59 + code/jasmin/mlkem_avx2/gen_matrix.jinc | 137 + code/jasmin/mlkem_avx2/gen_matrix.jinc.try0 | 940 +++ code/jasmin/mlkem_avx2/gen_matrix_old.jinc | 129 + code/jasmin/mlkem_avx2/indcpa.c | 320 + code/jasmin/mlkem_avx2/indcpa.h | 36 + code/jasmin/mlkem_avx2/indcpa.jinc | 245 + code/jasmin/mlkem_avx2/jbench.sh | 20 + code/jasmin/mlkem_avx2/jfips202.jazz | 102 + code/jasmin/mlkem_avx2/jindcpa.jazz | 94 + code/jasmin/mlkem_avx2/jkem.jazz | 87 + code/jasmin/mlkem_avx2/jpoly.jazz | 316 + code/jasmin/mlkem_avx2/jpolyvec.jazz | 211 + code/jasmin/mlkem_avx2/jspeed.jazz | 197 + code/jasmin/mlkem_avx2/keccakf1600.jinc | 194 + code/jasmin/mlkem_avx2/kem.c | 
145 + code/jasmin/mlkem_avx2/kem.h | 41 + code/jasmin/mlkem_avx2/kem.jinc | 142 + code/jasmin/mlkem_avx2/ntt.S | 198 + code/jasmin/mlkem_avx2/ntt.c | 152 + code/jasmin/mlkem_avx2/ntt.h | 45 + code/jasmin/mlkem_avx2/params.h | 50 + code/jasmin/mlkem_avx2/params.jinc | 26 + code/jasmin/mlkem_avx2/poly.c | 378 ++ code/jasmin/mlkem_avx2/poly.h | 68 + code/jasmin/mlkem_avx2/poly.jinc | 1411 +++++ code/jasmin/mlkem_avx2/poly_ntt.c | 10 + code/jasmin/mlkem_avx2/polyvec.c | 237 + code/jasmin/mlkem_avx2/polyvec.h | 47 + code/jasmin/mlkem_avx2/polyvec.jinc | 241 + code/jasmin/mlkem_avx2/reduce.c | 62 + code/jasmin/mlkem_avx2/reduce.h | 15 + code/jasmin/mlkem_avx2/reduce.jinc | 95 + code/jasmin/mlkem_avx2/shuffle.S | 261 + code/jasmin/mlkem_avx2/shuffle.inc | 23 + code/jasmin/mlkem_avx2/shuffle.jinc | 192 + code/jasmin/mlkem_avx2/speed.h | 62 + code/jasmin/mlkem_avx2/symmetric-fips202.c | 77 + code/jasmin/mlkem_avx2/symmetric.h | 52 + code/jasmin/mlkem_avx2/test/speed_indcpa.c | 100 + code/jasmin/mlkem_avx2/test/speed_kyber.c | 241 + code/jasmin/mlkem_avx2/test/test.sh | 14 + code/jasmin/mlkem_avx2/test/test_fips202.c | 48 + code/jasmin/mlkem_avx2/test/test_indcpa.c | 62 + code/jasmin/mlkem_avx2/test/test_kem.c | 71 + code/jasmin/mlkem_avx2/test/test_poly_add2.c | 29 + .../mlkem_avx2/test/test_poly_basemul.c | 30 + .../mlkem_avx2/test/test_poly_compress.c | 31 + code/jasmin/mlkem_avx2/test/test_poly_csubq.c | 32 + .../mlkem_avx2/test/test_poly_decompress.c | 24 + .../mlkem_avx2/test/test_poly_frombytes.c | 24 + .../mlkem_avx2/test/test_poly_frommont.c | 36 + .../mlkem_avx2/test/test_poly_frommsg.c | 24 + .../mlkem_avx2/test/test_poly_getnoise.c | 47 + .../jasmin/mlkem_avx2/test/test_poly_invntt.c | 36 + code/jasmin/mlkem_avx2/test/test_poly_ntt.c | 35 + .../jasmin/mlkem_avx2/test/test_poly_reduce.c | 31 + code/jasmin/mlkem_avx2/test/test_poly_sub.c | 29 + .../mlkem_avx2/test/test_poly_tobytes.c | 31 + code/jasmin/mlkem_avx2/test/test_poly_tomsg.c | 31 + .../mlkem_avx2/test/test_polyvec_add2.c | 31 + .../mlkem_avx2/test/test_polyvec_compress.c | 33 + .../mlkem_avx2/test/test_polyvec_csubq.c | 33 + .../mlkem_avx2/test/test_polyvec_decompress.c | 22 + .../mlkem_avx2/test/test_polyvec_frombytes.c | 22 + .../mlkem_avx2/test/test_polyvec_invntt.c | 36 + .../jasmin/mlkem_avx2/test/test_polyvec_ntt.c | 36 + .../test/test_polyvec_pointwise_acc.c | 32 + .../mlkem_avx2/test/test_polyvec_reduce.c | 32 + .../mlkem_avx2/test/test_polyvec_tobytes.c | 35 + code/jasmin/mlkem_avx2/verify.jinc | 81 + 126 files changed, 17600 insertions(+) create mode 100644 code/jasmin/mlkem_avx2/Makefile create mode 100644 code/jasmin/mlkem_avx2/cbd.c create mode 100644 code/jasmin/mlkem_avx2/cbd.h create mode 100644 code/jasmin/mlkem_avx2/compile.bench create mode 100644 code/jasmin/mlkem_avx2/compile.bench.old create mode 100644 code/jasmin/mlkem_avx2/consts.c create mode 100644 code/jasmin/mlkem_avx2/consts.h create mode 100644 code/jasmin/mlkem_avx2/consts.jinc create mode 100644 code/jasmin/mlkem_avx2/extraction/Array1088.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array128.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array136.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array16.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array168.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array2304.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array24.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array25.ec create mode 100644 
code/jasmin/mlkem_avx2/extraction/Array256.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array32.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array33.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array34.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array4.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array400.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array5.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array64.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array768.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array8.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Array960.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/Makefile create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray1088.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray128.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray136.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray1536.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray16.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray168.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray192.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray200.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray256.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray32.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray33.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray34.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray40.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray4608.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray512.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray64.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray768.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray800.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/WArray960.ec create mode 100644 code/jasmin/mlkem_avx2/extraction/jkem_avx2.ec create mode 100644 code/jasmin/mlkem_avx2/fips202.c create mode 100644 code/jasmin/mlkem_avx2/fips202.h create mode 100644 code/jasmin/mlkem_avx2/fips202.jinc create mode 100644 code/jasmin/mlkem_avx2/fips202_4x.jinc create mode 100644 code/jasmin/mlkem_avx2/fips202_common.jinc create mode 100644 code/jasmin/mlkem_avx2/fq.S create mode 100644 code/jasmin/mlkem_avx2/fq.inc create mode 100644 code/jasmin/mlkem_avx2/gen_matrix.jazz create mode 100644 code/jasmin/mlkem_avx2/gen_matrix.jinc create mode 100644 code/jasmin/mlkem_avx2/gen_matrix.jinc.try0 create mode 100644 code/jasmin/mlkem_avx2/gen_matrix_old.jinc create mode 100644 code/jasmin/mlkem_avx2/indcpa.c create mode 100644 code/jasmin/mlkem_avx2/indcpa.h create mode 100644 code/jasmin/mlkem_avx2/indcpa.jinc create mode 100755 code/jasmin/mlkem_avx2/jbench.sh create mode 100644 code/jasmin/mlkem_avx2/jfips202.jazz create mode 100644 code/jasmin/mlkem_avx2/jindcpa.jazz create mode 100644 code/jasmin/mlkem_avx2/jkem.jazz create mode 100644 code/jasmin/mlkem_avx2/jpoly.jazz create mode 100644 code/jasmin/mlkem_avx2/jpolyvec.jazz create mode 100644 code/jasmin/mlkem_avx2/jspeed.jazz create mode 100644 code/jasmin/mlkem_avx2/keccakf1600.jinc create mode 100644 code/jasmin/mlkem_avx2/kem.c create mode 100644 code/jasmin/mlkem_avx2/kem.h create mode 100644 code/jasmin/mlkem_avx2/kem.jinc create mode 100644 code/jasmin/mlkem_avx2/ntt.S create mode 100644 code/jasmin/mlkem_avx2/ntt.c create mode 100644 
code/jasmin/mlkem_avx2/ntt.h create mode 100644 code/jasmin/mlkem_avx2/params.h create mode 100644 code/jasmin/mlkem_avx2/params.jinc create mode 100644 code/jasmin/mlkem_avx2/poly.c create mode 100644 code/jasmin/mlkem_avx2/poly.h create mode 100644 code/jasmin/mlkem_avx2/poly.jinc create mode 100644 code/jasmin/mlkem_avx2/poly_ntt.c create mode 100644 code/jasmin/mlkem_avx2/polyvec.c create mode 100644 code/jasmin/mlkem_avx2/polyvec.h create mode 100644 code/jasmin/mlkem_avx2/polyvec.jinc create mode 100644 code/jasmin/mlkem_avx2/reduce.c create mode 100644 code/jasmin/mlkem_avx2/reduce.h create mode 100644 code/jasmin/mlkem_avx2/reduce.jinc create mode 100644 code/jasmin/mlkem_avx2/shuffle.S create mode 100644 code/jasmin/mlkem_avx2/shuffle.inc create mode 100644 code/jasmin/mlkem_avx2/shuffle.jinc create mode 100644 code/jasmin/mlkem_avx2/speed.h create mode 100644 code/jasmin/mlkem_avx2/symmetric-fips202.c create mode 100644 code/jasmin/mlkem_avx2/symmetric.h create mode 100644 code/jasmin/mlkem_avx2/test/speed_indcpa.c create mode 100644 code/jasmin/mlkem_avx2/test/speed_kyber.c create mode 100755 code/jasmin/mlkem_avx2/test/test.sh create mode 100644 code/jasmin/mlkem_avx2/test/test_fips202.c create mode 100644 code/jasmin/mlkem_avx2/test/test_indcpa.c create mode 100644 code/jasmin/mlkem_avx2/test/test_kem.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_add2.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_basemul.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_compress.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_csubq.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_decompress.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_frombytes.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_frommont.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_frommsg.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_getnoise.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_invntt.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_ntt.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_reduce.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_sub.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_tobytes.c create mode 100644 code/jasmin/mlkem_avx2/test/test_poly_tomsg.c create mode 100644 code/jasmin/mlkem_avx2/test/test_polyvec_add2.c create mode 100644 code/jasmin/mlkem_avx2/test/test_polyvec_compress.c create mode 100644 code/jasmin/mlkem_avx2/test/test_polyvec_csubq.c create mode 100644 code/jasmin/mlkem_avx2/test/test_polyvec_decompress.c create mode 100644 code/jasmin/mlkem_avx2/test/test_polyvec_frombytes.c create mode 100644 code/jasmin/mlkem_avx2/test/test_polyvec_invntt.c create mode 100644 code/jasmin/mlkem_avx2/test/test_polyvec_ntt.c create mode 100644 code/jasmin/mlkem_avx2/test/test_polyvec_pointwise_acc.c create mode 100644 code/jasmin/mlkem_avx2/test/test_polyvec_reduce.c create mode 100644 code/jasmin/mlkem_avx2/test/test_polyvec_tobytes.c create mode 100644 code/jasmin/mlkem_avx2/verify.jinc diff --git a/code/jasmin/mlkem_avx2/Makefile b/code/jasmin/mlkem_avx2/Makefile new file mode 100644 index 00000000..a212eb72 --- /dev/null +++ b/code/jasmin/mlkem_avx2/Makefile @@ -0,0 +1,141 @@ +# -*- Makefile -*- + + +-include ../../Makefile.conf + +CC ?= /usr/bin/gcc +GFLAGS ?= +CFLAGS := -Wall -Wextra -g -Ofast -fomit-frame-pointer +JFLAGS := -lea ${JADDFLAGS} +OS := $(shell uname -s) + +.SECONDARY: jpoly.s jpolyvec.s jfips202.s jindcpa.s jindcpa.o jkem.s + 
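# Editorial note, not part of the patch: listing these generated files as .SECONDARY presumably keeps make's pattern-rule chaining (jpoly.jazz -> jpoly.s -> test binaries, via the %.s: %.jazz rule below) from deleting them as intermediates, so the Jasmin compiler does not have to rerun for every test target.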
+default: test speed + +test: test/test_poly_compress \ + test/test_poly_decompress \ + test/test_poly_tobytes \ + test/test_poly_frombytes \ + test/test_poly_tomsg \ + test/test_poly_frommsg \ + test/test_poly_add2 \ + test/test_poly_sub \ + test/test_poly_ntt \ + test/test_poly_invntt \ + test/test_poly_basemul \ + test/test_poly_frommont \ + test/test_poly_reduce \ + test/test_poly_csubq \ + test/test_poly_getnoise \ + test/test_polyvec_compress\ + test/test_polyvec_decompress\ + test/test_polyvec_tobytes \ + test/test_polyvec_frombytes \ + test/test_polyvec_add2 \ + test/test_polyvec_ntt \ + test/test_polyvec_invntt \ + test/test_polyvec_pointwise_acc \ + test/test_polyvec_reduce\ + test/test_polyvec_csubq \ + test/test_fips202 \ + test/test_indcpa \ + test/test_kem + +speed: test/speed_indcpa \ + test/speed_kyber + +HEADERS = params.h poly.h fips202.h ntt.h indcpa.h kem.h \ + +JHEADERS = params.jinc \ + reduce.jinc \ + fips202_common.jinc \ + fips202.jinc \ + fips202_4x.jinc \ + keccakf1600.jinc \ + consts.jinc \ + shuffle.jinc \ + indcpa.jinc \ + verify.jinc + +POLYHEADERS = poly.jinc \ + consts.jinc \ + +POLYVECHEADERS = polyvec.jinc \ + gen_matrix.jinc \ + +INCS = fq.inc shuffle.inc +SOURCES = poly.c polyvec.c cbd.c fips202.c ntt.c reduce.c symmetric-fips202.c indcpa.c kem.c consts.c shuffle.S fq.S\ + +test/test_indcpa: test/test_indcpa.c $(HEADERS) $(SOURCES) $(INCS) jindcpa.o + $(CC) $(CFLAGS) -o $@ $(SOURCES) jindcpa.o $< + +test/test_kem: test/test_kem.c $(HEADERS) $(SOURCES) $(INCS) jkem.o + $(CC) $(CFLAGS) -o $@ $(SOURCES) jkem.o ~/Desktop/Repos/jasmin/compiler/syscall/jasmin_syscall.o $< + +test/speed_indcpa: test/speed_indcpa.c $(HEADERS) $(SOURCES) $(INCS) jindcpa.o + $(CC) $(CFLAGS) -o $@ $(SOURCES) jindcpa.o $< + +test/speed_kyber: test/speed_kyber.c $(HEADERS) $(SOURCES) $(INCS) jspeed.s + $(CC) $(CFLAGS) -o $@ $(SOURCES) jspeed.s $< + +test/test_fips202: test/test_fips202.c $(HEADERS) fips202.c jfips202.s + $(CC) $(CFLAGS) -o $@ fips202.c jfips202.s $< + +test/test_gen_matrix: test/test_gen_matrix.c $(HEADERS) gen_matrix.s + $(CC) $(CFLAGS) -o $@ gen_matrix.s $< + +test/test_poly_%: test/test_poly_%.c $(HEADERS) $(SOURCES) $(INCS) jpoly.s + $(CC) $(CFLAGS) -o $@ $(SOURCES) jpoly.s $< + +test/test_polyvec_%: test/test_polyvec_%.c $(HEADERS) $(SOURCES) $(INCS) jpolyvec.s + $(CC) $(CFLAGS) -o $@ $(SOURCES) jpolyvec.s $< + +%.s: %.jazz + $(JASMINC) -o $@ $(JFLAGS) $^ + +.PHONY: clean + +clean: + -rm -f *.o + -rm -f gen_matrix.s + -rm -f jindcpa.s + -rm -f jkem.s + -rm -f jfips202.s + -rm -f jpoly.s + -rm -f jpolyvec.s + -rm -f jspeed.s + -rm -f test/test_poly_compress + -rm -f test/test_poly_decompress + -rm -f test/test_poly_tobytes + -rm -f test/test_poly_frombytes + -rm -f test/test_poly_tomsg + -rm -f test/test_poly_frommsg + -rm -f test/test_poly_add2 + -rm -f test/test_poly_sub + -rm -f test/test_poly_ntt + -rm -f test/test_poly_invntt + -rm -f test/test_poly_basemul + -rm -f test/test_poly_frommont + -rm -f test/test_poly_reduce + -rm -f test/test_poly_csubq + -rm -f test/test_poly_getnoise + -rm -f test/test_polyvec_compress + -rm -f test/test_polyvec_decompress + -rm -f test/test_polyvec_tobytes + -rm -f test/test_polyvec_frombytes + -rm -f test/test_polyvec_add2 + -rm -f test/test_polyvec_ntt + -rm -f test/test_polyvec_invntt + -rm -f test/test_polyvec_pointwise_acc + -rm -f test/test_polyvec_reduce + -rm -f test/test_polyvec_csubq + -rm -f test/test_fips202 + -rm -f test/test_gen_matrix + -rm -f test/test_indcpa + -rm -f test/test_kem + -rm -f test/speed_indcpa 
+ -rm -f test/speed_kyber +ifeq ($(OS),Darwin) + -rm -r -f test/*.dSYM +endif diff --git a/code/jasmin/mlkem_avx2/cbd.c b/code/jasmin/mlkem_avx2/cbd.c new file mode 100644 index 00000000..ddcd7860 --- /dev/null +++ b/code/jasmin/mlkem_avx2/cbd.c @@ -0,0 +1,128 @@ +#include <stdint.h> +#include "params.h" +#include "cbd.h" + +/************************************************* +* Name: load32_littleendian +* +* Description: load bytes into a 32-bit integer +* in little-endian order +* +* Arguments: - const unsigned char *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x +**************************************************/ +static uint32_t load32_littleendian(const unsigned char *x) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +/************************************************* +* Name: load24_littleendian +* +* Description: load 3 bytes into a 32-bit integer +* in little-endian order. +* This function is only needed for Kyber-512 +* +* Arguments: - const uint8_t *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x (most significant byte is zero) +**************************************************/ +#if KYBER_ETA1 == 3 +static uint32_t load24_littleendian(const uint8_t x[3]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} +#endif + + +/************************************************* +* Name: cbd2 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *buf: pointer to input byte array +**************************************************/ +static void cbd2(poly *r, const uint8_t buf[2*KYBER_N/4]) +{ + unsigned int i,j; + uint32_t t,d; + int16_t a,b; + + for(i=0;i<KYBER_N/8;i++) { + t = load32_littleendian(buf+4*i); + d = t & 0x55555555; + d += (t>>1) & 0x55555555; + + for(j=0;j<8;j++) { + a = (d >> (4*j+0)) & 0x3; + b = (d >> (4*j+2)) & 0x3; + r->coeffs[8*i+j] = a - b; + } + } +} +
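/* Editorial note, not in the reference code: a worked example of the bit trick in cbd2. d = (t & 0x55555555) + ((t>>1) & 0x55555555) leaves in every 2-bit lane of d the popcount of the matching 2-bit pair of t. For t = 0xB (binary 1011): t & 0x5 = 0001 and (t>>1) & 0x5 = 0101, so d = 0110; then a = d & 0x3 = 2, b = (d >> 2) & 0x3 = 1, and the output coefficient is a - b = 1, always in [-2,2] as required for eta = 2. */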
+/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=3. +* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *buf: pointer to input byte array +**************************************************/ +#if KYBER_ETA1 == 3 +static void cbd3(poly *r, const uint8_t buf[3*KYBER_N/4]) +{ + unsigned int i,j; + uint32_t t,d; + int16_t a,b; + + for(i=0;i<KYBER_N/4;i++) { + t = load24_littleendian(buf+3*i); + d = t & 0x00249249; + d += (t>>1) & 0x00249249; + d += (t>>2) & 0x00249249; + + for(j=0;j<4;j++) { + a = (d >> (6*j+0)) & 0x7; + b = (d >> (6*j+3)) & 0x7; + r->coeffs[4*i+j] = a - b; + } + } +} +#endif + +void poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1*KYBER_N/4]) +{ +#if KYBER_ETA1 == 2 + cbd2(r, buf); +#elif KYBER_ETA1 == 3 + cbd3(r, buf); +#else +#error "This implementation requires eta1 in {2,3}" +#endif +} + +void poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2*KYBER_N/4]) +{ +#if KYBER_ETA2 == 2 + cbd2(r, buf); +#else +#error "This implementation requires eta2 = 2" +#endif +} diff --git a/code/jasmin/mlkem_avx2/cbd.h b/code/jasmin/mlkem_avx2/cbd.h new file mode 100644 index 00000000..b057c161 --- /dev/null +++ b/code/jasmin/mlkem_avx2/cbd.h @@ -0,0 +1,9 @@ +#ifndef CBD_H +#define CBD_H + +#include "poly.h" + +void poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1*KYBER_N/4]); +void poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2*KYBER_N/4]); + +#endif diff --git a/code/jasmin/mlkem_avx2/compile.bench b/code/jasmin/mlkem_avx2/compile.bench new file mode 100644 index 00000000..0981229c --- /dev/null +++ b/code/jasmin/mlkem_avx2/compile.bench @@ -0,0 +1,147 @@ +===================================================== +===== Benchmark with flag -until_typing +===================================================== + +real 0m0.127s +user 0m0.095s +sys 0m0.025s +===================================================== +===== Benchmark with flag -until_cstexp +===================================================== + +real 0m0.114s +user 0m0.092s +sys 0m0.016s +===================================================== +===== Benchmark with flag -until_inline +===================================================== + +real 0m0.323s +user 0m0.297s +sys 0m0.019s +===================================================== +===== Benchmark with flag -until_rmfunc +===================================================== + +real 0m0.320s +user 0m0.294s +sys 0m0.020s +===================================================== +===== Benchmark with flag -until_unroll +===================================================== + +real 0m0.453s +user 0m0.424s +sys 0m0.022s +===================================================== +===== Benchmark with flag -until_splitting +===================================================== + +real 0m0.632s +user 0m0.597s +sys 0m0.027s +===================================================== +===== Benchmark with flag -until_valloc +===================================================== + +real 0m0.036s +user 0m0.016s +sys 0m0.015s +===================================================== +===== Benchmark with flag -until_vallocd +===================================================== + +real 0m0.034s +user 0m0.015s +sys 0m0.013s +===================================================== +===== Benchmark with flag -until_vshare +===================================================== + +real 0m0.036s +user 0m0.016s +sys 0m0.014s +===================================================== +===== Benchmark with flag -until_vshared +===================================================== + +real 0m0.035s +user 0m0.016s +sys 0m0.014s
+===================================================== +===== Benchmark with flag -until_arrexp +===================================================== + +real 0m1.359s +user 0m1.317s +sys 0m0.033s +===================================================== +===== Benchmark with flag -until_rmarrinit +===================================================== + +real 0m1.125s +user 0m1.082s +sys 0m0.033s +===================================================== +===== Benchmark with flag -until_rmglobals +===================================================== + +real 0m2.040s +user 0m1.996s +sys 0m0.034s +===================================================== +===== Benchmark with flag -until_arrexp +===================================================== + +real 0m1.371s +user 0m1.327s +sys 0m0.034s +===================================================== +===== Benchmark with flag -until_makeref +===================================================== + +real 0m1.242s +user 0m1.199s +sys 0m0.034s +===================================================== +===== Benchmark with flag -until_lowering +===================================================== + +real 0m2.186s +user 0m2.140s +sys 0m0.041s +===================================================== +===== Benchmark with flag -until_stkalloc +===================================================== + +real 0m3.252s +user 0m3.202s +sys 0m0.041s +===================================================== +===== Benchmark with flag -until_ralloc +===================================================== + +real 0m3.866s +user 0m3.819s +sys 0m0.037s +===================================================== +===== Benchmark with flag -until_rallocd +===================================================== + +real 0m3.965s +user 0m3.920s +sys 0m0.037s +===================================================== +===== Benchmark with flag -until_linear +===================================================== + +real 0m3.983s +user 0m3.938s +sys 0m0.038s +===================================================== +===== Benchmark with flag -until_asm +===================================================== + +real 0m4.384s +user 0m4.233s +sys 0m0.140s diff --git a/code/jasmin/mlkem_avx2/compile.bench.old b/code/jasmin/mlkem_avx2/compile.bench.old new file mode 100644 index 00000000..e46e66ee --- /dev/null +++ b/code/jasmin/mlkem_avx2/compile.bench.old @@ -0,0 +1,147 @@ +===================================================== +===== Benchmark with flag -until_typing +===================================================== + +real 0m0.026s +user 0m0.023s +sys 0m0.004s +===================================================== +===== Benchmark with flag -until_cstexp +===================================================== + +real 0m0.027s +user 0m0.024s +sys 0m0.003s +===================================================== +===== Benchmark with flag -until_inline +===================================================== + +real 0m0.128s +user 0m0.119s +sys 0m0.009s +===================================================== +===== Benchmark with flag -until_rmfunc +===================================================== + +real 0m0.128s +user 0m0.124s +sys 0m0.004s +===================================================== +===== Benchmark with flag -until_unroll +===================================================== + +real 0m0.813s +user 0m0.789s +sys 0m0.024s +===================================================== +===== Benchmark with flag -until_splitting +===================================================== + +real 0m1.017s 
+user 0m1.012s +sys 0m0.004s +===================================================== +===== Benchmark with flag -until_valloc +===================================================== + +real 0m2.145s +user 0m2.116s +sys 0m0.029s +===================================================== +===== Benchmark with flag -until_vallocd +===================================================== + +real 0m3.375s +user 0m3.322s +sys 0m0.032s +===================================================== +===== Benchmark with flag -until_vshare +===================================================== + +real 0m6.072s +user 0m6.005s +sys 0m0.067s +===================================================== +===== Benchmark with flag -until_vshared +===================================================== + +real 0m9.594s +user 0m9.554s +sys 0m0.039s +===================================================== +===== Benchmark with flag -until_arrexp +===================================================== + +real 0m10.981s +user 0m10.943s +sys 0m0.036s +===================================================== +===== Benchmark with flag -until_rmarrinit +===================================================== + +real 0m9.608s +user 0m9.564s +sys 0m0.043s +===================================================== +===== Benchmark with flag -until_rmglobals +===================================================== + +real 0m11.234s +user 0m11.184s +sys 0m0.050s +===================================================== +===== Benchmark with flag -until_arrexp +===================================================== + +real 0m10.989s +user 0m10.908s +sys 0m0.052s +===================================================== +===== Benchmark with flag -until_makeref +===================================================== + +real 0m11.783s +user 0m11.750s +sys 0m0.032s +===================================================== +===== Benchmark with flag -until_lowering +===================================================== + +real 0m12.629s +user 0m12.561s +sys 0m0.068s +===================================================== +===== Benchmark with flag -until_stkalloc +===================================================== + +real 2m27.958s +user 2m27.867s +sys 0m0.088s +===================================================== +===== Benchmark with flag -until_ralloc +===================================================== + +real 4m43.603s +user 4m43.537s +sys 0m0.057s +===================================================== +===== Benchmark with flag -until_rallocd +===================================================== + +real 4m39.180s +user 4m39.085s +sys 0m0.095s +===================================================== +===== Benchmark with flag -until_linear +===================================================== + +real 4m43.906s +user 4m43.843s +sys 0m0.063s +===================================================== +===== Benchmark with flag -until_asm +===================================================== + +real 4m51.571s +user 4m51.416s +sys 0m0.156s diff --git a/code/jasmin/mlkem_avx2/consts.c b/code/jasmin/mlkem_avx2/consts.c new file mode 100644 index 00000000..7999477d --- /dev/null +++ b/code/jasmin/mlkem_avx2/consts.c @@ -0,0 +1,153 @@ +#include <stdint.h> +#include "params.h" +#include "consts.h" + +#define Q KYBER_Q +#define MONT ((1U << 16) % Q) +#define QINV 62209 // q^-1 mod 2^16 +#define V (((1U << 26) + Q/2)/Q) +#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) +#define FLO (FHI*QINV % 65536) +#define MONTSQHI (MONT*MONT % Q) +#define MONTSQLO (MONTSQHI*QINV % 65536) +#define 
MASK 4095 + +const uint16_t qdata[928] __attribute__((aligned(32))) = { +#define _16XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, + +#define _16XQINV 16 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +#define _16XV 32 + V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + +#define _16XFLO 48 + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + +#define _16XFHI 64 + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + +#define _16XMONTSQLO 80 + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + +#define _16XMONTSQHI 96 + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + +#define _16XMASK 112 + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + +#define _ZETAS_EXP 128 + 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, + 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, + 3158, 3158, 3158, 3158, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, + 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, + 573, 573, 2004, 2004, 264, 264, 383, 383, + 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, + 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, + 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, + 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, + 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, + 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, + 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, + 2226, 555, 2078, 1550, 422, 177, 3038, 1574, + 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, + 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, + 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, + 430, 843, 871, 105, 587, 3094, 2869, 1653, + 778, 3182, 1483, 1119, 644, 349, 329, 3254, + 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, + 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, + 48842, 48842, 48842, 48842, 287, 287, 287, 287, + 287, 287, 287, 287, 202, 202, 202, 202, + 202, 202, 202, 202, 10690, 10690, 10690, 10690, + 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, + 31164, 31164, 31164, 31164, 962, 962, 962, 962, + 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, + 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, + 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, + 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, + 732, 732, 608, 608, 1787, 1787, 411, 411, + 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, + 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, + 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, + 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, + 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, + 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, + 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, + 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, + 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, + 41677, 
45279, 7757, 23132, 1097, 610, 2044, 384, + 3193, 1994, 220, 1670, 1799, 794, 2475, 478, + 3021, 991, 1869, 1628, 0, 0, 0, 0, + +#define _ZETAS_INV_EXP 528 + 42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, + 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, + 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, + 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, + 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, + 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, + 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, + 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, + 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, + 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, + 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, + 951, 247, 1421, 3222, 2499, 271, 90, 853, + 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, + 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, + 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, + 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, + 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, + 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, + 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, + 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, + 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, + 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, + 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, + 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, + 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, + 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, + 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, + 2210, 1846, 147, 2551, 1676, 460, 235, 2742, + 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, + 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, + 45043, 32227, 11478, 335, 156, 2911, 872, 1590, + 602, 777, 2170, 246, 1755, 291, 3152, 2907, + 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, + 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, + 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, + 666, 320, 8, 2813, 1544, 282, 1838, 1293, + 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, + 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, + 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, + 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, + 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, + 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, + 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, + 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, + 171, 171, 171, 171, 12403, 12403, 12403, 12403, + 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, + 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, + 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, + 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, + 60300, 60300, 1932, 1932, 0, 0, 0, 0 +}; diff --git a/code/jasmin/mlkem_avx2/consts.h b/code/jasmin/mlkem_avx2/consts.h new file mode 100644 index 00000000..93edff96 --- /dev/null +++ b/code/jasmin/mlkem_avx2/consts.h @@ -0,0 +1,40 @@ +#ifndef CONSTS_H +#define CONSTS_H + +#include "params.h" + +#define _16XQ 0 +#define _16XQINV 16 +#define _16XV 32 +#define _16XFLO 48 +#define _16XFHI 64 +#define _16XMONTSQLO 80 +#define _16XMONTSQHI 96 +#define _16XMASK 112 +#define _ZETAS_EXP 128 +#define _ZETAS_INV_EXP 528 + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found. 
+ * + * This define helps us get around this + */ +#ifdef __ASSEMBLER__ +#if defined(__WIN32__) || defined(__APPLE__) +#define decorate(s) _##s +#define cdecl2(s) decorate(s) +#define cdecl(s) cdecl2(KYBER_NAMESPACE(##s)) +#else +#define cdecl(s) KYBER_NAMESPACE(##s) +#endif +#endif + +#ifndef __ASSEMBLER__ +#include <stdint.h> +#define qdata KYBER_NAMESPACE(qdata) +extern const uint16_t qdata[]; +#endif + +#endif diff --git a/code/jasmin/mlkem_avx2/consts.jinc b/code/jasmin/mlkem_avx2/consts.jinc new file mode 100644 index 00000000..ddefdb70 --- /dev/null +++ b/code/jasmin/mlkem_avx2/consts.jinc @@ -0,0 +1,104 @@ +u16[128] jzetas = {2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, 2127, 1855, 1468, + 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, 732, 608, 1787, 411, 3124, 1758, + 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, + 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, + 2226, 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653, + 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173, 3254, + 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670, + 2144, 1799, 2051, 794, 1819, 2475, 2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628}; + + +u16[128] jzetas_inv = {1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, 1278, 1530, 1185, + 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, 1285, 2007, 2719, 2726, 2232, 2512, + 75, 156, 3000, 2911, 2980, 872, 2685, 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, + 1676, 1755, 460, 291, 235, 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, + 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, + 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, + 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, 829, 2946, 3065, 1325, 2756, + 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, 3127, 3042, 1907, 1836, 1517, 359, 758, 1441}; + +u16[400] jzetas_exp = {31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, + 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, 3158, 3158, 3158, 3158, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, + 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, 573, 573, 2004, 2004, 264, 264, 383, 383, + 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, + 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, + 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, + 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, 2226, 555, 2078, 1550, 422, 177, 3038, 1574, + 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, + 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, 430, 843, 871, 105, 587, 3094, 2869, 1653, + 778, 3182, 1483, 1119, 644, 349, 329, 3254, 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, + 28191, 28191, 28191, 28191, 48842, 48842, 
48842, 48842, 48842, 48842, 48842, 48842, 287, 287, 287, 287, + 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690, 10690, 10690, 10690, + 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, 31164, 31164, 31164, 31164, 962, 962, 962, 962, + 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, + 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, + 732, 732, 608, 608, 1787, 1787, 411, 411, 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, + 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, + 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, + 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, + 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, + 3193, 1994, 220, 1670, 1799, 794, 2475, 478, 3021, 991, 1869, 1628, 0, 0, 0, 0}; + +u16[400] jzetas_inv_exp = {42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, + 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, + 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, + 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, + 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, + 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, + 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, + 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, + 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, + 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, + 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, + 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, + 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, + 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, 2210, 1846, 147, 2551, 1676, 460, 235, 2742, + 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, + 45043, 32227, 11478, 335, 156, 2911, 872, 1590, 602, 777, 2170, 246, 1755, 291, 3152, 2907, + 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, + 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, + 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, + 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, + 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, + 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, + 171, 171, 171, 171, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 52012, 52012, 
52012, 52012, + 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, + 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, 60300, 60300, 1932, 1932, 0, 0, 0, 0 +}; + +u16[16] jqx16 = {KYBER_Q, KYBER_Q, KYBER_Q, KYBER_Q, KYBER_Q, KYBER_Q, KYBER_Q, KYBER_Q, + KYBER_Q, KYBER_Q, KYBER_Q, KYBER_Q, KYBER_Q, KYBER_Q, KYBER_Q, KYBER_Q}; + +u16[16] jqinvx16 = {62209, 62209, 62209, 62209, 62209, 62209, 62209, 62209, + 62209, 62209, 62209, 62209, 62209, 62209, 62209, 62209}; + +u16[16] jvx16 = {20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159}; + +u16[16] jfhix16 = {1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441}; + +u16[16] jflox16 = {55457, 55457, 55457, 55457, 55457, 55457, 55457, 55457, + 55457, 55457, 55457, 55457, 55457, 55457, 55457, 55457}; + +u16[16] maskx16 = {4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, + 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095}; + +u16[16] hqx16_p1 = {1665, 1665, 1665, 1665, 1665, 1665, 1665, 1665, + 1665, 1665, 1665, 1665, 1665, 1665, 1665, 1665}; + +u16[16] hqx16_m1 = {1664, 1664, 1664, 1664, 1664, 1664, 1664, 1664, + 1664, 1664, 1664, 1664, 1664, 1664, 1664, 1664}; + +u16[16] hhqx16 = {832, 832, 832, 832, 832, 832, 832, 832, + 832, 832, 832, 832, 832, 832, 832, 832}; + +u16[16] mqinvx16 = {80635, 80635, 80635, 80635, 80635, 80635, 80635, 80635, + 80635, 80635, 80635, 80635, 80635, 80635, 80635, 80635}; + +u16[16] jdmontx16 = {1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353}; diff --git a/code/jasmin/mlkem_avx2/extraction/Array1088.ec b/code/jasmin/mlkem_avx2/extraction/Array1088.ec new file mode 100644 index 00000000..de2a1ea4 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array1088.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array1088 with op size <- 1088. diff --git a/code/jasmin/mlkem_avx2/extraction/Array128.ec b/code/jasmin/mlkem_avx2/extraction/Array128.ec new file mode 100644 index 00000000..e5880272 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array128.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array128 with op size <- 128. diff --git a/code/jasmin/mlkem_avx2/extraction/Array136.ec b/code/jasmin/mlkem_avx2/extraction/Array136.ec new file mode 100644 index 00000000..d73cf0b1 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array136.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array136 with op size <- 136. diff --git a/code/jasmin/mlkem_avx2/extraction/Array16.ec b/code/jasmin/mlkem_avx2/extraction/Array16.ec new file mode 100644 index 00000000..429639d4 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array16.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array16 with op size <- 16. diff --git a/code/jasmin/mlkem_avx2/extraction/Array168.ec b/code/jasmin/mlkem_avx2/extraction/Array168.ec new file mode 100644 index 00000000..6abfbe44 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array168.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array168 with op size <- 168. 
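[Editorial note, not part of the patch] The Array*.ec helpers above and below are all instances of one mechanical pattern: each one clones the generic PolyArray theory from Jasmin's JArray library at a fixed size, matching an array length that occurs in the extracted program. As a sketch, a hypothetical 48-element instance (not generated by this patch) would be the same two lines:
from Jasmin require import JArray.
clone export PolyArray as Array48 with op size <- 48.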
diff --git a/code/jasmin/mlkem_avx2/extraction/Array2304.ec b/code/jasmin/mlkem_avx2/extraction/Array2304.ec new file mode 100644 index 00000000..f0038311 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array2304.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array2304 with op size <- 2304. diff --git a/code/jasmin/mlkem_avx2/extraction/Array24.ec b/code/jasmin/mlkem_avx2/extraction/Array24.ec new file mode 100644 index 00000000..8982b77c --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array24.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array24 with op size <- 24. diff --git a/code/jasmin/mlkem_avx2/extraction/Array25.ec b/code/jasmin/mlkem_avx2/extraction/Array25.ec new file mode 100644 index 00000000..30bcb172 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array25.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array25 with op size <- 25. diff --git a/code/jasmin/mlkem_avx2/extraction/Array256.ec b/code/jasmin/mlkem_avx2/extraction/Array256.ec new file mode 100644 index 00000000..6f03a141 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array256.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array256 with op size <- 256. diff --git a/code/jasmin/mlkem_avx2/extraction/Array32.ec b/code/jasmin/mlkem_avx2/extraction/Array32.ec new file mode 100644 index 00000000..c72b94f2 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array32.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array32 with op size <- 32. diff --git a/code/jasmin/mlkem_avx2/extraction/Array33.ec b/code/jasmin/mlkem_avx2/extraction/Array33.ec new file mode 100644 index 00000000..c60f0144 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array33.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array33 with op size <- 33. diff --git a/code/jasmin/mlkem_avx2/extraction/Array34.ec b/code/jasmin/mlkem_avx2/extraction/Array34.ec new file mode 100644 index 00000000..d6bb77b1 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array34.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array34 with op size <- 34. diff --git a/code/jasmin/mlkem_avx2/extraction/Array4.ec b/code/jasmin/mlkem_avx2/extraction/Array4.ec new file mode 100644 index 00000000..bc0e12ed --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array4.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array4 with op size <- 4. diff --git a/code/jasmin/mlkem_avx2/extraction/Array400.ec b/code/jasmin/mlkem_avx2/extraction/Array400.ec new file mode 100644 index 00000000..3e9031a1 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array400.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array400 with op size <- 400. diff --git a/code/jasmin/mlkem_avx2/extraction/Array5.ec b/code/jasmin/mlkem_avx2/extraction/Array5.ec new file mode 100644 index 00000000..8dc7b36e --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array5.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array5 with op size <- 5. diff --git a/code/jasmin/mlkem_avx2/extraction/Array64.ec b/code/jasmin/mlkem_avx2/extraction/Array64.ec new file mode 100644 index 00000000..3ccc4576 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array64.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. 
+ +clone export PolyArray as Array64 with op size <- 64. diff --git a/code/jasmin/mlkem_avx2/extraction/Array768.ec b/code/jasmin/mlkem_avx2/extraction/Array768.ec new file mode 100644 index 00000000..241538a0 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array768.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array768 with op size <- 768. diff --git a/code/jasmin/mlkem_avx2/extraction/Array8.ec b/code/jasmin/mlkem_avx2/extraction/Array8.ec new file mode 100644 index 00000000..0f8b9ee8 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array8.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array8 with op size <- 8. diff --git a/code/jasmin/mlkem_avx2/extraction/Array960.ec b/code/jasmin/mlkem_avx2/extraction/Array960.ec new file mode 100644 index 00000000..bb0f324a --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Array960.ec @@ -0,0 +1,3 @@ +from Jasmin require import JArray. + +clone export PolyArray as Array960 with op size <- 960. diff --git a/code/jasmin/mlkem_avx2/extraction/Makefile b/code/jasmin/mlkem_avx2/extraction/Makefile new file mode 100644 index 00000000..640317a1 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/Makefile @@ -0,0 +1,19 @@ +# -*- Makefile -*- + +# -------------------------------------------------------------------- +-include ../../../Makefile.conf + +# -------------------------------------------------------------------- +.PHONY: all ec clean + +# -------------------------------------------------------------------- +all: ec + +ec: + $(JASMINC) ../jkem.jazz -oec jkem_avx2.ec \ + -ec jade_kem_kyber_kyber768_amd64_avx2v_keypair \ + -ec jade_kem_kyber_kyber768_amd64_avx2v_enc \ + -ec jade_kem_kyber_kyber768_amd64_avx2v_dec + +clean: + rm -f *.ec diff --git a/code/jasmin/mlkem_avx2/extraction/WArray1088.ec b/code/jasmin/mlkem_avx2/extraction/WArray1088.ec new file mode 100644 index 00000000..811cd399 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray1088.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray1088 with op size <- 1088. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray128.ec b/code/jasmin/mlkem_avx2/extraction/WArray128.ec new file mode 100644 index 00000000..3c9d6893 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray128.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray128 with op size <- 128. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray136.ec b/code/jasmin/mlkem_avx2/extraction/WArray136.ec new file mode 100644 index 00000000..6fa8f20e --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray136.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray136 with op size <- 136. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray1536.ec b/code/jasmin/mlkem_avx2/extraction/WArray1536.ec new file mode 100644 index 00000000..83524e52 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray1536.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray1536 with op size <- 1536. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray16.ec b/code/jasmin/mlkem_avx2/extraction/WArray16.ec new file mode 100644 index 00000000..f2ed50c2 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray16.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray16 with op size <- 16. 
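[Editorial note, not part of the patch] The extraction/Makefile above is what produces these theories: it invokes $(JASMINC) on ../jkem.jazz with -oec to name the output file and one -ec flag per exported entry point (keypair, enc, dec). Assuming JASMINC is set by the included Makefile.conf, the usual invocation is:
make -C code/jasmin/mlkem_avx2/extraction ec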
diff --git a/code/jasmin/mlkem_avx2/extraction/WArray168.ec b/code/jasmin/mlkem_avx2/extraction/WArray168.ec new file mode 100644 index 00000000..7292dff0 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray168.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray168 with op size <- 168. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray192.ec b/code/jasmin/mlkem_avx2/extraction/WArray192.ec new file mode 100644 index 00000000..c8564c54 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray192.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray192 with op size <- 192. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray200.ec b/code/jasmin/mlkem_avx2/extraction/WArray200.ec new file mode 100644 index 00000000..99b887c8 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray200.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray200 with op size <- 200. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray256.ec b/code/jasmin/mlkem_avx2/extraction/WArray256.ec new file mode 100644 index 00000000..b07b1c22 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray256.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray256 with op size <- 256. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray32.ec b/code/jasmin/mlkem_avx2/extraction/WArray32.ec new file mode 100644 index 00000000..b828f8d3 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray32.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray32 with op size <- 32. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray33.ec b/code/jasmin/mlkem_avx2/extraction/WArray33.ec new file mode 100644 index 00000000..1e8a9d93 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray33.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray33 with op size <- 33. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray34.ec b/code/jasmin/mlkem_avx2/extraction/WArray34.ec new file mode 100644 index 00000000..1f331a9b --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray34.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray34 with op size <- 34. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray40.ec b/code/jasmin/mlkem_avx2/extraction/WArray40.ec new file mode 100644 index 00000000..003b6e23 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray40.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray40 with op size <- 40. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray4608.ec b/code/jasmin/mlkem_avx2/extraction/WArray4608.ec new file mode 100644 index 00000000..e32c47df --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray4608.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray4608 with op size <- 4608. diff --git a/code/jasmin/mlkem_avx2/extraction/WArray512.ec b/code/jasmin/mlkem_avx2/extraction/WArray512.ec new file mode 100644 index 00000000..a690df87 --- /dev/null +++ b/code/jasmin/mlkem_avx2/extraction/WArray512.ec @@ -0,0 +1,3 @@ +from Jasmin require import JWord_array. + +clone export WArray as WArray512 with op size <- 512. 
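[Editorial note, not part of the patch] Where PolyArray models a Jasmin array at the value level, the WArray*.ec clones from JWord_array model the same storage as raw bytes; the sizes in this patch line up accordingly, e.g. a u16[256] polynomial (Array256) is viewed as WArray512 and a u16[768] polynomial vector (Array768) as WArray1536, since 256 * 2 = 512 and 768 * 2 = 1536 bytes.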
diff --git a/code/jasmin/mlkem_avx2/extraction/WArray64.ec b/code/jasmin/mlkem_avx2/extraction/WArray64.ec
new file mode 100644
index 00000000..6f4aeb6b
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/extraction/WArray64.ec
@@ -0,0 +1,3 @@
+from Jasmin require import JWord_array.
+
+clone export WArray as WArray64 with op size <- 64.
diff --git a/code/jasmin/mlkem_avx2/extraction/WArray768.ec b/code/jasmin/mlkem_avx2/extraction/WArray768.ec
new file mode 100644
index 00000000..8852789a
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/extraction/WArray768.ec
@@ -0,0 +1,3 @@
+from Jasmin require import JWord_array.
+
+clone export WArray as WArray768 with op size <- 768.
diff --git a/code/jasmin/mlkem_avx2/extraction/WArray800.ec b/code/jasmin/mlkem_avx2/extraction/WArray800.ec
new file mode 100644
index 00000000..2cf23ef6
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/extraction/WArray800.ec
@@ -0,0 +1,3 @@
+from Jasmin require import JWord_array.
+
+clone export WArray as WArray800 with op size <- 800.
diff --git a/code/jasmin/mlkem_avx2/extraction/WArray960.ec b/code/jasmin/mlkem_avx2/extraction/WArray960.ec
new file mode 100644
index 00000000..c2f56fe9
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/extraction/WArray960.ec
@@ -0,0 +1,3 @@
+from Jasmin require import JWord_array.
+
+clone export WArray as WArray960 with op size <- 960.
diff --git a/code/jasmin/mlkem_avx2/extraction/jkem_avx2.ec b/code/jasmin/mlkem_avx2/extraction/jkem_avx2.ec
new file mode 100644
index 00000000..1504daa1
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/extraction/jkem_avx2.ec
@@ -0,0 +1,5263 @@
+require import AllCore IntDiv CoreMap List Distr.
+from Jasmin require import JModel_x86.
+import SLH64.
+
+
+require import Array4 Array5 Array8 Array16 Array24 Array25 Array32 Array33
+               Array34 Array64 Array128 Array136 Array168 Array256 Array400
+               Array768 Array960 Array1088 Array2304.
+require import WArray16 WArray32 WArray33 WArray34 WArray40 WArray64
+               WArray128 WArray136 WArray168 WArray192 WArray200 WArray256
+               WArray512 WArray768 WArray800 WArray960 WArray1088 WArray1536
+               WArray4608.
+
+abbrev pvc_shufbidx_s = Array32.of_list witness [W8.of_int 0; W8.of_int 1;
+W8.of_int 2; W8.of_int 3; W8.of_int 4; W8.of_int 8; W8.of_int 9;
+W8.of_int 10; W8.of_int 11; W8.of_int 12; W8.of_int (-1); W8.of_int (-1);
+W8.of_int (-1); W8.of_int (-1); W8.of_int (-1); W8.of_int (-1); W8.of_int 9;
+W8.of_int 10; W8.of_int 11; W8.of_int 12; W8.of_int (-1); W8.of_int (-1);
+W8.of_int (-1); W8.of_int (-1); W8.of_int (-1); W8.of_int (-1); W8.of_int 0;
+W8.of_int 1; W8.of_int 2; W8.of_int 3; W8.of_int 4; W8.of_int 8].
+
+
+abbrev pvc_sllvdidx_s = W64.of_int 12.
+
+
+abbrev pvc_shift2_s = W64.of_int 288230380513787905.
+
+
+abbrev pvc_mask_s = W16.of_int 1023.
+
+
+abbrev pvc_shift1_s = W16.of_int 4096.
+
+
+abbrev pvc_off_s = W16.of_int 15.
+
+
+abbrev pvd_mask_s = W32.of_int 2145394680.
+
+
+abbrev pvd_sllvdidx_s = W64.of_int 4.
+
+
+abbrev pvd_shufbdidx_s = Array32.of_list witness [W8.of_int 0; W8.of_int 1;
+W8.of_int 1; W8.of_int 2; W8.of_int 2; W8.of_int 3; W8.of_int 3; W8.of_int 4;
+W8.of_int 5; W8.of_int 6; W8.of_int 6; W8.of_int 7; W8.of_int 7; W8.of_int 8;
+W8.of_int 8; W8.of_int 9; W8.of_int 2; W8.of_int 3; W8.of_int 3; W8.of_int 4;
+W8.of_int 4; W8.of_int 5; W8.of_int 5; W8.of_int 6; W8.of_int 7; W8.of_int 8;
+W8.of_int 8; W8.of_int 9; W8.of_int 9; W8.of_int 10; W8.of_int 10;
+W8.of_int 11].
+
+
+abbrev pvd_q_s = W32.of_int 218182660.
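+(* A reading note on pvc_*/pvd_* (an interpretation of the constants, not
+   stated in the sources): these are the table and shift constants of the
+   vectorized polyvec compress/decompress. Kyber768 compresses each
+   u-coefficient to du = 10 bits, hence pvc_mask_s = 2^10 - 1 = 1023; the
+   Array32 tables are VPSHUFB byte-permutation indices per 128-bit lane,
+   with -1 (top bit set) zeroing the corresponding output byte. *)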
+ + +abbrev cbd_jshufbidx = Array32.of_list witness [W8.of_int 0; W8.of_int 1; +W8.of_int 2; W8.of_int (-1); W8.of_int 3; W8.of_int 4; W8.of_int 5; +W8.of_int (-1); W8.of_int 6; W8.of_int 7; W8.of_int 8; W8.of_int (-1); +W8.of_int 9; W8.of_int 10; W8.of_int 11; W8.of_int (-1); W8.of_int 4; +W8.of_int 5; W8.of_int 6; W8.of_int (-1); W8.of_int 7; W8.of_int 8; +W8.of_int 9; W8.of_int (-1); W8.of_int 10; W8.of_int 11; W8.of_int 12; +W8.of_int (-1); W8.of_int 13; W8.of_int 14; W8.of_int 15; W8.of_int (-1)]. + + +abbrev pfm_idx_s = Array16.of_list witness [W8.of_int 0; W8.of_int 1; +W8.of_int 4; W8.of_int 5; W8.of_int 8; W8.of_int 9; W8.of_int 12; +W8.of_int 13; W8.of_int 2; W8.of_int 3; W8.of_int 6; W8.of_int 7; +W8.of_int 10; W8.of_int 11; W8.of_int 14; W8.of_int 15]. + + +abbrev pfm_shift_s = Array4.of_list witness [W32.of_int 3; W32.of_int 2; +W32.of_int 1; W32.of_int 0]. + + +abbrev pd_shift_s = W32.of_int 8390656. + + +abbrev pd_mask_s = W32.of_int 15728655. + + +abbrev pd_jshufbidx = Array32.of_list witness [W8.of_int 0; W8.of_int 0; +W8.of_int 0; W8.of_int 0; W8.of_int 1; W8.of_int 1; W8.of_int 1; W8.of_int 1; +W8.of_int 2; W8.of_int 2; W8.of_int 2; W8.of_int 2; W8.of_int 3; W8.of_int 3; +W8.of_int 3; W8.of_int 3; W8.of_int 4; W8.of_int 4; W8.of_int 4; W8.of_int 4; +W8.of_int 5; W8.of_int 5; W8.of_int 5; W8.of_int 5; W8.of_int 6; W8.of_int 6; +W8.of_int 6; W8.of_int 6; W8.of_int 7; W8.of_int 7; W8.of_int 7; +W8.of_int 7]. + + +abbrev pc_permidx_s = Array8.of_list witness [W32.of_int 0; W32.of_int 4; +W32.of_int 1; W32.of_int 5; W32.of_int 2; W32.of_int 6; W32.of_int 3; +W32.of_int 7]. + + +abbrev pc_shift2_s = W16.of_int 4097. + + +abbrev pc_mask_s = W16.of_int 15. + + +abbrev pc_shift1_s = W16.of_int 512. + + +abbrev KeccakF1600RoundConstants = Array24.of_list witness [W256.of_int 6277101735386680764176071790128604879584176795969512275969; +W256.of_int 206504092890751023779864409751650843328560248233805014854828162; +W256.of_int (-57896044618657891154337237002533387566728630465883811983015055433200855646070); +W256.of_int (-57896044605177918687001956587831074660851270707671256656745893357814858874880); +W256.of_int 206560586806369503906741994397762000772476505824968740465311883; +W256.of_int 13479973339852421633450939126351338586088633588469736715148203130881; +W256.of_int (-57896044605177917877255832722949256082138009781081227190387086677747775274879); +W256.of_int (-57896044618657891964083360867415206145441891392473841449373862113267939246071); +W256.of_int 866240039483361945456297907037747473382616397843792694083722; +W256.of_int 853685836012588583927945763457490263623448044251853669531784; +W256.of_int 13480179078138900667299665761280331841242166839448401411882560290825; +W256.of_int 13479973396346337251931066003935984697246077504727327878873813614602; +W256.of_int 13480179894162126267568165104169664557960801185391384887919156166795; +W256.of_int (-57896044618658096836129800417901987324072977609879901317736128966209602322293); +W256.of_int (-57896044618657891160614338737920068330904702256012416862599232229170367922039); +W256.of_int (-57896044618657892001745971279735290730498322133245470726878922889085012901885); +W256.of_int (-57896044618657892008023073015121971494674393923374075606463099685054525177854); +W256.of_int (-57896044618658096905177919507155475730009767301294554993162073721874237357952); +W256.of_int 205750840682504622088163281136835410743010147018288673381711882; +W256.of_int (-57896044605178124312300604384719547540610971740509902075209375727097995067382); +W256.of_int 
(-57896044605177917877255832722949256082138009781081227190387086677747775274879); +W256.of_int (-57896044618657891217108254356400195208489348367169860778856823392895978405760); +W256.of_int 13479973339852421633450939126351338586088633588469736715148203130881; +W256.of_int (-57896044605177918636785142704737628547442696386642417620072478990058760667128)]. + + +abbrev rho8 = W256.of_int 13620818001941277694121380808605999856886653716761013959207994299728839901191. + + +abbrev rho56 = W256.of_int 10910488462195273559651782724632284871561478246514020268633800075540923875841. + + +abbrev shake_sep = Array4.of_list witness [W64.of_int (-9223372036854775808); +W64.of_int (-9223372036854775808); W64.of_int (-9223372036854775808); +W64.of_int (-9223372036854775808)]. + + +abbrev KECCAK_RC = Array24.of_list witness [W64.of_int 1; W64.of_int 32898; +W64.of_int (-9223372036854742902); W64.of_int (-9223372034707259392); +W64.of_int 32907; W64.of_int 2147483649; W64.of_int (-9223372034707259263); +W64.of_int (-9223372036854743031); W64.of_int 138; W64.of_int 136; +W64.of_int 2147516425; W64.of_int 2147483658; W64.of_int 2147516555; +W64.of_int (-9223372036854775669); W64.of_int (-9223372036854742903); +W64.of_int (-9223372036854743037); W64.of_int (-9223372036854743038); +W64.of_int (-9223372036854775680); W64.of_int 32778; +W64.of_int (-9223372034707292150); W64.of_int (-9223372034707259263); +W64.of_int (-9223372036854742912); W64.of_int 2147483649; +W64.of_int (-9223372034707259384)]. + + +abbrev jdmontx16 = Array16.of_list witness [W16.of_int 1353; W16.of_int 1353; +W16.of_int 1353; W16.of_int 1353; W16.of_int 1353; W16.of_int 1353; +W16.of_int 1353; W16.of_int 1353; W16.of_int 1353; W16.of_int 1353; +W16.of_int 1353; W16.of_int 1353; W16.of_int 1353; W16.of_int 1353; +W16.of_int 1353; W16.of_int 1353]. + + +abbrev mqinvx16 = Array16.of_list witness [W16.of_int 15099; +W16.of_int 15099; W16.of_int 15099; W16.of_int 15099; W16.of_int 15099; +W16.of_int 15099; W16.of_int 15099; W16.of_int 15099; W16.of_int 15099; +W16.of_int 15099; W16.of_int 15099; W16.of_int 15099; W16.of_int 15099; +W16.of_int 15099; W16.of_int 15099; W16.of_int 15099]. + + +abbrev hhqx16 = Array16.of_list witness [W16.of_int 832; W16.of_int 832; +W16.of_int 832; W16.of_int 832; W16.of_int 832; W16.of_int 832; +W16.of_int 832; W16.of_int 832; W16.of_int 832; W16.of_int 832; +W16.of_int 832; W16.of_int 832; W16.of_int 832; W16.of_int 832; +W16.of_int 832; W16.of_int 832]. + + +abbrev hqx16_m1 = Array16.of_list witness [W16.of_int 1664; W16.of_int 1664; +W16.of_int 1664; W16.of_int 1664; W16.of_int 1664; W16.of_int 1664; +W16.of_int 1664; W16.of_int 1664; W16.of_int 1664; W16.of_int 1664; +W16.of_int 1664; W16.of_int 1664; W16.of_int 1664; W16.of_int 1664; +W16.of_int 1664; W16.of_int 1664]. + + +abbrev hqx16_p1 = Array16.of_list witness [W16.of_int 1665; W16.of_int 1665; +W16.of_int 1665; W16.of_int 1665; W16.of_int 1665; W16.of_int 1665; +W16.of_int 1665; W16.of_int 1665; W16.of_int 1665; W16.of_int 1665; +W16.of_int 1665; W16.of_int 1665; W16.of_int 1665; W16.of_int 1665; +W16.of_int 1665; W16.of_int 1665]. + + +abbrev maskx16 = Array16.of_list witness [W16.of_int 4095; W16.of_int 4095; +W16.of_int 4095; W16.of_int 4095; W16.of_int 4095; W16.of_int 4095; +W16.of_int 4095; W16.of_int 4095; W16.of_int 4095; W16.of_int 4095; +W16.of_int 4095; W16.of_int 4095; W16.of_int 4095; W16.of_int 4095; +W16.of_int 4095; W16.of_int 4095]. 
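+(* Keccak material above: KECCAK_RC lists the 24 Keccak-f[1600] round
+   constants as signed 64-bit words, and KeccakF1600RoundConstants broadcasts
+   the same constants over the four 64-bit lanes of a W256 for the 4-way
+   permutation; rho8/rho56 are VPSHUFB masks realizing the byte-aligned rho
+   rotations by 8 and 56. The 16-lane W16 constants here and just below
+   encode the q = 3329 arithmetic: jqx16 replicates q, jqinvx16 = -3327 =
+   q^-1 mod 2^16, jvx16 = 20159 = round(2^26 / q) (the Barrett constant),
+   jdmontx16 = 1353 = 2^32 mod q (Montgomery R^2 for R = 2^16),
+   hqx16_m1/hqx16_p1 = (q-1)/2 and (q+1)/2, and maskx16 = 2^12 - 1. *)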
+ + +abbrev jflox16 = Array16.of_list witness [W16.of_int (-10079); +W16.of_int (-10079); W16.of_int (-10079); W16.of_int (-10079); +W16.of_int (-10079); W16.of_int (-10079); W16.of_int (-10079); +W16.of_int (-10079); W16.of_int (-10079); W16.of_int (-10079); +W16.of_int (-10079); W16.of_int (-10079); W16.of_int (-10079); +W16.of_int (-10079); W16.of_int (-10079); W16.of_int (-10079)]. + + +abbrev jfhix16 = Array16.of_list witness [W16.of_int 1441; W16.of_int 1441; +W16.of_int 1441; W16.of_int 1441; W16.of_int 1441; W16.of_int 1441; +W16.of_int 1441; W16.of_int 1441; W16.of_int 1441; W16.of_int 1441; +W16.of_int 1441; W16.of_int 1441; W16.of_int 1441; W16.of_int 1441; +W16.of_int 1441; W16.of_int 1441]. + + +abbrev jvx16 = Array16.of_list witness [W16.of_int 20159; W16.of_int 20159; +W16.of_int 20159; W16.of_int 20159; W16.of_int 20159; W16.of_int 20159; +W16.of_int 20159; W16.of_int 20159; W16.of_int 20159; W16.of_int 20159; +W16.of_int 20159; W16.of_int 20159; W16.of_int 20159; W16.of_int 20159; +W16.of_int 20159; W16.of_int 20159]. + + +abbrev jqinvx16 = Array16.of_list witness [W16.of_int (-3327); +W16.of_int (-3327); W16.of_int (-3327); W16.of_int (-3327); +W16.of_int (-3327); W16.of_int (-3327); W16.of_int (-3327); +W16.of_int (-3327); W16.of_int (-3327); W16.of_int (-3327); +W16.of_int (-3327); W16.of_int (-3327); W16.of_int (-3327); +W16.of_int (-3327); W16.of_int (-3327); W16.of_int (-3327)]. + + +abbrev jqx16 = Array16.of_list witness [W16.of_int 3329; W16.of_int 3329; +W16.of_int 3329; W16.of_int 3329; W16.of_int 3329; W16.of_int 3329; +W16.of_int 3329; W16.of_int 3329; W16.of_int 3329; W16.of_int 3329; +W16.of_int 3329; W16.of_int 3329; W16.of_int 3329; W16.of_int 3329; +W16.of_int 3329; W16.of_int 3329]. + + +abbrev jzetas_inv_exp = Array400.of_list witness [W16.of_int (-23131); +W16.of_int (-7756); W16.of_int 20258; W16.of_int 23860; W16.of_int 17443; +W16.of_int (-23210); W16.of_int 20199; W16.of_int 21498; W16.of_int (-14469); +W16.of_int 11045; W16.of_int 14903; W16.of_int 6280; W16.of_int 32385; +W16.of_int (-15355); W16.of_int (-2145); W16.of_int (-20296); +W16.of_int 1701; W16.of_int 1460; W16.of_int 2338; W16.of_int 308; +W16.of_int 2851; W16.of_int 854; W16.of_int 2535; W16.of_int 1530; +W16.of_int 1659; W16.of_int 3109; W16.of_int 1335; W16.of_int 136; +W16.of_int 2945; W16.of_int 1285; W16.of_int 2719; W16.of_int 2232; +W16.of_int 17423; W16.of_int (-23997); W16.of_int (-28643); +W16.of_int (-31636); W16.of_int (-10906); W16.of_int 22502; W16.of_int 7934; +W16.of_int (-10335); W16.of_int (-16989); W16.of_int (-24214); +W16.of_int (-10945); W16.of_int 20927; W16.of_int (-24391); W16.of_int 7383; +W16.of_int (-25434); W16.of_int 31184; W16.of_int 1807; W16.of_int 2371; +W16.of_int 2333; W16.of_int 108; W16.of_int 870; W16.of_int 1510; +W16.of_int 1278; W16.of_int 1185; W16.of_int 1187; W16.of_int 874; +W16.of_int 2111; W16.of_int 1215; W16.of_int 1465; W16.of_int 2007; +W16.of_int 2726; W16.of_int 2512; W16.of_int 17915; W16.of_int 24156; +W16.of_int (-4311); W16.of_int (-16831); W16.of_int 12757; W16.of_int 29156; +W16.of_int (-14016); W16.of_int (-13426); W16.of_int (-18249); +W16.of_int 30199; W16.of_int (-9075); W16.of_int 28310; W16.of_int 8899; +W16.of_int 15887; W16.of_int 28250; W16.of_int (-19883); W16.of_int 1275; +W16.of_int 2652; W16.of_int 1065; W16.of_int 2881; W16.of_int 725; +W16.of_int 1508; W16.of_int 2368; W16.of_int 398; W16.of_int 951; +W16.of_int 247; W16.of_int 1421; W16.of_int 3222; W16.of_int 2499; +W16.of_int 271; W16.of_int 90; W16.of_int 
853; W16.of_int 16163; +W16.of_int 16163; W16.of_int (-26675); W16.of_int (-26675); +W16.of_int (-8858); W16.of_int (-8858); W16.of_int (-18426); +W16.of_int (-18426); W16.of_int (-8799); W16.of_int (-8799); +W16.of_int 10533; W16.of_int 10533; W16.of_int (-24312); W16.of_int (-24312); +W16.of_int 28073; W16.of_int 28073; W16.of_int 1571; W16.of_int 1571; +W16.of_int 205; W16.of_int 205; W16.of_int 2918; W16.of_int 2918; +W16.of_int 1542; W16.of_int 1542; W16.of_int 2721; W16.of_int 2721; +W16.of_int 2597; W16.of_int 2597; W16.of_int 2312; W16.of_int 2312; +W16.of_int 681; W16.of_int 681; W16.of_int (-31163); W16.of_int (-31163); +W16.of_int (-31163); W16.of_int (-31163); W16.of_int 11202; W16.of_int 11202; +W16.of_int 11202; W16.of_int 11202; W16.of_int (-1358); W16.of_int (-1358); +W16.of_int (-1358); W16.of_int (-1358); W16.of_int (-10689); +W16.of_int (-10689); W16.of_int (-10689); W16.of_int (-10689); +W16.of_int 1861; W16.of_int 1861; W16.of_int 1861; W16.of_int 1861; +W16.of_int 1474; W16.of_int 1474; W16.of_int 1474; W16.of_int 1474; +W16.of_int 1202; W16.of_int 1202; W16.of_int 1202; W16.of_int 1202; +W16.of_int 2367; W16.of_int 2367; W16.of_int 2367; W16.of_int 2367; +W16.of_int 16695; W16.of_int 16695; W16.of_int 16695; W16.of_int 16695; +W16.of_int 16695; W16.of_int 16695; W16.of_int 16695; W16.of_int 16695; +W16.of_int (-28190); W16.of_int (-28190); W16.of_int (-28190); +W16.of_int (-28190); W16.of_int (-28190); W16.of_int (-28190); +W16.of_int (-28190); W16.of_int (-28190); W16.of_int 3127; W16.of_int 3127; +W16.of_int 3127; W16.of_int 3127; W16.of_int 3127; W16.of_int 3127; +W16.of_int 3127; W16.of_int 3127; W16.of_int 3042; W16.of_int 3042; +W16.of_int 3042; W16.of_int 3042; W16.of_int 3042; W16.of_int 3042; +W16.of_int 3042; W16.of_int 3042; W16.of_int (-787); W16.of_int (-787); +W16.of_int 1517; W16.of_int 1517; W16.of_int 12619; W16.of_int (-19528); +W16.of_int (-18524); W16.of_int (-20099); W16.of_int (-12638); +W16.of_int 18742; W16.of_int (-30317); W16.of_int 32503; W16.of_int (-5492); +W16.of_int (-23092); W16.of_int 4587; W16.of_int (-13130); W16.of_int 21656; +W16.of_int 14234; W16.of_int (-13386); W16.of_int (-11181); W16.of_int 75; +W16.of_int 3000; W16.of_int 2980; W16.of_int 2685; W16.of_int 2210; +W16.of_int 1846; W16.of_int 147; W16.of_int 2551; W16.of_int 1676; +W16.of_int 460; W16.of_int 235; W16.of_int 2742; W16.of_int 3224; +W16.of_int 2458; W16.of_int 2486; W16.of_int 2899; W16.of_int 5276; +W16.of_int 14431; W16.of_int (-17560); W16.of_int 18486; W16.of_int 28762; +W16.of_int (-29175); W16.of_int (-10630); W16.of_int (-32010); +W16.of_int (-6181); W16.of_int 14883; W16.of_int (-944); W16.of_int 27739; +W16.of_int (-20493); W16.of_int 32227; W16.of_int 11478; W16.of_int 335; +W16.of_int 156; W16.of_int 2911; W16.of_int 872; W16.of_int 1590; +W16.of_int 602; W16.of_int 777; W16.of_int 2170; W16.of_int 246; +W16.of_int 1755; W16.of_int 291; W16.of_int 3152; W16.of_int 2907; +W16.of_int 1779; W16.of_int 1251; W16.of_int 2774; W16.of_int 1103; +W16.of_int (-27836); W16.of_int 25987; W16.of_int 650; W16.of_int (-9134); +W16.of_int 12442; W16.of_int (-16064); W16.of_int (-26616); W16.of_int 12797; +W16.of_int (-25080); W16.of_int (-20710); W16.of_int (-20178); +W16.of_int 23565; W16.of_int (-30966); W16.of_int (-1496); W16.of_int 6517; +W16.of_int 5690; W16.of_int 1860; W16.of_int 3203; W16.of_int 1162; +W16.of_int 1618; W16.of_int 666; W16.of_int 320; W16.of_int 8; +W16.of_int 2813; W16.of_int 1544; W16.of_int 282; W16.of_int 1838; +W16.of_int 1293; 
W16.of_int 2314; W16.of_int 552; W16.of_int 2677; +W16.of_int 2106; W16.of_int 26242; W16.of_int 26242; W16.of_int (-21438); +W16.of_int (-21438); W16.of_int 1103; W16.of_int 1103; W16.of_int (-5571); +W16.of_int (-5571); W16.of_int 29058; W16.of_int 29058; W16.of_int 26361; +W16.of_int 26361; W16.of_int (-17363); W16.of_int (-17363); W16.of_int 5828; +W16.of_int 5828; W16.of_int 130; W16.of_int 130; W16.of_int 1602; +W16.of_int 1602; W16.of_int 1871; W16.of_int 1871; W16.of_int 829; +W16.of_int 829; W16.of_int 2946; W16.of_int 2946; W16.of_int 3065; +W16.of_int 3065; W16.of_int 1325; W16.of_int 1325; W16.of_int 2756; +W16.of_int 2756; W16.of_int 15691; W16.of_int 15691; W16.of_int 15691; +W16.of_int 15691; W16.of_int 3800; W16.of_int 3800; W16.of_int 3800; +W16.of_int 3800; W16.of_int (-27757); W16.of_int (-27757); +W16.of_int (-27757); W16.of_int (-27757); W16.of_int 20907; W16.of_int 20907; +W16.of_int 20907; W16.of_int 20907; W16.of_int 3147; W16.of_int 3147; +W16.of_int 3147; W16.of_int 3147; W16.of_int 1752; W16.of_int 1752; +W16.of_int 1752; W16.of_int 1752; W16.of_int 2707; W16.of_int 2707; +W16.of_int 2707; W16.of_int 2707; W16.of_int 171; W16.of_int 171; +W16.of_int 171; W16.of_int 171; W16.of_int 12403; W16.of_int 12403; +W16.of_int 12403; W16.of_int 12403; W16.of_int 12403; W16.of_int 12403; +W16.of_int 12403; W16.of_int 12403; W16.of_int (-13524); W16.of_int (-13524); +W16.of_int (-13524); W16.of_int (-13524); W16.of_int (-13524); +W16.of_int (-13524); W16.of_int (-13524); W16.of_int (-13524); +W16.of_int 1907; W16.of_int 1907; W16.of_int 1907; W16.of_int 1907; +W16.of_int 1907; W16.of_int 1907; W16.of_int 1907; W16.of_int 1907; +W16.of_int 1836; W16.of_int 1836; W16.of_int 1836; W16.of_int 1836; +W16.of_int 1836; W16.of_int 1836; W16.of_int 1836; W16.of_int 1836; +W16.of_int (-14745); W16.of_int (-14745); W16.of_int 359; W16.of_int 359; +W16.of_int (-5236); W16.of_int (-5236); W16.of_int 1932; W16.of_int 1932; +W16.of_int 0; W16.of_int 0; W16.of_int 0; W16.of_int 0]. 
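+(* jzetas_inv_exp (above) and jzetas_exp (below) hold the NTT twiddle factors
+   in the AVX2-friendly "expanded" layout: each block of Montgomery-folded
+   low halves (zeta * q^-1 mod 2^16, printed signed, hence the out-of-range
+   values) is paired with a block of the plain zetas (< q), so a butterfly
+   computes the precomputed Montgomery product with one VPMULL plus one
+   VPMULH (see __fqmulprecomp16x below); entries repeat 2/4/8/16 times per
+   vector to match the stride of each NTT level, and the trailing zeros pad
+   the tables to 400 entries. *)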
+ + +abbrev jzetas_exp = Array400.of_list witness [W16.of_int 31499; +W16.of_int 31499; W16.of_int 2571; W16.of_int 2571; W16.of_int 14746; +W16.of_int 14746; W16.of_int 2970; W16.of_int 2970; W16.of_int 13525; +W16.of_int 13525; W16.of_int 13525; W16.of_int 13525; W16.of_int 13525; +W16.of_int 13525; W16.of_int 13525; W16.of_int 13525; W16.of_int (-12402); +W16.of_int (-12402); W16.of_int (-12402); W16.of_int (-12402); +W16.of_int (-12402); W16.of_int (-12402); W16.of_int (-12402); +W16.of_int (-12402); W16.of_int 1493; W16.of_int 1493; W16.of_int 1493; +W16.of_int 1493; W16.of_int 1493; W16.of_int 1493; W16.of_int 1493; +W16.of_int 1493; W16.of_int 1422; W16.of_int 1422; W16.of_int 1422; +W16.of_int 1422; W16.of_int 1422; W16.of_int 1422; W16.of_int 1422; +W16.of_int 1422; W16.of_int (-20906); W16.of_int (-20906); +W16.of_int (-20906); W16.of_int (-20906); W16.of_int 27758; W16.of_int 27758; +W16.of_int 27758; W16.of_int 27758; W16.of_int (-3799); W16.of_int (-3799); +W16.of_int (-3799); W16.of_int (-3799); W16.of_int (-15690); +W16.of_int (-15690); W16.of_int (-15690); W16.of_int (-15690); +W16.of_int 3158; W16.of_int 3158; W16.of_int 3158; W16.of_int 3158; +W16.of_int 622; W16.of_int 622; W16.of_int 622; W16.of_int 622; +W16.of_int 1577; W16.of_int 1577; W16.of_int 1577; W16.of_int 1577; +W16.of_int 182; W16.of_int 182; W16.of_int 182; W16.of_int 182; +W16.of_int (-5827); W16.of_int (-5827); W16.of_int 17364; W16.of_int 17364; +W16.of_int (-26360); W16.of_int (-26360); W16.of_int (-29057); +W16.of_int (-29057); W16.of_int 5572; W16.of_int 5572; W16.of_int (-1102); +W16.of_int (-1102); W16.of_int 21439; W16.of_int 21439; W16.of_int (-26241); +W16.of_int (-26241); W16.of_int 573; W16.of_int 573; W16.of_int 2004; +W16.of_int 2004; W16.of_int 264; W16.of_int 264; W16.of_int 383; +W16.of_int 383; W16.of_int 2500; W16.of_int 2500; W16.of_int 1458; +W16.of_int 1458; W16.of_int 1727; W16.of_int 1727; W16.of_int 3199; +W16.of_int 3199; W16.of_int (-5689); W16.of_int (-6516); W16.of_int 1497; +W16.of_int 30967; W16.of_int (-23564); W16.of_int 20179; W16.of_int 20711; +W16.of_int 25081; W16.of_int (-12796); W16.of_int 26617; W16.of_int 16065; +W16.of_int (-12441); W16.of_int 9135; W16.of_int (-649); W16.of_int (-25986); +W16.of_int 27837; W16.of_int 1223; W16.of_int 652; W16.of_int 2777; +W16.of_int 1015; W16.of_int 2036; W16.of_int 1491; W16.of_int 3047; +W16.of_int 1785; W16.of_int 516; W16.of_int 3321; W16.of_int 3009; +W16.of_int 2663; W16.of_int 1711; W16.of_int 2167; W16.of_int 126; +W16.of_int 1469; W16.of_int (-334); W16.of_int (-11477); W16.of_int (-32226); +W16.of_int 20494; W16.of_int (-27738); W16.of_int 945; W16.of_int (-14882); +W16.of_int 6182; W16.of_int 32011; W16.of_int 10631; W16.of_int 29176; +W16.of_int (-28761); W16.of_int (-18485); W16.of_int 17561; +W16.of_int (-14430); W16.of_int (-5275); W16.of_int 2226; W16.of_int 555; +W16.of_int 2078; W16.of_int 1550; W16.of_int 422; W16.of_int 177; +W16.of_int 3038; W16.of_int 1574; W16.of_int 3083; W16.of_int 1159; +W16.of_int 2552; W16.of_int 2727; W16.of_int 1739; W16.of_int 2457; +W16.of_int 418; W16.of_int 3173; W16.of_int 11182; W16.of_int 13387; +W16.of_int (-14233); W16.of_int (-21655); W16.of_int 13131; +W16.of_int (-4586); W16.of_int 23093; W16.of_int 5493; W16.of_int (-32502); +W16.of_int 30318; W16.of_int (-18741); W16.of_int 12639; W16.of_int 20100; +W16.of_int 18525; W16.of_int 19529; W16.of_int (-12618); W16.of_int 430; +W16.of_int 843; W16.of_int 871; W16.of_int 105; W16.of_int 587; +W16.of_int 3094; W16.of_int 2869; 
W16.of_int 1653; W16.of_int 778; +W16.of_int 3182; W16.of_int 1483; W16.of_int 1119; W16.of_int 644; +W16.of_int 349; W16.of_int 329; W16.of_int 3254; W16.of_int 788; +W16.of_int 788; W16.of_int 1812; W16.of_int 1812; W16.of_int 28191; +W16.of_int 28191; W16.of_int 28191; W16.of_int 28191; W16.of_int 28191; +W16.of_int 28191; W16.of_int 28191; W16.of_int 28191; W16.of_int (-16694); +W16.of_int (-16694); W16.of_int (-16694); W16.of_int (-16694); +W16.of_int (-16694); W16.of_int (-16694); W16.of_int (-16694); +W16.of_int (-16694); W16.of_int 287; W16.of_int 287; W16.of_int 287; +W16.of_int 287; W16.of_int 287; W16.of_int 287; W16.of_int 287; +W16.of_int 287; W16.of_int 202; W16.of_int 202; W16.of_int 202; +W16.of_int 202; W16.of_int 202; W16.of_int 202; W16.of_int 202; +W16.of_int 202; W16.of_int 10690; W16.of_int 10690; W16.of_int 10690; +W16.of_int 10690; W16.of_int 1359; W16.of_int 1359; W16.of_int 1359; +W16.of_int 1359; W16.of_int (-11201); W16.of_int (-11201); +W16.of_int (-11201); W16.of_int (-11201); W16.of_int 31164; W16.of_int 31164; +W16.of_int 31164; W16.of_int 31164; W16.of_int 962; W16.of_int 962; +W16.of_int 962; W16.of_int 962; W16.of_int 2127; W16.of_int 2127; +W16.of_int 2127; W16.of_int 2127; W16.of_int 1855; W16.of_int 1855; +W16.of_int 1855; W16.of_int 1855; W16.of_int 1468; W16.of_int 1468; +W16.of_int 1468; W16.of_int 1468; W16.of_int (-28072); W16.of_int (-28072); +W16.of_int 24313; W16.of_int 24313; W16.of_int (-10532); W16.of_int (-10532); +W16.of_int 8800; W16.of_int 8800; W16.of_int 18427; W16.of_int 18427; +W16.of_int 8859; W16.of_int 8859; W16.of_int 26676; W16.of_int 26676; +W16.of_int (-16162); W16.of_int (-16162); W16.of_int 2648; W16.of_int 2648; +W16.of_int 1017; W16.of_int 1017; W16.of_int 732; W16.of_int 732; +W16.of_int 608; W16.of_int 608; W16.of_int 1787; W16.of_int 1787; +W16.of_int 411; W16.of_int 411; W16.of_int 3124; W16.of_int 3124; +W16.of_int 1758; W16.of_int 1758; W16.of_int 19884; W16.of_int (-28249); +W16.of_int (-15886); W16.of_int (-8898); W16.of_int (-28309); +W16.of_int 9076; W16.of_int (-30198); W16.of_int 18250; W16.of_int 13427; +W16.of_int 14017; W16.of_int (-29155); W16.of_int (-12756); W16.of_int 16832; +W16.of_int 4312; W16.of_int (-24155); W16.of_int (-17914); W16.of_int 2476; +W16.of_int 3239; W16.of_int 3058; W16.of_int 830; W16.of_int 107; +W16.of_int 1908; W16.of_int 3082; W16.of_int 2378; W16.of_int 2931; +W16.of_int 961; W16.of_int 1821; W16.of_int 2604; W16.of_int 448; +W16.of_int 2264; W16.of_int 677; W16.of_int 2054; W16.of_int (-31183); +W16.of_int 25435; W16.of_int (-7382); W16.of_int 24392; W16.of_int (-20926); +W16.of_int 10946; W16.of_int 24215; W16.of_int 16990; W16.of_int 10336; +W16.of_int (-7933); W16.of_int (-22501); W16.of_int 10907; W16.of_int 31637; +W16.of_int 28644; W16.of_int 23998; W16.of_int (-17422); W16.of_int 817; +W16.of_int 603; W16.of_int 1322; W16.of_int 1864; W16.of_int 2114; +W16.of_int 1218; W16.of_int 2455; W16.of_int 2142; W16.of_int 2144; +W16.of_int 2051; W16.of_int 1819; W16.of_int 2459; W16.of_int 3221; +W16.of_int 996; W16.of_int 958; W16.of_int 1522; W16.of_int 20297; +W16.of_int 2146; W16.of_int 15356; W16.of_int (-32384); W16.of_int (-6279); +W16.of_int (-14902); W16.of_int (-11044); W16.of_int 14470; +W16.of_int (-21497); W16.of_int (-20198); W16.of_int 23211; +W16.of_int (-17442); W16.of_int (-23859); W16.of_int (-20257); +W16.of_int 7757; W16.of_int 23132; W16.of_int 1097; W16.of_int 610; +W16.of_int 2044; W16.of_int 384; W16.of_int 3193; W16.of_int 1994; +W16.of_int 220; 
W16.of_int 1670; W16.of_int 1799; W16.of_int 794; +W16.of_int 2475; W16.of_int 478; W16.of_int 3021; W16.of_int 991; +W16.of_int 1869; W16.of_int 1628; W16.of_int 0; W16.of_int 0; W16.of_int 0; +W16.of_int 0]. + + +abbrev jzetas_inv = Array128.of_list witness [W16.of_int 1701; +W16.of_int 1807; W16.of_int 1460; W16.of_int 2371; W16.of_int 2338; +W16.of_int 2333; W16.of_int 308; W16.of_int 108; W16.of_int 2851; +W16.of_int 870; W16.of_int 854; W16.of_int 1510; W16.of_int 2535; +W16.of_int 1278; W16.of_int 1530; W16.of_int 1185; W16.of_int 1659; +W16.of_int 1187; W16.of_int 3109; W16.of_int 874; W16.of_int 1335; +W16.of_int 2111; W16.of_int 136; W16.of_int 1215; W16.of_int 2945; +W16.of_int 1465; W16.of_int 1285; W16.of_int 2007; W16.of_int 2719; +W16.of_int 2726; W16.of_int 2232; W16.of_int 2512; W16.of_int 75; +W16.of_int 156; W16.of_int 3000; W16.of_int 2911; W16.of_int 2980; +W16.of_int 872; W16.of_int 2685; W16.of_int 1590; W16.of_int 2210; +W16.of_int 602; W16.of_int 1846; W16.of_int 777; W16.of_int 147; +W16.of_int 2170; W16.of_int 2551; W16.of_int 246; W16.of_int 1676; +W16.of_int 1755; W16.of_int 460; W16.of_int 291; W16.of_int 235; +W16.of_int 3152; W16.of_int 2742; W16.of_int 2907; W16.of_int 3224; +W16.of_int 1779; W16.of_int 2458; W16.of_int 1251; W16.of_int 2486; +W16.of_int 2774; W16.of_int 2899; W16.of_int 1103; W16.of_int 1275; +W16.of_int 2652; W16.of_int 1065; W16.of_int 2881; W16.of_int 725; +W16.of_int 1508; W16.of_int 2368; W16.of_int 398; W16.of_int 951; +W16.of_int 247; W16.of_int 1421; W16.of_int 3222; W16.of_int 2499; +W16.of_int 271; W16.of_int 90; W16.of_int 853; W16.of_int 1860; +W16.of_int 3203; W16.of_int 1162; W16.of_int 1618; W16.of_int 666; +W16.of_int 320; W16.of_int 8; W16.of_int 2813; W16.of_int 1544; +W16.of_int 282; W16.of_int 1838; W16.of_int 1293; W16.of_int 2314; +W16.of_int 552; W16.of_int 2677; W16.of_int 2106; W16.of_int 1571; +W16.of_int 205; W16.of_int 2918; W16.of_int 1542; W16.of_int 2721; +W16.of_int 2597; W16.of_int 2312; W16.of_int 681; W16.of_int 130; +W16.of_int 1602; W16.of_int 1871; W16.of_int 829; W16.of_int 2946; +W16.of_int 3065; W16.of_int 1325; W16.of_int 2756; W16.of_int 1861; +W16.of_int 1474; W16.of_int 1202; W16.of_int 2367; W16.of_int 3147; +W16.of_int 1752; W16.of_int 2707; W16.of_int 171; W16.of_int 3127; +W16.of_int 3042; W16.of_int 1907; W16.of_int 1836; W16.of_int 1517; +W16.of_int 359; W16.of_int 758; W16.of_int 1441]. 
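+(* jzetas and jzetas_inv are the flat 128-entry twiddle tables in standard
+   Montgomery form, matching the reference-implementation zeta tables, and
+   presumably serve the non-expanded (scalar) NTT code paths rather than the
+   packed AVX2 butterflies. *)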
+ + +abbrev jzetas = Array128.of_list witness [W16.of_int 2285; W16.of_int 2571; +W16.of_int 2970; W16.of_int 1812; W16.of_int 1493; W16.of_int 1422; +W16.of_int 287; W16.of_int 202; W16.of_int 3158; W16.of_int 622; +W16.of_int 1577; W16.of_int 182; W16.of_int 962; W16.of_int 2127; +W16.of_int 1855; W16.of_int 1468; W16.of_int 573; W16.of_int 2004; +W16.of_int 264; W16.of_int 383; W16.of_int 2500; W16.of_int 1458; +W16.of_int 1727; W16.of_int 3199; W16.of_int 2648; W16.of_int 1017; +W16.of_int 732; W16.of_int 608; W16.of_int 1787; W16.of_int 411; +W16.of_int 3124; W16.of_int 1758; W16.of_int 1223; W16.of_int 652; +W16.of_int 2777; W16.of_int 1015; W16.of_int 2036; W16.of_int 1491; +W16.of_int 3047; W16.of_int 1785; W16.of_int 516; W16.of_int 3321; +W16.of_int 3009; W16.of_int 2663; W16.of_int 1711; W16.of_int 2167; +W16.of_int 126; W16.of_int 1469; W16.of_int 2476; W16.of_int 3239; +W16.of_int 3058; W16.of_int 830; W16.of_int 107; W16.of_int 1908; +W16.of_int 3082; W16.of_int 2378; W16.of_int 2931; W16.of_int 961; +W16.of_int 1821; W16.of_int 2604; W16.of_int 448; W16.of_int 2264; +W16.of_int 677; W16.of_int 2054; W16.of_int 2226; W16.of_int 430; +W16.of_int 555; W16.of_int 843; W16.of_int 2078; W16.of_int 871; +W16.of_int 1550; W16.of_int 105; W16.of_int 422; W16.of_int 587; +W16.of_int 177; W16.of_int 3094; W16.of_int 3038; W16.of_int 2869; +W16.of_int 1574; W16.of_int 1653; W16.of_int 3083; W16.of_int 778; +W16.of_int 1159; W16.of_int 3182; W16.of_int 2552; W16.of_int 1483; +W16.of_int 2727; W16.of_int 1119; W16.of_int 1739; W16.of_int 644; +W16.of_int 2457; W16.of_int 349; W16.of_int 418; W16.of_int 329; +W16.of_int 3173; W16.of_int 3254; W16.of_int 817; W16.of_int 1097; +W16.of_int 603; W16.of_int 610; W16.of_int 1322; W16.of_int 2044; +W16.of_int 1864; W16.of_int 384; W16.of_int 2114; W16.of_int 3193; +W16.of_int 1218; W16.of_int 1994; W16.of_int 2455; W16.of_int 220; +W16.of_int 2142; W16.of_int 1670; W16.of_int 2144; W16.of_int 1799; +W16.of_int 2051; W16.of_int 794; W16.of_int 1819; W16.of_int 2475; +W16.of_int 2459; W16.of_int 478; W16.of_int 3221; W16.of_int 3021; +W16.of_int 996; W16.of_int 991; W16.of_int 958; W16.of_int 1869; +W16.of_int 1522; W16.of_int 1628]. + + +module type Syscall_t = { + proc randombytes_32(_:W8.t Array32.t) : W8.t Array32.t + proc randombytes_64(_:W8.t Array64.t) : W8.t Array64.t +}. + +module Syscall : Syscall_t = { + proc randombytes_32(a:W8.t Array32.t) : W8.t Array32.t = { + a <$ dmap WArray32.darray + (fun a => Array32.init (fun i => WArray32.get8 a i)); + return a; + } + + proc randombytes_64(a:W8.t Array64.t) : W8.t Array64.t = { + a <$ dmap WArray64.darray + (fun a => Array64.init (fun i => WArray64.get8 a i)); + return a; + } +}. 
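+(* Reading guide for the extracted helpers below: __shuffle8/4/2/1 form the
+   log-step interleaving network used by _nttunpack to convert a polynomial
+   between the packed AVX2 register layout and the canonical coefficient
+   order; __csubq is a constant-time conditional subtraction of q; __red16x
+   is a 16-lane Barrett reduction (v = 20159, combined shift 16 + 10 = 26);
+   and __fqmulprecomp16x/__fqmulx16 are 16-lane signed Montgomery products:
+   with R = 2^16, rlo = a*b*qinv mod R and
+   rd = mulhi(a,b) - mulhi(rlo, q) = a*b*R^-1 (mod q).
+   The __second_*/.../__sixth_* procedures are one fully unrolled 4-way
+   Keccak-f round, one proc per output plane, the even/odd variants serving
+   the two rounds of a double-round step over the a_4x/e_4x state arrays. *)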
+ +module M(SC:Syscall_t) = { + proc __shuffle8 (a:W256.t, b:W256.t) : W256.t * W256.t = { + + var r0:W256.t; + var r1:W256.t; + + r0 <- VPERM2I128 a b (W8.of_int 32); + r1 <- VPERM2I128 a b (W8.of_int 49); + return (r0, r1); + } + + proc __shuffle4 (a:W256.t, b:W256.t) : W256.t * W256.t = { + + var r0:W256.t; + var r1:W256.t; + + r0 <- VPUNPCKL_4u64 a b; + r1 <- VPUNPCKH_4u64 a b; + return (r0, r1); + } + + proc __shuffle2 (a:W256.t, b:W256.t) : W256.t * W256.t = { + + var t0:W256.t; + var t1:W256.t; + + t0 <- VMOVSLDUP_256 b; + t0 <- VPBLEND_8u32 a t0 (W8.of_int 170); + a <- VPSRL_4u64 a (W8.of_int 32); + t1 <- VPBLEND_8u32 a b (W8.of_int 170); + return (t0, t1); + } + + proc __shuffle1 (a:W256.t, b:W256.t) : W256.t * W256.t = { + + var r0:W256.t; + var r1:W256.t; + var t0:W256.t; + var t1:W256.t; + + t0 <- VPSLL_8u32 b (W8.of_int 16); + r0 <- VPBLEND_16u16 a t0 (W8.of_int 170); + t1 <- VPSRL_8u32 a (W8.of_int 16); + r1 <- VPBLEND_16u16 t1 b (W8.of_int 170); + return (r0, r1); + } + + proc __nttunpack128 (r0:W256.t, r1:W256.t, r2:W256.t, r3:W256.t, r4:W256.t, + r5:W256.t, r6:W256.t, r7:W256.t) : W256.t * W256.t * + W256.t * W256.t * + W256.t * W256.t * + W256.t * W256.t = { + + + + (r0, r4) <@ __shuffle8 (r0, r4); + (r1, r5) <@ __shuffle8 (r1, r5); + (r2, r6) <@ __shuffle8 (r2, r6); + (r3, r7) <@ __shuffle8 (r3, r7); + (r0, r2) <@ __shuffle4 (r0, r2); + (r4, r6) <@ __shuffle4 (r4, r6); + (r1, r3) <@ __shuffle4 (r1, r3); + (r5, r7) <@ __shuffle4 (r5, r7); + (r0, r1) <@ __shuffle2 (r0, r1); + (r2, r3) <@ __shuffle2 (r2, r3); + (r4, r5) <@ __shuffle2 (r4, r5); + (r6, r7) <@ __shuffle2 (r6, r7); + (r0, r4) <@ __shuffle1 (r0, r4); + (r1, r5) <@ __shuffle1 (r1, r5); + (r2, r6) <@ __shuffle1 (r2, r6); + (r3, r7) <@ __shuffle1 (r3, r7); + return (r0, r4, r1, r5, r2, r6, r3, r7); + } + + proc _nttunpack (rp:W16.t Array256.t) : W16.t Array256.t = { + + var r0:W256.t; + var r1:W256.t; + var r2:W256.t; + var r3:W256.t; + var r4:W256.t; + var r5:W256.t; + var r6:W256.t; + var r7:W256.t; + + r0 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 0)); + r1 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 1)); + r2 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 2)); + r3 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 3)); + r4 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 4)); + r5 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 5)); + r6 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 6)); + r7 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 7)); + (r0, r1, r2, r3, r4, r5, r6, r7) <@ __nttunpack128 (r0, r1, r2, r3, r4, + r5, r6, r7); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 0) (r0))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 1) (r1))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 2) (r2))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 3) (r3))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 4) (r4))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 5) (r5))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 6) (r6))); 
+ rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 7) (r7))); + r0 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 8)); + r1 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 9)); + r2 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 10)); + r3 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 11)); + r4 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 12)); + r5 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 13)); + r6 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 14)); + r7 <- (get256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 15)); + (r0, r1, r2, r3, r4, r5, r6, r7) <@ __nttunpack128 (r0, r1, r2, r3, r4, + r5, r6, r7); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 8) (r0))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 9) (r1))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 10) (r2))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 11) (r3))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 12) (r4))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 13) (r5))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 14) (r6))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 15) (r7))); + return (rp); + } + + proc __csubq (r:W256.t, qx16:W256.t) : W256.t = { + + var t:W256.t; + + r <- VPSUB_16u16 r qx16; + t <- VPSRA_16u16 r (W8.of_int 15); + t <- VPAND_256 t qx16; + r <- VPADD_16u16 t r; + return (r); + } + + proc __red16x (r:W256.t, qx16:W256.t, vx16:W256.t) : W256.t = { + + var x:W256.t; + + x <- VPMULH_16u16 r vx16; + x <- VPSRA_16u16 x (W8.of_int 10); + x <- VPMULL_16u16 x qx16; + r <- VPSUB_16u16 r x; + return (r); + } + + proc __fqmulprecomp16x (b:W256.t, al:W256.t, ah:W256.t, qx16:W256.t) : + W256.t = { + + var x:W256.t; + + x <- VPMULL_16u16 al b; + b <- VPMULH_16u16 ah b; + x <- VPMULH_16u16 x qx16; + b <- VPSUB_16u16 b x; + return (b); + } + + proc __fqmulx16 (a:W256.t, b:W256.t, qx16:W256.t, qinvx16:W256.t) : + W256.t = { + + var rd:W256.t; + var rhi:W256.t; + var rlo:W256.t; + + rhi <- VPMULH_16u16 a b; + rlo <- VPMULL_16u16 a b; + rlo <- VPMULL_16u16 rlo qinvx16; + rlo <- VPMULH_16u16 rlo qx16; + rd <- VPSUB_16u16 rhi rlo; + return (rd); + } + + proc __index (x:int, y:int) : int = { + + var r:int; + + r <- ((x %% 5) + (5 * (y %% 5))); + return (r); + } + + proc __keccak_rho_offsets (i:int) : int = { + var aux: int; + + var r:int; + var x:int; + var y:int; + var t:int; + var z:int; + + r <- 0; + x <- 1; + y <- 0; + t <- 0; + while (t < 24) { + if ((i = (x + (5 * y)))) { + r <- ((((t + 1) * (t + 2)) %/ 2) %% 64); + } else { + + } + z <- (((2 * x) + (3 * y)) %% 5); + x <- y; + y <- z; + t <- t + 1; + } + return (r); + } + + proc __rhotates (x:int, y:int) : int = { + + var r:int; + var i:int; + + i <@ __index (x, y); + r <@ __keccak_rho_offsets (i); + return (r); + } + + proc __theta_sum_scalar (a:W64.t Array25.t) : W64.t Array5.t = { + var aux: int; + + var c:W64.t Array5.t; + var i:int; + var 
ti:int; + var j:int; + c <- witness; + i <- 0; + while (i < 5) { + ti <@ __index (i, 0); + c.[i] <- a.[ti]; + i <- i + 1; + } + j <- 1; + while (j < 5) { + i <- 0; + while (i < 5) { + ti <@ __index (i, j); + c.[i] <- (c.[i] `^` a.[ti]); + i <- i + 1; + } + j <- j + 1; + } + return (c); + } + + proc __theta_rol_scalar (c:W64.t Array5.t) : W64.t Array5.t = { + var aux_1: bool; + var aux_0: bool; + var aux: int; + var aux_2: W64.t; + + var d:W64.t Array5.t; + var i:int; + var _0:bool; + var _1:bool; + d <- witness; + i <- 0; + while (i < 5) { + d.[i] <- c.[((i + 1) %% 5)]; + (aux_1, aux_0, aux_2) <- ROL_64 d.[i] (W8.of_int 1); + _0 <- aux_1; + _1 <- aux_0; + d.[i] <- aux_2; + d.[i] <- (d.[i] `^` c.[((i + 4) %% 5)]); + i <- i + 1; + } + return (d); + } + + proc __rol_sum_scalar (d:W64.t Array5.t, a:W64.t Array25.t, offset:int) : + W64.t Array5.t = { + var aux_1: bool; + var aux_0: bool; + var aux: int; + var aux_2: W64.t; + + var c:W64.t Array5.t; + var j:int; + var j1:int; + var k:int; + var ti:int; + var _0:bool; + var _1:bool; + c <- witness; + j <- 0; + while (j < 5) { + j1 <- ((j + offset) %% 5); + k <@ __rhotates (j1, j); + ti <@ __index (j1, j); + c.[j] <- a.[ti]; + c.[j] <- (c.[j] `^` d.[j1]); + (aux_1, aux_0, aux_2) <- ROL_64 c.[j] (W8.of_int k); + _0 <- aux_1; + _1 <- aux_0; + c.[j] <- aux_2; + j <- j + 1; + } + return (c); + } + + proc __set_row_scalar (r:W64.t Array25.t, row:int, c:W64.t Array5.t, + iota_0:W64.t) : W64.t Array25.t = { + var aux: int; + + var j:int; + var j1:int; + var j2:int; + var t:W64.t; + var ti:int; + + j <- 0; + while (j < 5) { + j1 <- ((j + 1) %% 5); + j2 <- ((j + 2) %% 5); + t <- ((invw c.[j1]) `&` c.[j2]); + if (((row = 0) /\ (j = 0))) { + t <- (t `^` iota_0); + } else { + + } + t <- (t `^` c.[j]); + ti <@ __index (j, row); + r.[ti] <- t; + j <- j + 1; + } + return (r); + } + + proc __round2x_scalar (a:W64.t Array25.t, r:W64.t Array25.t, iota_0:W64.t) : + W64.t Array25.t * W64.t Array25.t = { + + var c:W64.t Array5.t; + var d:W64.t Array5.t; + c <- witness; + d <- witness; + c <@ __theta_sum_scalar (a); + d <@ __theta_rol_scalar (c); + c <@ __rol_sum_scalar (d, a, 0); + r <@ __set_row_scalar (r, 0, c, iota_0); + c <@ __rol_sum_scalar (d, a, 3); + r <@ __set_row_scalar (r, 1, c, iota_0); + c <@ __rol_sum_scalar (d, a, 1); + r <@ __set_row_scalar (r, 2, c, iota_0); + c <@ __rol_sum_scalar (d, a, 4); + r <@ __set_row_scalar (r, 3, c, iota_0); + c <@ __rol_sum_scalar (d, a, 2); + r <@ __set_row_scalar (r, 4, c, iota_0); + return (a, r); + } + + proc _keccakf1600_scalar (a:W64.t Array25.t) : W64.t Array25.t = { + + var iotas_p:W64.t Array24.t; + var round:W64.t; + var iota_0:W64.t; + var round_s:W64.t; + var r:W64.t Array25.t; + iotas_p <- witness; + r <- witness; + iotas_p <- KECCAK_RC; + round <- (W64.of_int 0); + + while ((round \ult (W64.of_int 24))) { + iota_0 <- iotas_p.[(W64.to_uint round)]; + round_s <- round; + (a, r) <@ __round2x_scalar (a, r, iota_0); + round <- round_s; + round <- (round + (W64.of_int 1)); + iota_0 <- iotas_p.[(W64.to_uint round)]; + round_s <- round; + (r, a) <@ __round2x_scalar (r, a, iotas_p.[(W64.to_uint round)]); + round <- round_s; + round <- (round + (W64.of_int 1)); + } + return (a); + } + + proc __st0 (state:W64.t Array25.t) : W64.t Array25.t = { + var aux: int; + + var i:int; + + i <- 0; + while (i < 25) { + state.[i] <- (W64.of_int 0); + i <- i + 1; + } + return (state); + } + + proc __add_full_block (state:W64.t Array25.t, in_0:W64.t, inlen:W64.t, + r8:W64.t) : W64.t Array25.t * W64.t * W64.t = { + + var r64:W64.t; + var 
i:W64.t; + var t:W64.t; + + r64 <- r8; + r64 <- (r64 `>>` (W8.of_int 3)); + i <- (W64.of_int 0); + + while ((i \ult r64)) { + t <- (loadW64 Glob.mem (W64.to_uint (in_0 + ((W64.of_int 8) * i)))); + state.[(W64.to_uint i)] <- (state.[(W64.to_uint i)] `^` t); + i <- (i + (W64.of_int 1)); + } + in_0 <- (in_0 + r8); + inlen <- (inlen - r8); + return (state, in_0, inlen); + } + + proc __add_final_block (state:W64.t Array25.t, in_0:W64.t, inlen:W64.t, + trail_byte:W8.t, r8:W64.t) : W64.t Array25.t = { + + var inlen8:W64.t; + var i:W64.t; + var t:W64.t; + var c:W8.t; + + inlen8 <- inlen; + inlen8 <- (inlen8 `>>` (W8.of_int 3)); + i <- (W64.of_int 0); + + while ((i \ult inlen8)) { + t <- (loadW64 Glob.mem (W64.to_uint (in_0 + ((W64.of_int 8) * i)))); + state.[(W64.to_uint i)] <- (state.[(W64.to_uint i)] `^` t); + i <- (i + (W64.of_int 1)); + } + i <- (i `<<` (W8.of_int 3)); + + while ((i \ult inlen)) { + c <- (loadW8 Glob.mem (W64.to_uint (in_0 + i))); + state <- + Array25.init + (WArray200.get64 (WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) (W64.to_uint i) (( + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) (W64.to_uint i)) `^` c)))); + i <- (i + (W64.of_int 1)); + } + state <- + Array25.init + (WArray200.get64 (WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) (W64.to_uint i) (( + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) (W64.to_uint i)) `^` trail_byte)))); + i <- r8; + i <- (i - (W64.of_int 1)); + state <- + Array25.init + (WArray200.get64 (WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) (W64.to_uint i) (( + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) (W64.to_uint i)) `^` (W8.of_int 128))))); + return (state); + } + + proc _isha3_256 (out:W8.t Array32.t, in_0:W64.t, inlen:W64.t) : W8.t Array32.t = { + var aux: int; + + var s_out:W8.t Array32.t; + var state:W64.t Array25.t; + var r8:W64.t; + var ilen:W64.t; + var s_in:W64.t; + var s_ilen:W64.t; + var s_r8:W64.t; + var t8:W8.t; + var i:int; + var t64:W64.t; + s_out <- witness; + state <- witness; + s_out <- out; + state <@ __st0 (state); + r8 <- (W64.of_int 136); + ilen <- inlen; + + while ((r8 \ule ilen)) { + (state, in_0, ilen) <@ __add_full_block (state, in_0, ilen, r8); + s_in <- in_0; + s_ilen <- ilen; + s_r8 <- r8; + state <@ _keccakf1600_scalar (state); + in_0 <- s_in; + ilen <- s_ilen; + r8 <- s_r8; + } + t8 <- (W8.of_int 6); + state <@ __add_final_block (state, in_0, ilen, t8, r8); + state <@ _keccakf1600_scalar (state); + out <- s_out; + i <- 0; + while (i < 4) { + t64 <- state.[i]; + out <- + Array32.init + (WArray32.get8 (WArray32.set64 (WArray32.init8 (fun i_0 => (out).[i_0])) i (t64))); + i <- i + 1; + } + return (out); + } + + proc _shake256_64 (out:W64.t, outlen:W64.t, in_0:W8.t Array64.t) : unit = { + var aux: int; + + var s_out:W64.t; + var s_outlen:W64.t; + var state:W64.t Array25.t; + var i:int; + var t64:W64.t; + var j:W64.t; + var c:W8.t; + state <- witness; + s_out <- out; + s_outlen <- outlen; + state <@ __st0 (state); + i <- 0; + while (i < 8) { + t64 <- (get64 (WArray64.init8 (fun i_0 => (in_0).[i_0])) i); + state.[i] <- (state.[i] `^` t64); + i <- i + 1; + } + state <- + Array25.init + (WArray200.get64 (WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) 64 (( + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) 64) `^` (W8.of_int 31))))); + state <- + Array25.init + (WArray200.get64 (WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) (136 - 1) (( + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) (136 - 1)) `^` (W8.of_int 128))))); + 
state <@ _keccakf1600_scalar (state); + outlen <- s_outlen; + out <- s_out; + + while (((W64.of_int 136) \ult outlen)) { + aux <- (136 %/ 8); + i <- 0; + while (i < aux) { + t64 <- state.[i]; + Glob.mem <- + storeW64 Glob.mem (W64.to_uint (out + (W64.of_int (8 * i)))) (t64); + i <- i + 1; + } + out <- (out + (W64.of_int 136)); + outlen <- (outlen - (W64.of_int 136)); + s_out <- out; + s_outlen <- outlen; + state <@ _keccakf1600_scalar (state); + outlen <- s_outlen; + out <- s_out; + } + s_outlen <- outlen; + outlen <- (outlen `>>` (W8.of_int 3)); + j <- (W64.of_int 0); + + while ((j \ult outlen)) { + t64 <- state.[(W64.to_uint j)]; + Glob.mem <- + storeW64 Glob.mem (W64.to_uint (out + ((W64.of_int 8) * j))) (t64); + j <- (j + (W64.of_int 1)); + } + j <- (j `<<` (W8.of_int 3)); + outlen <- s_outlen; + + while ((j \ult outlen)) { + c <- + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) (W64.to_uint j)); + Glob.mem <- storeW8 Glob.mem (W64.to_uint (out + j)) (c); + j <- (j + (W64.of_int 1)); + } + return (); + } + + proc _isha3_256_32 (out:W8.t Array32.t, in_0:W8.t Array32.t) : W8.t Array32.t = { + var aux: int; + + var s_out:W8.t Array32.t; + var state:W64.t Array25.t; + var i:int; + var t64:W64.t; + s_out <- witness; + state <- witness; + s_out <- out; + state <@ __st0 (state); + aux <- (32 %/ 8); + i <- 0; + while (i < aux) { + t64 <- (get64 (WArray32.init8 (fun i_0 => (in_0).[i_0])) i); + state.[i] <- t64; + i <- i + 1; + } + state <- + Array25.init + (WArray200.get64 (WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) 32 (( + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) 32) `^` (W8.of_int 6))))); + state <- + Array25.init + (WArray200.get64 (WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) (136 - 1) ((W8.of_int 128)))); + state <@ _keccakf1600_scalar (state); + out <- s_out; + i <- 0; + while (i < 4) { + t64 <- state.[i]; + out <- + Array32.init + (WArray32.get8 (WArray32.set64 (WArray32.init8 (fun i_0 => (out).[i_0])) i (t64))); + i <- i + 1; + } + return (out); + } + + proc _sha3_512_64 (out:W8.t Array64.t, in_0:W8.t Array64.t) : W8.t Array64.t = { + var aux: int; + + var state:W64.t Array25.t; + var i:int; + var t64:W64.t; + var out_s:W8.t Array64.t; + out_s <- witness; + state <- witness; + state <@ __st0 (state); + i <- 0; + while (i < 8) { + t64 <- (get64 (WArray64.init8 (fun i_0 => (in_0).[i_0])) i); + state.[i] <- (state.[i] `^` t64); + i <- i + 1; + } + state <- + Array25.init + (WArray200.get64 (WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) 64 (( + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) 64) `^` (W8.of_int 6))))); + state <- + Array25.init + (WArray200.get64 (WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) (72 - 1) (( + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) (72 - 1)) `^` (W8.of_int 128))))); + out_s <- out; + state <@ _keccakf1600_scalar (state); + out <- out_s; + i <- 0; + while (i < 8) { + t64 <- state.[i]; + out <- + Array64.init + (WArray64.get8 (WArray64.set64 (WArray64.init8 (fun i_0 => (out).[i_0])) i (t64))); + i <- i + 1; + } + return (out); + } + + proc _sha3_512_32 (out:W8.t Array64.t, in_0:W8.t Array32.t) : W8.t Array64.t = { + var aux: int; + + var state:W64.t Array25.t; + var i:int; + var t64:W64.t; + var out_s:W8.t Array64.t; + out_s <- witness; + state <- witness; + state <@ __st0 (state); + i <- 0; + while (i < 4) { + t64 <- (get64 (WArray32.init8 (fun i_0 => (in_0).[i_0])) i); + state.[i] <- (state.[i] `^` t64); + i <- i + 1; + } + state <- + Array25.init + (WArray200.get64 
(WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) 32 (( + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) 32) `^` (W8.of_int 6))))); + state <- + Array25.init + (WArray200.get64 (WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) (72 - 1) (( + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) (72 - 1)) `^` (W8.of_int 128))))); + out_s <- out; + state <@ _keccakf1600_scalar (state); + out <- out_s; + i <- 0; + while (i < 8) { + t64 <- state.[i]; + out <- + Array64.init + (WArray64.get8 (WArray64.set64 (WArray64.init8 (fun i_0 => (out).[i_0])) i (t64))); + i <- i + 1; + } + return (out); + } + + proc _shake128_absorb34 (state:W64.t Array25.t, in_0:W8.t Array34.t) : + W64.t Array25.t = { + var aux: int; + + var i:int; + var t64:W64.t; + var t16:W16.t; + + state <@ __st0 (state); + i <- 0; + while (i < 4) { + t64 <- (get64 (WArray34.init8 (fun i_0 => (in_0).[i_0])) i); + state.[i] <- (state.[i] `^` t64); + i <- i + 1; + } + t16 <- (get16_direct (WArray34.init8 (fun i_0 => (in_0).[i_0])) 32); + state <- + Array25.init + (WArray200.get64 (WArray200.set16 (WArray200.init64 (fun i_0 => (state).[i_0])) 16 (( + (get16 (WArray200.init64 (fun i_0 => (state).[i_0])) 16) `^` t16)))); + state <- + Array25.init + (WArray200.get64 (WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) 34 (( + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) 34) `^` (W8.of_int 31))))); + state <- + Array25.init + (WArray200.get64 (WArray200.set8 (WArray200.init64 (fun i_0 => (state).[i_0])) (168 - 1) (( + (get8 (WArray200.init64 (fun i_0 => (state).[i_0])) (168 - 1)) `^` (W8.of_int 128))))); + return (state); + } + + proc _shake128_squeezeblock (state:W64.t Array25.t, out:W8.t Array168.t) : + W64.t Array25.t * W8.t Array168.t = { + var aux: int; + + var out_s:W8.t Array168.t; + var i:int; + var t:W64.t; + out_s <- witness; + out_s <- out; + state <@ _keccakf1600_scalar (state); + out <- out_s; + aux <- (168 %/ 8); + i <- 0; + while (i < aux) { + t <- state.[i]; + out <- + Array168.init + (WArray168.get8 (WArray168.set64 (WArray168.init8 (fun i_0 => (out).[i_0])) i (t))); + i <- i + 1; + } + return (state, out); + } + + proc __rol_4u64_rho56 (a:W256.t) : W256.t = { + + var r:W256.t; + + r <- VPSHUFB_256 a rho56; + return (r); + } + + proc __rol_4u64_rho8 (a:W256.t) : W256.t = { + + var r:W256.t; + + r <- VPSHUFB_256 a rho8; + return (r); + } + + proc __rol_4u64 (a:W256.t, o:int) : W256.t = { + + var r:W256.t; + var t256:W256.t; + + r <- VPSLL_4u64 a (W8.of_int o); + t256 <- VPSRL_4u64 a (W8.of_int (64 - o)); + r <- (r `|` t256); + return (r); + } + + proc __prepare_theta (a_4x:W256.t Array25.t) : W256.t * W256.t * W256.t * + W256.t * W256.t = { + + var ca:W256.t; + var ce:W256.t; + var ci:W256.t; + var co:W256.t; + var cu:W256.t; + + ca <- a_4x.[20]; + ca <- (ca `^` a_4x.[15]); + ca <- (ca `^` a_4x.[10]); + ca <- (ca `^` a_4x.[5]); + ca <- (ca `^` a_4x.[0]); + ce <- a_4x.[21]; + ce <- (ce `^` a_4x.[16]); + ce <- (ce `^` a_4x.[11]); + ce <- (ce `^` a_4x.[6]); + ce <- (ce `^` a_4x.[1]); + ci <- a_4x.[22]; + ci <- (ci `^` a_4x.[17]); + ci <- (ci `^` a_4x.[12]); + ci <- (ci `^` a_4x.[7]); + ci <- (ci `^` a_4x.[2]); + co <- a_4x.[23]; + co <- (co `^` a_4x.[18]); + co <- (co `^` a_4x.[13]); + co <- (co `^` a_4x.[8]); + co <- (co `^` a_4x.[3]); + cu <- a_4x.[24]; + cu <- (cu `^` a_4x.[19]); + cu <- (cu `^` a_4x.[14]); + cu <- (cu `^` a_4x.[9]); + cu <- (cu `^` a_4x.[4]); + return (ca, ce, ci, co, cu); + } + + proc __first (ca:W256.t, ce:W256.t, ci:W256.t, co:W256.t, cu:W256.t) : + W256.t * 
W256.t * W256.t * W256.t * W256.t = { + + var da:W256.t; + var de:W256.t; + var di:W256.t; + var do_0:W256.t; + var du:W256.t; + var ce1:W256.t; + var ci1:W256.t; + var co1:W256.t; + var cu1:W256.t; + var ca1:W256.t; + + ce1 <@ __rol_4u64 (ce, 1); + da <- (cu `^` ce1); + ci1 <@ __rol_4u64 (ci, 1); + de <- (ca `^` ci1); + co1 <@ __rol_4u64 (co, 1); + di <- (ce `^` co1); + cu1 <@ __rol_4u64 (cu, 1); + do_0 <- (ci `^` cu1); + ca1 <@ __rol_4u64 (ca, 1); + du <- (co `^` ca1); + return (da, de, di, do_0, du); + } + + proc __second_even (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, + index:int, ca:W256.t, ce:W256.t, ci:W256.t, co:W256.t, + cu:W256.t, da:W256.t, de:W256.t, di:W256.t, + do_0:W256.t, du:W256.t) : W256.t Array25.t * + W256.t Array25.t * W256.t * + W256.t * W256.t * W256.t * + W256.t = { + + var t256:W256.t; + var bba:W256.t; + var bbe:W256.t; + var bbi:W256.t; + var bbo:W256.t; + var bbu:W256.t; + + t256 <- a_4x.[0]; + t256 <- (t256 `^` da); + a_4x.[0] <- t256; + bba <- t256; + t256 <- a_4x.[6]; + t256 <- (t256 `^` de); + a_4x.[6] <- t256; + bbe <@ __rol_4u64 (t256, 44); + t256 <- a_4x.[12]; + t256 <- (t256 `^` di); + a_4x.[12] <- t256; + bbi <@ __rol_4u64 (t256, 43); + t256 <- VPANDN_256 bbe bbi; + t256 <- (t256 `^` bba); + t256 <- (t256 `^` KeccakF1600RoundConstants.[index]); + e_4x.[0] <- t256; + ca <- t256; + t256 <- a_4x.[18]; + t256 <- (t256 `^` do_0); + a_4x.[18] <- t256; + bbo <@ __rol_4u64 (t256, 21); + t256 <- VPANDN_256 bbi bbo; + t256 <- (t256 `^` bbe); + e_4x.[1] <- t256; + ce <- t256; + t256 <- a_4x.[24]; + t256 <- (t256 `^` du); + a_4x.[24] <- t256; + bbu <@ __rol_4u64 (t256, 14); + t256 <- VPANDN_256 bbo bbu; + t256 <- (t256 `^` bbi); + e_4x.[2] <- t256; + ci <- t256; + t256 <- VPANDN_256 bbu bba; + t256 <- (t256 `^` bbo); + e_4x.[3] <- t256; + co <- t256; + t256 <- VPANDN_256 bba bbe; + t256 <- (t256 `^` bbu); + e_4x.[4] <- t256; + cu <- t256; + return (a_4x, e_4x, ca, ce, ci, co, cu); + } + + proc __third_even (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, ca:W256.t, + ce:W256.t, ci:W256.t, co:W256.t, cu:W256.t, da:W256.t, + de:W256.t, di:W256.t, do_0:W256.t, du:W256.t) : + W256.t Array25.t * W256.t Array25.t * W256.t * W256.t * W256.t * W256.t * + W256.t = { + + var t256:W256.t; + var bga:W256.t; + var bge:W256.t; + var bgi:W256.t; + var bgo:W256.t; + var bgu:W256.t; + + t256 <- a_4x.[3]; + t256 <- (t256 `^` do_0); + a_4x.[3] <- t256; + bga <@ __rol_4u64 (t256, 28); + t256 <- a_4x.[9]; + t256 <- (t256 `^` du); + a_4x.[9] <- t256; + bge <@ __rol_4u64 (t256, 20); + t256 <- a_4x.[10]; + t256 <- (t256 `^` da); + a_4x.[10] <- t256; + bgi <@ __rol_4u64 (t256, 3); + t256 <- VPANDN_256 bge bgi; + t256 <- (t256 `^` bga); + e_4x.[5] <- t256; + ca <- (ca `^` t256); + t256 <- a_4x.[16]; + t256 <- (t256 `^` de); + a_4x.[16] <- t256; + bgo <@ __rol_4u64 (t256, 45); + t256 <- VPANDN_256 bgi bgo; + t256 <- (t256 `^` bge); + e_4x.[6] <- t256; + ce <- (ce `^` t256); + t256 <- a_4x.[22]; + t256 <- (t256 `^` di); + a_4x.[22] <- t256; + bgu <@ __rol_4u64 (t256, 61); + t256 <- VPANDN_256 bgo bgu; + t256 <- (t256 `^` bgi); + e_4x.[7] <- t256; + ci <- (ci `^` t256); + t256 <- VPANDN_256 bgu bga; + t256 <- (t256 `^` bgo); + e_4x.[8] <- t256; + co <- (co `^` t256); + t256 <- VPANDN_256 bga bge; + t256 <- (t256 `^` bgu); + e_4x.[9] <- t256; + cu <- (cu `^` t256); + return (a_4x, e_4x, ca, ce, ci, co, cu); + } + + proc __fourth_even (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, + ca:W256.t, ce:W256.t, ci:W256.t, co:W256.t, cu:W256.t, + da:W256.t, de:W256.t, di:W256.t, do_0:W256.t, 
du:W256.t) : + W256.t Array25.t * W256.t Array25.t * W256.t * W256.t * W256.t * W256.t * + W256.t = { + + var t256:W256.t; + var bka:W256.t; + var bke:W256.t; + var bki:W256.t; + var bko:W256.t; + var bku:W256.t; + + t256 <- a_4x.[1]; + t256 <- (t256 `^` de); + a_4x.[1] <- t256; + bka <@ __rol_4u64 (t256, 1); + t256 <- a_4x.[7]; + t256 <- (t256 `^` di); + a_4x.[7] <- t256; + bke <@ __rol_4u64 (t256, 6); + t256 <- a_4x.[13]; + t256 <- (t256 `^` do_0); + a_4x.[13] <- t256; + bki <@ __rol_4u64 (t256, 25); + t256 <- VPANDN_256 bke bki; + t256 <- (t256 `^` bka); + e_4x.[10] <- t256; + ca <- (ca `^` t256); + t256 <- a_4x.[19]; + t256 <- (t256 `^` du); + a_4x.[19] <- t256; + bko <@ __rol_4u64_rho8 (t256); + t256 <- VPANDN_256 bki bko; + t256 <- (t256 `^` bke); + e_4x.[11] <- t256; + ce <- (ce `^` t256); + t256 <- a_4x.[20]; + t256 <- (t256 `^` da); + a_4x.[20] <- t256; + bku <@ __rol_4u64 (t256, 18); + t256 <- VPANDN_256 bko bku; + t256 <- (t256 `^` bki); + e_4x.[12] <- t256; + ci <- (ci `^` t256); + t256 <- VPANDN_256 bku bka; + t256 <- (t256 `^` bko); + e_4x.[13] <- t256; + co <- (co `^` t256); + t256 <- VPANDN_256 bka bke; + t256 <- (t256 `^` bku); + e_4x.[14] <- t256; + cu <- (cu `^` t256); + return (a_4x, e_4x, ca, ce, ci, co, cu); + } + + proc __fifth_even (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, ca:W256.t, + ce:W256.t, ci:W256.t, co:W256.t, cu:W256.t, da:W256.t, + de:W256.t, di:W256.t, do_0:W256.t, du:W256.t) : + W256.t Array25.t * W256.t Array25.t * W256.t * W256.t * W256.t * W256.t * + W256.t = { + + var t256:W256.t; + var bma:W256.t; + var bme:W256.t; + var bmi:W256.t; + var bmo:W256.t; + var bmu:W256.t; + + t256 <- a_4x.[4]; + t256 <- (t256 `^` du); + a_4x.[4] <- t256; + bma <@ __rol_4u64 (t256, 27); + t256 <- a_4x.[5]; + t256 <- (t256 `^` da); + a_4x.[5] <- t256; + bme <@ __rol_4u64 (t256, 36); + t256 <- a_4x.[11]; + t256 <- (t256 `^` de); + a_4x.[11] <- t256; + bmi <@ __rol_4u64 (t256, 10); + t256 <- VPANDN_256 bme bmi; + t256 <- (t256 `^` bma); + e_4x.[15] <- t256; + ca <- (ca `^` t256); + t256 <- a_4x.[17]; + t256 <- (t256 `^` di); + a_4x.[17] <- t256; + bmo <@ __rol_4u64 (t256, 15); + t256 <- VPANDN_256 bmi bmo; + t256 <- (t256 `^` bme); + e_4x.[16] <- t256; + ce <- (ce `^` t256); + t256 <- a_4x.[23]; + t256 <- (t256 `^` do_0); + a_4x.[23] <- t256; + bmu <@ __rol_4u64_rho56 (t256); + t256 <- VPANDN_256 bmo bmu; + t256 <- (t256 `^` bmi); + e_4x.[17] <- t256; + ci <- (ci `^` t256); + t256 <- VPANDN_256 bmu bma; + t256 <- (t256 `^` bmo); + e_4x.[18] <- t256; + co <- (co `^` t256); + t256 <- VPANDN_256 bma bme; + t256 <- (t256 `^` bmu); + e_4x.[19] <- t256; + cu <- (cu `^` t256); + return (a_4x, e_4x, ca, ce, ci, co, cu); + } + + proc __sixth_even (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, ca:W256.t, + ce:W256.t, ci:W256.t, co:W256.t, cu:W256.t, da:W256.t, + de:W256.t, di:W256.t, do_0:W256.t, du:W256.t) : + W256.t Array25.t * W256.t Array25.t * W256.t * W256.t * W256.t * W256.t * + W256.t = { + + var t256:W256.t; + var bsa:W256.t; + var bse:W256.t; + var bsi:W256.t; + var bso:W256.t; + var bsu:W256.t; + + t256 <- a_4x.[2]; + t256 <- (t256 `^` di); + a_4x.[2] <- t256; + bsa <@ __rol_4u64 (t256, 62); + t256 <- a_4x.[8]; + t256 <- (t256 `^` do_0); + a_4x.[8] <- t256; + bse <@ __rol_4u64 (t256, 55); + t256 <- a_4x.[14]; + t256 <- (t256 `^` du); + a_4x.[14] <- t256; + bsi <@ __rol_4u64 (t256, 39); + t256 <- VPANDN_256 bse bsi; + t256 <- (t256 `^` bsa); + e_4x.[20] <- t256; + ca <- (ca `^` t256); + t256 <- a_4x.[15]; + t256 <- (t256 `^` da); + a_4x.[15] <- t256; + bso <@ 
__rol_4u64 (t256, 41); + t256 <- VPANDN_256 bsi bso; + t256 <- (t256 `^` bse); + e_4x.[21] <- t256; + ce <- (ce `^` t256); + t256 <- a_4x.[21]; + t256 <- (t256 `^` de); + a_4x.[21] <- t256; + bsu <@ __rol_4u64 (t256, 2); + t256 <- VPANDN_256 bso bsu; + t256 <- (t256 `^` bsi); + e_4x.[22] <- t256; + ci <- (ci `^` t256); + t256 <- VPANDN_256 bsu bsa; + t256 <- (t256 `^` bso); + e_4x.[23] <- t256; + co <- (co `^` t256); + t256 <- VPANDN_256 bsa bse; + t256 <- (t256 `^` bsu); + e_4x.[24] <- t256; + cu <- (cu `^` t256); + return (a_4x, e_4x, ca, ce, ci, co, cu); + } + + proc __second_odd (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, index:int, + ca:W256.t, ce:W256.t, ci:W256.t, co:W256.t, cu:W256.t, + da:W256.t, de:W256.t, di:W256.t, do_0:W256.t, du:W256.t) : + W256.t Array25.t * W256.t Array25.t * W256.t * W256.t * W256.t * W256.t * + W256.t = { + + var t256:W256.t; + var bba:W256.t; + var bbe:W256.t; + var bbi:W256.t; + var bbo:W256.t; + var bbu:W256.t; + + t256 <- a_4x.[0]; + t256 <- (t256 `^` da); + a_4x.[0] <- t256; + bba <- t256; + t256 <- a_4x.[6]; + t256 <- (t256 `^` de); + a_4x.[6] <- t256; + bbe <@ __rol_4u64 (t256, 44); + t256 <- a_4x.[12]; + t256 <- (t256 `^` di); + a_4x.[12] <- t256; + bbi <@ __rol_4u64 (t256, 43); + t256 <- VPANDN_256 bbe bbi; + t256 <- (t256 `^` bba); + t256 <- (t256 `^` KeccakF1600RoundConstants.[index]); + e_4x.[0] <- t256; + ca <- t256; + t256 <- a_4x.[18]; + t256 <- (t256 `^` do_0); + a_4x.[18] <- t256; + bbo <@ __rol_4u64 (t256, 21); + t256 <- VPANDN_256 bbi bbo; + t256 <- (t256 `^` bbe); + e_4x.[1] <- t256; + ce <- t256; + t256 <- a_4x.[24]; + t256 <- (t256 `^` du); + a_4x.[24] <- t256; + bbu <@ __rol_4u64 (t256, 14); + t256 <- VPANDN_256 bbo bbu; + t256 <- (t256 `^` bbi); + e_4x.[2] <- t256; + ci <- t256; + t256 <- VPANDN_256 bbu bba; + t256 <- (t256 `^` bbo); + e_4x.[3] <- t256; + co <- t256; + t256 <- VPANDN_256 bba bbe; + t256 <- (t256 `^` bbu); + e_4x.[4] <- t256; + cu <- t256; + return (a_4x, e_4x, ca, ce, ci, co, cu); + } + + proc __third_odd (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, ca:W256.t, + ce:W256.t, ci:W256.t, co:W256.t, cu:W256.t, da:W256.t, + de:W256.t, di:W256.t, do_0:W256.t, du:W256.t) : W256.t Array25.t * + W256.t Array25.t * + W256.t * + W256.t * + W256.t * + W256.t * + W256.t = { + + var t256:W256.t; + var bga:W256.t; + var bge:W256.t; + var bgi:W256.t; + var bgo:W256.t; + var bgu:W256.t; + + t256 <- a_4x.[3]; + t256 <- (t256 `^` do_0); + a_4x.[3] <- t256; + bga <@ __rol_4u64 (t256, 28); + t256 <- a_4x.[9]; + t256 <- (t256 `^` du); + a_4x.[9] <- t256; + bge <@ __rol_4u64 (t256, 20); + t256 <- a_4x.[10]; + t256 <- (t256 `^` da); + a_4x.[10] <- t256; + bgi <@ __rol_4u64 (t256, 3); + t256 <- VPANDN_256 bge bgi; + t256 <- (t256 `^` bga); + e_4x.[5] <- t256; + ca <- (ca `^` t256); + t256 <- a_4x.[16]; + t256 <- (t256 `^` de); + a_4x.[16] <- t256; + bgo <@ __rol_4u64 (t256, 45); + t256 <- VPANDN_256 bgi bgo; + t256 <- (t256 `^` bge); + e_4x.[6] <- t256; + ce <- (ce `^` t256); + t256 <- a_4x.[22]; + t256 <- (t256 `^` di); + a_4x.[22] <- t256; + bgu <@ __rol_4u64 (t256, 61); + t256 <- VPANDN_256 bgo bgu; + t256 <- (t256 `^` bgi); + e_4x.[7] <- t256; + ci <- (ci `^` t256); + t256 <- VPANDN_256 bgu bga; + t256 <- (t256 `^` bgo); + e_4x.[8] <- t256; + co <- (co `^` t256); + t256 <- VPANDN_256 bga bge; + t256 <- (t256 `^` bgu); + e_4x.[9] <- t256; + cu <- (cu `^` t256); + return (a_4x, e_4x, ca, ce, ci, co, cu); + } + + proc __fourth_odd (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, ca:W256.t, + ce:W256.t, ci:W256.t, co:W256.t, cu:W256.t, 
da:W256.t, + de:W256.t, di:W256.t, do_0:W256.t, du:W256.t) : + W256.t Array25.t * W256.t Array25.t * W256.t * W256.t * W256.t * W256.t * + W256.t = { + + var t256:W256.t; + var bka:W256.t; + var bke:W256.t; + var bki:W256.t; + var bko:W256.t; + var bku:W256.t; + + t256 <- a_4x.[1]; + t256 <- (t256 `^` de); + a_4x.[1] <- t256; + bka <@ __rol_4u64 (t256, 1); + t256 <- a_4x.[7]; + t256 <- (t256 `^` di); + a_4x.[7] <- t256; + bke <@ __rol_4u64 (t256, 6); + t256 <- a_4x.[13]; + t256 <- (t256 `^` do_0); + a_4x.[13] <- t256; + bki <@ __rol_4u64 (t256, 25); + t256 <- VPANDN_256 bke bki; + t256 <- (t256 `^` bka); + e_4x.[10] <- t256; + ca <- (ca `^` t256); + t256 <- a_4x.[19]; + t256 <- (t256 `^` du); + a_4x.[19] <- t256; + bko <@ __rol_4u64_rho8 (t256); + t256 <- VPANDN_256 bki bko; + t256 <- (t256 `^` bke); + e_4x.[11] <- t256; + ce <- (ce `^` t256); + t256 <- a_4x.[20]; + t256 <- (t256 `^` da); + a_4x.[20] <- t256; + bku <@ __rol_4u64 (t256, 18); + t256 <- VPANDN_256 bko bku; + t256 <- (t256 `^` bki); + e_4x.[12] <- t256; + ci <- (ci `^` t256); + t256 <- VPANDN_256 bku bka; + t256 <- (t256 `^` bko); + e_4x.[13] <- t256; + co <- (co `^` t256); + t256 <- VPANDN_256 bka bke; + t256 <- (t256 `^` bku); + e_4x.[14] <- t256; + cu <- (cu `^` t256); + return (a_4x, e_4x, ca, ce, ci, co, cu); + } + + proc __fifth_odd (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, ca:W256.t, + ce:W256.t, ci:W256.t, co:W256.t, cu:W256.t, da:W256.t, + de:W256.t, di:W256.t, do_0:W256.t, du:W256.t) : W256.t Array25.t * + W256.t Array25.t * + W256.t * + W256.t * + W256.t * + W256.t * + W256.t = { + + var t256:W256.t; + var bma:W256.t; + var bme:W256.t; + var bmi:W256.t; + var bmo:W256.t; + var bmu:W256.t; + + t256 <- a_4x.[4]; + t256 <- (t256 `^` du); + a_4x.[4] <- t256; + bma <@ __rol_4u64 (t256, 27); + t256 <- a_4x.[5]; + t256 <- (t256 `^` da); + a_4x.[5] <- t256; + bme <@ __rol_4u64 (t256, 36); + t256 <- a_4x.[11]; + t256 <- (t256 `^` de); + a_4x.[11] <- t256; + bmi <@ __rol_4u64 (t256, 10); + t256 <- VPANDN_256 bme bmi; + t256 <- (t256 `^` bma); + e_4x.[15] <- t256; + ca <- (ca `^` t256); + t256 <- a_4x.[17]; + t256 <- (t256 `^` di); + a_4x.[17] <- t256; + bmo <@ __rol_4u64 (t256, 15); + t256 <- VPANDN_256 bmi bmo; + t256 <- (t256 `^` bme); + e_4x.[16] <- t256; + ce <- (ce `^` t256); + t256 <- a_4x.[23]; + t256 <- (t256 `^` do_0); + a_4x.[23] <- t256; + bmu <@ __rol_4u64_rho56 (t256); + t256 <- VPANDN_256 bmo bmu; + t256 <- (t256 `^` bmi); + e_4x.[17] <- t256; + ci <- (ci `^` t256); + t256 <- VPANDN_256 bmu bma; + t256 <- (t256 `^` bmo); + e_4x.[18] <- t256; + co <- (co `^` t256); + t256 <- VPANDN_256 bma bme; + t256 <- (t256 `^` bmu); + e_4x.[19] <- t256; + cu <- (cu `^` t256); + return (a_4x, e_4x, ca, ce, ci, co, cu); + } + + proc __sixth_odd (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, ca:W256.t, + ce:W256.t, ci:W256.t, co:W256.t, cu:W256.t, da:W256.t, + de:W256.t, di:W256.t, do_0:W256.t, du:W256.t) : W256.t Array25.t * + W256.t Array25.t * + W256.t * + W256.t * + W256.t * + W256.t * + W256.t = { + + var t256:W256.t; + var bsa:W256.t; + var bse:W256.t; + var bsi:W256.t; + var bso:W256.t; + var bsu:W256.t; + + t256 <- a_4x.[2]; + t256 <- (t256 `^` di); + a_4x.[2] <- t256; + bsa <@ __rol_4u64 (t256, 62); + t256 <- a_4x.[8]; + t256 <- (t256 `^` do_0); + a_4x.[8] <- t256; + bse <@ __rol_4u64 (t256, 55); + t256 <- a_4x.[14]; + t256 <- (t256 `^` du); + a_4x.[14] <- t256; + bsi <@ __rol_4u64 (t256, 39); + t256 <- VPANDN_256 bse bsi; + t256 <- (t256 `^` bsa); + e_4x.[20] <- t256; + ca <- (ca `^` t256); + t256 <- a_4x.[15]; + 
t256 <- (t256 `^` da); + a_4x.[15] <- t256; + bso <@ __rol_4u64 (t256, 41); + t256 <- VPANDN_256 bsi bso; + t256 <- (t256 `^` bse); + e_4x.[21] <- t256; + ce <- (ce `^` t256); + t256 <- a_4x.[21]; + t256 <- (t256 `^` de); + a_4x.[21] <- t256; + bsu <@ __rol_4u64 (t256, 2); + t256 <- VPANDN_256 bso bsu; + t256 <- (t256 `^` bsi); + e_4x.[22] <- t256; + ci <- (ci `^` t256); + t256 <- VPANDN_256 bsu bsa; + t256 <- (t256 `^` bso); + e_4x.[23] <- t256; + co <- (co `^` t256); + t256 <- VPANDN_256 bsa bse; + t256 <- (t256 `^` bsu); + e_4x.[24] <- t256; + cu <- (cu `^` t256); + return (a_4x, e_4x, ca, ce, ci, co, cu); + } + + proc __second_last (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, + index:int, da:W256.t, de:W256.t, di:W256.t, + do_0:W256.t, du:W256.t) : W256.t Array25.t * + W256.t Array25.t = { + + var t256:W256.t; + var bba:W256.t; + var bbe:W256.t; + var bbi:W256.t; + var bbo:W256.t; + var bbu:W256.t; + + t256 <- a_4x.[0]; + t256 <- (t256 `^` da); + a_4x.[0] <- t256; + bba <- t256; + t256 <- a_4x.[6]; + t256 <- (t256 `^` de); + a_4x.[6] <- t256; + bbe <@ __rol_4u64 (t256, 44); + t256 <- a_4x.[12]; + t256 <- (t256 `^` di); + a_4x.[12] <- t256; + bbi <@ __rol_4u64 (t256, 43); + t256 <- VPANDN_256 bbe bbi; + t256 <- (t256 `^` bba); + t256 <- (t256 `^` KeccakF1600RoundConstants.[index]); + e_4x.[0] <- t256; + t256 <- a_4x.[18]; + t256 <- (t256 `^` do_0); + a_4x.[18] <- t256; + bbo <@ __rol_4u64 (t256, 21); + t256 <- VPANDN_256 bbi bbo; + t256 <- (t256 `^` bbe); + e_4x.[1] <- t256; + t256 <- a_4x.[24]; + t256 <- (t256 `^` du); + a_4x.[24] <- t256; + bbu <@ __rol_4u64 (t256, 14); + t256 <- VPANDN_256 bbo bbu; + t256 <- (t256 `^` bbi); + e_4x.[2] <- t256; + t256 <- VPANDN_256 bbu bba; + t256 <- (t256 `^` bbo); + e_4x.[3] <- t256; + t256 <- VPANDN_256 bba bbe; + t256 <- (t256 `^` bbu); + e_4x.[4] <- t256; + return (a_4x, e_4x); + } + + proc __third_last (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, da:W256.t, + de:W256.t, di:W256.t, do_0:W256.t, du:W256.t) : + W256.t Array25.t * W256.t Array25.t = { + + var t256:W256.t; + var bga:W256.t; + var bge:W256.t; + var bgi:W256.t; + var bgo:W256.t; + var bgu:W256.t; + + t256 <- a_4x.[3]; + t256 <- (t256 `^` do_0); + a_4x.[3] <- t256; + bga <@ __rol_4u64 (t256, 28); + t256 <- a_4x.[9]; + t256 <- (t256 `^` du); + a_4x.[9] <- t256; + bge <@ __rol_4u64 (t256, 20); + t256 <- a_4x.[10]; + t256 <- (t256 `^` da); + a_4x.[10] <- t256; + bgi <@ __rol_4u64 (t256, 3); + t256 <- VPANDN_256 bge bgi; + t256 <- (t256 `^` bga); + e_4x.[5] <- t256; + t256 <- a_4x.[16]; + t256 <- (t256 `^` de); + a_4x.[16] <- t256; + bgo <@ __rol_4u64 (t256, 45); + t256 <- VPANDN_256 bgi bgo; + t256 <- (t256 `^` bge); + e_4x.[6] <- t256; + t256 <- a_4x.[22]; + t256 <- (t256 `^` di); + a_4x.[22] <- t256; + bgu <@ __rol_4u64 (t256, 61); + t256 <- VPANDN_256 bgo bgu; + t256 <- (t256 `^` bgi); + e_4x.[7] <- t256; + t256 <- VPANDN_256 bgu bga; + t256 <- (t256 `^` bgo); + e_4x.[8] <- t256; + t256 <- VPANDN_256 bga bge; + t256 <- (t256 `^` bgu); + e_4x.[9] <- t256; + return (a_4x, e_4x); + } + + proc __fourth_last (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, + da:W256.t, de:W256.t, di:W256.t, do_0:W256.t, du:W256.t) : + W256.t Array25.t * W256.t Array25.t = { + + var t256:W256.t; + var bka:W256.t; + var bke:W256.t; + var bki:W256.t; + var bko:W256.t; + var bku:W256.t; + + t256 <- a_4x.[1]; + t256 <- (t256 `^` de); + a_4x.[1] <- t256; + bka <@ __rol_4u64 (t256, 1); + t256 <- a_4x.[7]; + t256 <- (t256 `^` di); + a_4x.[7] <- t256; + bke <@ __rol_4u64 (t256, 6); + t256 <- a_4x.[13]; + t256 <- 
(t256 `^` do_0); + a_4x.[13] <- t256; + bki <@ __rol_4u64 (t256, 25); + t256 <- VPANDN_256 bke bki; + t256 <- (t256 `^` bka); + e_4x.[10] <- t256; + t256 <- a_4x.[19]; + t256 <- (t256 `^` du); + a_4x.[19] <- t256; + bko <@ __rol_4u64_rho8 (t256); + t256 <- VPANDN_256 bki bko; + t256 <- (t256 `^` bke); + e_4x.[11] <- t256; + t256 <- a_4x.[20]; + t256 <- (t256 `^` da); + a_4x.[20] <- t256; + bku <@ __rol_4u64 (t256, 18); + t256 <- VPANDN_256 bko bku; + t256 <- (t256 `^` bki); + e_4x.[12] <- t256; + t256 <- VPANDN_256 bku bka; + t256 <- (t256 `^` bko); + e_4x.[13] <- t256; + t256 <- VPANDN_256 bka bke; + t256 <- (t256 `^` bku); + e_4x.[14] <- t256; + return (a_4x, e_4x); + } + + proc __fifth_last (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, da:W256.t, + de:W256.t, di:W256.t, do_0:W256.t, du:W256.t) : + W256.t Array25.t * W256.t Array25.t = { + + var t256:W256.t; + var bma:W256.t; + var bme:W256.t; + var bmi:W256.t; + var bmo:W256.t; + var bmu:W256.t; + + t256 <- a_4x.[4]; + t256 <- (t256 `^` du); + a_4x.[4] <- t256; + bma <@ __rol_4u64 (t256, 27); + t256 <- a_4x.[5]; + t256 <- (t256 `^` da); + a_4x.[5] <- t256; + bme <@ __rol_4u64 (t256, 36); + t256 <- a_4x.[11]; + t256 <- (t256 `^` de); + a_4x.[11] <- t256; + bmi <@ __rol_4u64 (t256, 10); + t256 <- VPANDN_256 bme bmi; + t256 <- (t256 `^` bma); + e_4x.[15] <- t256; + t256 <- a_4x.[17]; + t256 <- (t256 `^` di); + a_4x.[17] <- t256; + bmo <@ __rol_4u64 (t256, 15); + t256 <- VPANDN_256 bmi bmo; + t256 <- (t256 `^` bme); + e_4x.[16] <- t256; + t256 <- a_4x.[23]; + t256 <- (t256 `^` do_0); + a_4x.[23] <- t256; + bmu <@ __rol_4u64_rho56 (t256); + t256 <- VPANDN_256 bmo bmu; + t256 <- (t256 `^` bmi); + e_4x.[17] <- t256; + t256 <- VPANDN_256 bmu bma; + t256 <- (t256 `^` bmo); + e_4x.[18] <- t256; + t256 <- VPANDN_256 bma bme; + t256 <- (t256 `^` bmu); + e_4x.[19] <- t256; + return (a_4x, e_4x); + } + + proc __sixth_last (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, da:W256.t, + de:W256.t, di:W256.t, do_0:W256.t, du:W256.t) : + W256.t Array25.t * W256.t Array25.t = { + + var t256:W256.t; + var bsa:W256.t; + var bse:W256.t; + var bsi:W256.t; + var bso:W256.t; + var bsu:W256.t; + + t256 <- a_4x.[2]; + t256 <- (t256 `^` di); + a_4x.[2] <- t256; + bsa <@ __rol_4u64 (t256, 62); + t256 <- a_4x.[8]; + t256 <- (t256 `^` do_0); + a_4x.[8] <- t256; + bse <@ __rol_4u64 (t256, 55); + t256 <- a_4x.[14]; + t256 <- (t256 `^` du); + a_4x.[14] <- t256; + bsi <@ __rol_4u64 (t256, 39); + t256 <- VPANDN_256 bse bsi; + t256 <- (t256 `^` bsa); + e_4x.[20] <- t256; + t256 <- a_4x.[15]; + t256 <- (t256 `^` da); + a_4x.[15] <- t256; + bso <@ __rol_4u64 (t256, 41); + t256 <- VPANDN_256 bsi bso; + t256 <- (t256 `^` bse); + e_4x.[21] <- t256; + t256 <- a_4x.[21]; + t256 <- (t256 `^` de); + a_4x.[21] <- t256; + bsu <@ __rol_4u64 (t256, 2); + t256 <- VPANDN_256 bso bsu; + t256 <- (t256 `^` bsi); + e_4x.[22] <- t256; + t256 <- VPANDN_256 bsu bsa; + t256 <- (t256 `^` bso); + e_4x.[23] <- t256; + t256 <- VPANDN_256 bsa bse; + t256 <- (t256 `^` bsu); + e_4x.[24] <- t256; + return (a_4x, e_4x); + } + + proc __theta_rho_pi_chi_iota_prepare_theta_even (a_4x:W256.t Array25.t, + e_4x:W256.t Array25.t, + index:int, ca:W256.t, + ce:W256.t, ci:W256.t, + co:W256.t, cu:W256.t) : + W256.t Array25.t * W256.t Array25.t * W256.t * W256.t * W256.t * W256.t * + W256.t = { + + var da:W256.t; + var de:W256.t; + var di:W256.t; + var do_0:W256.t; + var du:W256.t; + + (da, de, di, do_0, du) <@ __first (ca, ce, ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, cu) <@ __second_even (a_4x, e_4x, index, ca, + 
ce, ci, co, cu, da, de, di, do_0, du); + (a_4x, e_4x, ca, ce, ci, co, cu) <@ __third_even (a_4x, e_4x, ca, ce, ci, + co, cu, da, de, di, do_0, du); + (a_4x, e_4x, ca, ce, ci, co, cu) <@ __fourth_even (a_4x, e_4x, ca, ce, + ci, co, cu, da, de, di, do_0, du); + (a_4x, e_4x, ca, ce, ci, co, cu) <@ __fifth_even (a_4x, e_4x, ca, ce, ci, + co, cu, da, de, di, do_0, du); + (a_4x, e_4x, ca, ce, ci, co, cu) <@ __sixth_even (a_4x, e_4x, ca, ce, ci, + co, cu, da, de, di, do_0, du); + return (a_4x, e_4x, ca, ce, ci, co, cu); + } + + proc __theta_rho_pi_chi_iota_prepare_theta_odd (a_4x:W256.t Array25.t, + e_4x:W256.t Array25.t, + index:int, ca:W256.t, + ce:W256.t, ci:W256.t, + co:W256.t, cu:W256.t) : + W256.t Array25.t * W256.t Array25.t * W256.t * W256.t * W256.t * W256.t * + W256.t = { + + var da:W256.t; + var de:W256.t; + var di:W256.t; + var do_0:W256.t; + var du:W256.t; + + (da, de, di, do_0, du) <@ __first (ca, ce, ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, cu) <@ __second_odd (a_4x, e_4x, index, ca, + ce, ci, co, cu, da, de, di, do_0, du); + (a_4x, e_4x, ca, ce, ci, co, cu) <@ __third_odd (a_4x, e_4x, ca, ce, ci, + co, cu, da, de, di, do_0, du); + (a_4x, e_4x, ca, ce, ci, co, cu) <@ __fourth_odd (a_4x, e_4x, ca, ce, ci, + co, cu, da, de, di, do_0, du); + (a_4x, e_4x, ca, ce, ci, co, cu) <@ __fifth_odd (a_4x, e_4x, ca, ce, ci, + co, cu, da, de, di, do_0, du); + (a_4x, e_4x, ca, ce, ci, co, cu) <@ __sixth_odd (a_4x, e_4x, ca, ce, ci, + co, cu, da, de, di, do_0, du); + return (a_4x, e_4x, ca, ce, ci, co, cu); + } + + proc __theta_rho_pi_chi_iota (a_4x:W256.t Array25.t, e_4x:W256.t Array25.t, + index:int, ca:W256.t, ce:W256.t, ci:W256.t, + co:W256.t, cu:W256.t) : W256.t Array25.t * + W256.t Array25.t = { + + var da:W256.t; + var de:W256.t; + var di:W256.t; + var do_0:W256.t; + var du:W256.t; + + (da, de, di, do_0, du) <@ __first (ca, ce, ci, co, cu); + (a_4x, e_4x) <@ __second_last (a_4x, e_4x, index, da, de, di, do_0, du); + (a_4x, e_4x) <@ __third_last (a_4x, e_4x, da, de, di, do_0, du); + (a_4x, e_4x) <@ __fourth_last (a_4x, e_4x, da, de, di, do_0, du); + (a_4x, e_4x) <@ __fifth_last (a_4x, e_4x, da, de, di, do_0, du); + (a_4x, e_4x) <@ __sixth_last (a_4x, e_4x, da, de, di, do_0, du); + return (a_4x, e_4x); + } + + proc _KeccakF1600_StatePermute4x (a_4x:W256.t Array25.t) : W256.t Array25.t = { + + var ca:W256.t; + var ce:W256.t; + var ci:W256.t; + var co:W256.t; + var cu:W256.t; + var e_4x:W256.t Array25.t; + e_4x <- witness; + (ca, ce, ci, co, cu) <@ __prepare_theta (a_4x); + (a_4x, e_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_even (a_4x, e_4x, 0, ca, ce, + ci, co, cu); + (e_4x, a_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_odd (e_4x, a_4x, 1, ca, ce, + ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_even (a_4x, e_4x, 2, ca, ce, + ci, co, cu); + (e_4x, a_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_odd (e_4x, a_4x, 3, ca, ce, + ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_even (a_4x, e_4x, 4, ca, ce, + ci, co, cu); + (e_4x, a_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_odd (e_4x, a_4x, 5, ca, ce, + ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_even (a_4x, e_4x, 6, ca, ce, + ci, co, cu); + (e_4x, a_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_odd (e_4x, a_4x, 7, ca, ce, + ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, + cu) <@ 
__theta_rho_pi_chi_iota_prepare_theta_even (a_4x, e_4x, 8, ca, ce, + ci, co, cu); + (e_4x, a_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_odd (e_4x, a_4x, 9, ca, ce, + ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_even (a_4x, e_4x, 10, ca, + ce, ci, co, cu); + (e_4x, a_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_odd (e_4x, a_4x, 11, ca, ce, + ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_even (a_4x, e_4x, 12, ca, + ce, ci, co, cu); + (e_4x, a_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_odd (e_4x, a_4x, 13, ca, ce, + ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_even (a_4x, e_4x, 14, ca, + ce, ci, co, cu); + (e_4x, a_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_odd (e_4x, a_4x, 15, ca, ce, + ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_even (a_4x, e_4x, 16, ca, + ce, ci, co, cu); + (e_4x, a_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_odd (e_4x, a_4x, 17, ca, ce, + ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_even (a_4x, e_4x, 18, ca, + ce, ci, co, cu); + (e_4x, a_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_odd (e_4x, a_4x, 19, ca, ce, + ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_even (a_4x, e_4x, 20, ca, + ce, ci, co, cu); + (e_4x, a_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_odd (e_4x, a_4x, 21, ca, ce, + ci, co, cu); + (a_4x, e_4x, ca, ce, ci, co, + cu) <@ __theta_rho_pi_chi_iota_prepare_theta_even (a_4x, e_4x, 22, ca, + ce, ci, co, cu); + (e_4x, a_4x) <@ __theta_rho_pi_chi_iota (e_4x, a_4x, 23, ca, ce, ci, co, + cu); + return (a_4x); + } + + proc _shake256_absorb4x_33 (s:W256.t Array25.t, m0:W8.t Array33.t, + m1:W8.t Array33.t, m2:W8.t Array33.t, + m3:W8.t Array33.t) : W256.t Array25.t = { + var aux: int; + + var t0:W256.t; + var i:int; + var t64:W64.t; + var t8:W8.t; + var t1:W256.t; + + i <- 0; + while (i < 25) { + t0 <- set0_256 ; + s.[i] <- t0; + i <- i + 1; + } + i <- 0; + while (i < 4) { + t64 <- (get64 (WArray33.init8 (fun i_0 => (m0).[i_0])) i); + s <- + Array25.init + (WArray800.get256 (WArray800.set64 (WArray800.init256 (fun i_0 => (s).[i_0])) (4 * i) (( + (get64 (WArray800.init256 (fun i_0 => (s).[i_0])) (4 * i)) `^` t64)))); + t64 <- (get64 (WArray33.init8 (fun i_0 => (m1).[i_0])) i); + s <- + Array25.init + (WArray800.get256 (WArray800.set64 (WArray800.init256 (fun i_0 => (s).[i_0])) ((4 * i) + 1) (( + (get64 (WArray800.init256 (fun i_0 => (s).[i_0])) ((4 * i) + 1)) `^` t64)))); + t64 <- (get64 (WArray33.init8 (fun i_0 => (m2).[i_0])) i); + s <- + Array25.init + (WArray800.get256 (WArray800.set64 (WArray800.init256 (fun i_0 => (s).[i_0])) ((4 * i) + 2) (( + (get64 (WArray800.init256 (fun i_0 => (s).[i_0])) ((4 * i) + 2)) `^` t64)))); + t64 <- (get64 (WArray33.init8 (fun i_0 => (m3).[i_0])) i); + s <- + Array25.init + (WArray800.get256 (WArray800.set64 (WArray800.init256 (fun i_0 => (s).[i_0])) ((4 * i) + 3) (( + (get64 (WArray800.init256 (fun i_0 => (s).[i_0])) ((4 * i) + 3)) `^` t64)))); + i <- i + 1; + } + t8 <- m0.[32]; + s <- + Array25.init + (WArray800.get256 (WArray800.set8 (WArray800.init256 (fun i_0 => (s).[i_0])) 128 (( + (get8 (WArray800.init256 (fun i_0 => (s).[i_0])) 128) `^` t8)))); + s <- + Array25.init + (WArray800.get256 
(WArray800.set8 (WArray800.init256 (fun i_0 => (s).[i_0])) 129 (( + (get8 (WArray800.init256 (fun i_0 => (s).[i_0])) 129) `^` (W8.of_int 31))))); + t8 <- m1.[32]; + s <- + Array25.init + (WArray800.get256 (WArray800.set8 (WArray800.init256 (fun i_0 => (s).[i_0])) 136 (( + (get8 (WArray800.init256 (fun i_0 => (s).[i_0])) 136) `^` t8)))); + s <- + Array25.init + (WArray800.get256 (WArray800.set8 (WArray800.init256 (fun i_0 => (s).[i_0])) 137 (( + (get8 (WArray800.init256 (fun i_0 => (s).[i_0])) 137) `^` (W8.of_int 31))))); + t8 <- m2.[32]; + s <- + Array25.init + (WArray800.get256 (WArray800.set8 (WArray800.init256 (fun i_0 => (s).[i_0])) 144 (( + (get8 (WArray800.init256 (fun i_0 => (s).[i_0])) 144) `^` t8)))); + s <- + Array25.init + (WArray800.get256 (WArray800.set8 (WArray800.init256 (fun i_0 => (s).[i_0])) 145 (( + (get8 (WArray800.init256 (fun i_0 => (s).[i_0])) 145) `^` (W8.of_int 31))))); + t8 <- m3.[32]; + s <- + Array25.init + (WArray800.get256 (WArray800.set8 (WArray800.init256 (fun i_0 => (s).[i_0])) 152 (( + (get8 (WArray800.init256 (fun i_0 => (s).[i_0])) 152) `^` t8)))); + s <- + Array25.init + (WArray800.get256 (WArray800.set8 (WArray800.init256 (fun i_0 => (s).[i_0])) 153 (( + (get8 (WArray800.init256 (fun i_0 => (s).[i_0])) 153) `^` (W8.of_int 31))))); + t0 <- (get256 (WArray32.init64 (fun i_0 => (shake_sep).[i_0])) 0); + t1 <- s.[((136 %/ 8) - 1)]; + t0 <- (t0 `^` t1); + s.[((136 %/ 8) - 1)] <- t0; + return (s); + } + + proc __shake256_squeezeblock4x (state:W256.t Array25.t, h0:W8.t Array136.t, + h1:W8.t Array136.t, h2:W8.t Array136.t, + h3:W8.t Array136.t) : W256.t Array25.t * + W8.t Array136.t * + W8.t Array136.t * + W8.t Array136.t * + W8.t Array136.t = { + var aux: int; + + var i:int; + var t256:W256.t; + var t128:W128.t; + + state <@ _KeccakF1600_StatePermute4x (state); + aux <- (136 %/ 8); + i <- 0; + while (i < aux) { + t256 <- state.[i]; + t128 <- (truncateu128 t256); + h0 <- + Array136.init + (WArray136.get8 (WArray136.set64 (WArray136.init8 (fun i_0 => (h0).[i_0])) i (VMOVLPD t128))); + h1 <- + Array136.init + (WArray136.get8 (WArray136.set64 (WArray136.init8 (fun i_0 => (h1).[i_0])) i (VMOVHPD t128))); + t128 <- VEXTRACTI128 t256 (W8.of_int 1); + h2 <- + Array136.init + (WArray136.get8 (WArray136.set64 (WArray136.init8 (fun i_0 => (h2).[i_0])) i (VMOVLPD t128))); + h3 <- + Array136.init + (WArray136.get8 (WArray136.set64 (WArray136.init8 (fun i_0 => (h3).[i_0])) i (VMOVHPD t128))); + i <- i + 1; + } + return (state, h0, h1, h2, h3); + } + + proc _poly_add2 (rp:W16.t Array256.t, bp:W16.t Array256.t) : W16.t Array256.t = { + var aux: int; + + var i:int; + var a:W256.t; + var b:W256.t; + var r:W256.t; + + i <- 0; + while (i < 16) { + a <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * i)); + b <- + (get256_direct (WArray512.init16 (fun i_0 => (bp).[i_0])) (32 * i)); + r <- VPADD_16u16 a b; + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * i) (r))); + i <- i + 1; + } + return (rp); + } + + proc _poly_csubq (rp:W16.t Array256.t) : W16.t Array256.t = { + var aux: int; + + var qx16:W256.t; + var i:int; + var r:W256.t; + + qx16 <- (get256 (WArray32.init16 (fun i_0 => (jqx16).[i_0])) 0); + i <- 0; + while (i < 16) { + r <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * i)); + r <@ __csubq (r, qx16); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * i) (r))); + i <- i + 1; + } + return (rp); + } + + 
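+  (* Editor's note (non-normative, not part of the extracted code): the
+     helpers below are the 16-way AVX2 polynomial-arithmetic kernels.
+     __w256_interleave_u16 / __w256_deinterleave_u16 convert between 16-bit
+     coefficient lanes and the 32-bit products built from the VPMULL_16u16 /
+     VPMULH_16u16 low/high halves. __mont_red appears to be the standard
+     signed Montgomery reduction for q = 3329: with qinv = q^(-1) mod 2^16,
+     a 32-bit value a = lo + 2^16 * hi is mapped to a * 2^(-16) mod q as
+     hi - VPMULH(lo * qinv mod 2^16, q), since the low 16 bits of a and of
+     (lo * qinv) * q agree and cancel in the subtraction. __schoolbook16x
+     then multiplies pairs of degree-one factors (are + aim*X)(bre + bim*X)
+     modulo X^2 - zeta when sign = 0 and modulo X^2 + zeta when sign = 1,
+     matching the alternating signs of adjacent basemul blocks in
+     _poly_basemul below. *)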
proc __w256_interleave_u16 (al:W256.t, ah:W256.t) : W256.t * W256.t = { + + var a0:W256.t; + var a1:W256.t; + + a0 <- VPUNPCKL_16u16 al ah; + a1 <- VPUNPCKH_16u16 al ah; + return (a0, a1); + } + + proc __w256_deinterleave_u16 (_zero:W256.t, a0:W256.t, a1:W256.t) : + W256.t * W256.t = { + + var al:W256.t; + var ah:W256.t; + + al <- VPBLEND_16u16 a0 _zero (W8.of_int 170); + ah <- VPBLEND_16u16 a1 _zero (W8.of_int 170); + al <- VPACKUS_8u32 al ah; + a0 <- VPSRL_8u32 a0 (W8.of_int 16); + a1 <- VPSRL_8u32 a1 (W8.of_int 16); + ah <- VPACKUS_8u32 a0 a1; + return (al, ah); + } + + proc __mont_red (lo:W256.t, hi:W256.t, qx16:W256.t, qinvx16:W256.t) : + W256.t = { + + var m:W256.t; + + m <- VPMULL_16u16 lo qinvx16; + m <- VPMULH_16u16 m qx16; + lo <- VPSUB_16u16 hi m; + return (lo); + } + + proc __wmul_16u16 (x:W256.t, y:W256.t) : W256.t * W256.t = { + + var xy0:W256.t; + var xy1:W256.t; + var xyL:W256.t; + var xyH:W256.t; + + xyL <- VPMULL_16u16 x y; + xyH <- VPMULH_16u16 x y; + (xy0, xy1) <@ __w256_interleave_u16 (xyL, xyH); + return (xy0, xy1); + } + + proc __schoolbook16x (are:W256.t, aim:W256.t, bre:W256.t, bim:W256.t, + zeta_0:W256.t, zetaqinv:W256.t, qx16:W256.t, + qinvx16:W256.t, sign:int) : W256.t * W256.t = { + + var x0:W256.t; + var y0:W256.t; + var zaim:W256.t; + var ac0:W256.t; + var ac1:W256.t; + var ad0:W256.t; + var ad1:W256.t; + var bc0:W256.t; + var bc1:W256.t; + var zbd0:W256.t; + var zbd1:W256.t; + var x1:W256.t; + var y1:W256.t; + var _zero:W256.t; + + zaim <@ __fqmulprecomp16x (aim, zetaqinv, zeta_0, qx16); + (ac0, ac1) <@ __wmul_16u16 (are, bre); + (ad0, ad1) <@ __wmul_16u16 (are, bim); + (bc0, bc1) <@ __wmul_16u16 (aim, bre); + (zbd0, zbd1) <@ __wmul_16u16 (zaim, bim); + if ((sign = 0)) { + x0 <- VPADD_8u32 ac0 zbd0; + x1 <- VPADD_8u32 ac1 zbd1; + } else { + x0 <- VPSUB_8u32 ac0 zbd0; + x1 <- VPSUB_8u32 ac1 zbd1; + } + y0 <- VPADD_8u32 bc0 ad0; + y1 <- VPADD_8u32 bc1 ad1; + _zero <- set0_256 ; + (x0, x1) <@ __w256_deinterleave_u16 (_zero, x0, x1); + (y0, y1) <@ __w256_deinterleave_u16 (_zero, y0, y1); + x0 <@ __mont_red (x0, x1, qx16, qinvx16); + y0 <@ __mont_red (y0, y1, qx16, qinvx16); + return (x0, y0); + } + + proc _poly_basemul (rp:W16.t Array256.t, ap:W16.t Array256.t, + bp:W16.t Array256.t) : W16.t Array256.t = { + + var qx16:W256.t; + var qinvx16:W256.t; + var zetaqinv:W256.t; + var zeta_0:W256.t; + var are:W256.t; + var aim:W256.t; + var bre:W256.t; + var bim:W256.t; + + qx16 <- (get256_direct (WArray32.init16 (fun i => (jqx16).[i])) 0); + qinvx16 <- (get256_direct (WArray32.init16 (fun i => (jqinvx16).[i])) 0); + zetaqinv <- + (get256_direct (WArray800.init16 (fun i => (jzetas_exp).[i])) 272); + zeta_0 <- + (get256_direct (WArray800.init16 (fun i => (jzetas_exp).[i])) 304); + are <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 0)); + aim <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 1)); + bre <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 0)); + bim <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 1)); + (are, aim) <@ __schoolbook16x (are, aim, bre, bim, zeta_0, zetaqinv, + qx16, qinvx16, 0); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 0) (are))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 1) (aim))); + are <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 2)); + aim <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 3)); + bre <- 
(get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 2)); + bim <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 3)); + (are, aim) <@ __schoolbook16x (are, aim, bre, bim, zeta_0, zetaqinv, + qx16, qinvx16, 1); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 2) (are))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 3) (aim))); + zetaqinv <- + (get256_direct (WArray800.init16 (fun i => (jzetas_exp).[i])) 336); + zeta_0 <- + (get256_direct (WArray800.init16 (fun i => (jzetas_exp).[i])) 368); + are <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 4)); + aim <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 5)); + bre <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 4)); + bim <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 5)); + (are, aim) <@ __schoolbook16x (are, aim, bre, bim, zeta_0, zetaqinv, + qx16, qinvx16, 0); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 4) (are))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 5) (aim))); + are <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 6)); + aim <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 7)); + bre <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 6)); + bim <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 7)); + (are, aim) <@ __schoolbook16x (are, aim, bre, bim, zeta_0, zetaqinv, + qx16, qinvx16, 1); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 6) (are))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 7) (aim))); + zetaqinv <- + (get256_direct (WArray800.init16 (fun i => (jzetas_exp).[i])) 664); + zeta_0 <- + (get256_direct (WArray800.init16 (fun i => (jzetas_exp).[i])) 696); + are <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 8)); + aim <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 9)); + bre <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 8)); + bim <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 9)); + (are, aim) <@ __schoolbook16x (are, aim, bre, bim, zeta_0, zetaqinv, + qx16, qinvx16, 0); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 8) (are))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 9) (aim))); + are <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 10)); + aim <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 11)); + bre <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 10)); + bim <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 11)); + (are, aim) <@ __schoolbook16x (are, aim, bre, bim, zeta_0, zetaqinv, + qx16, qinvx16, 1); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 10) (are))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 11) (aim))); + zetaqinv <- + (get256_direct (WArray800.init16 (fun i => (jzetas_exp).[i])) 728); + zeta_0 <- + (get256_direct (WArray800.init16 (fun i => (jzetas_exp).[i])) 760); + are <- 
(get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 12)); + aim <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 13)); + bre <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 12)); + bim <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 13)); + (are, aim) <@ __schoolbook16x (are, aim, bre, bim, zeta_0, zetaqinv, + qx16, qinvx16, 0); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 12) (are))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 13) (aim))); + are <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 14)); + aim <- (get256_direct (WArray512.init16 (fun i => (ap).[i])) (32 * 15)); + bre <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 14)); + bim <- (get256_direct (WArray512.init16 (fun i => (bp).[i])) (32 * 15)); + (are, aim) <@ __schoolbook16x (are, aim, bre, bim, zeta_0, zetaqinv, + qx16, qinvx16, 1); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 14) (are))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i => (rp).[i])) (32 * 15) (aim))); + return (rp); + } + + proc _poly_compress (rp:W64.t, a:W16.t Array256.t) : W16.t Array256.t = { + var aux: int; + + var x16p:W16.t Array16.t; + var v:W256.t; + var shift1:W256.t; + var mask:W256.t; + var shift2:W256.t; + var permidx:W256.t; + var i:int; + var f0:W256.t; + var f1:W256.t; + var f2:W256.t; + var f3:W256.t; + x16p <- witness; + a <@ _poly_csubq (a); + x16p <- jvx16; + v <- (get256 (WArray32.init16 (fun i_0 => (x16p).[i_0])) 0); + shift1 <- VPBROADCAST_16u16 pc_shift1_s; + mask <- VPBROADCAST_16u16 pc_mask_s; + shift2 <- VPBROADCAST_16u16 pc_shift2_s; + permidx <- + (get256 (WArray32.init32 (fun i_0 => (pc_permidx_s).[i_0])) 0); + aux <- (256 %/ 64); + i <- 0; + while (i < aux) { + f0 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) (4 * i)); + f1 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((4 * i) + 1)); + f2 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((4 * i) + 2)); + f3 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((4 * i) + 3)); + f0 <- VPMULH_16u16 f0 v; + f1 <- VPMULH_16u16 f1 v; + f2 <- VPMULH_16u16 f2 v; + f3 <- VPMULH_16u16 f3 v; + f0 <- VPMULHRS_16u16 f0 shift1; + f1 <- VPMULHRS_16u16 f1 shift1; + f2 <- VPMULHRS_16u16 f2 shift1; + f3 <- VPMULHRS_16u16 f3 shift1; + f0 <- VPAND_256 f0 mask; + f1 <- VPAND_256 f1 mask; + f2 <- VPAND_256 f2 mask; + f3 <- VPAND_256 f3 mask; + f0 <- VPACKUS_16u16 f0 f1; + f2 <- VPACKUS_16u16 f2 f3; + f0 <- VPMADDUBSW_256 f0 shift2; + f2 <- VPMADDUBSW_256 f2 shift2; + f0 <- VPACKUS_16u16 f0 f2; + f0 <- VPERMD permidx f0; + Glob.mem <- + storeW256 Glob.mem (W64.to_uint (rp + (W64.of_int (32 * i)))) (f0); + i <- i + 1; + } + return (a); + } + + proc _poly_compress_1 (rp:W8.t Array128.t, a:W16.t Array256.t) : W8.t Array128.t * + W16.t Array256.t = { + var aux: int; + + var x16p:W16.t Array16.t; + var v:W256.t; + var shift1:W256.t; + var mask:W256.t; + var shift2:W256.t; + var permidx:W256.t; + var i:int; + var f0:W256.t; + var f1:W256.t; + var f2:W256.t; + var f3:W256.t; + x16p <- witness; + a <@ _poly_csubq (a); + x16p <- jvx16; + v <- (get256 (WArray32.init16 (fun i_0 => (x16p).[i_0])) 0); + shift1 <- VPBROADCAST_16u16 pc_shift1_s; + mask <- VPBROADCAST_16u16 pc_mask_s; + shift2 <- VPBROADCAST_16u16 pc_shift2_s; + permidx <- + (get256 (WArray32.init32 (fun 
i_0 => (pc_permidx_s).[i_0])) 0); + aux <- (256 %/ 64); + i <- 0; + while (i < aux) { + f0 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) (4 * i)); + f1 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((4 * i) + 1)); + f2 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((4 * i) + 2)); + f3 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((4 * i) + 3)); + f0 <- VPMULH_16u16 f0 v; + f1 <- VPMULH_16u16 f1 v; + f2 <- VPMULH_16u16 f2 v; + f3 <- VPMULH_16u16 f3 v; + f0 <- VPMULHRS_16u16 f0 shift1; + f1 <- VPMULHRS_16u16 f1 shift1; + f2 <- VPMULHRS_16u16 f2 shift1; + f3 <- VPMULHRS_16u16 f3 shift1; + f0 <- VPAND_256 f0 mask; + f1 <- VPAND_256 f1 mask; + f2 <- VPAND_256 f2 mask; + f3 <- VPAND_256 f3 mask; + f0 <- VPACKUS_16u16 f0 f1; + f2 <- VPACKUS_16u16 f2 f3; + f0 <- VPMADDUBSW_256 f0 shift2; + f2 <- VPMADDUBSW_256 f2 shift2; + f0 <- VPACKUS_16u16 f0 f2; + f0 <- VPERMD permidx f0; + rp <- + Array128.init + (WArray128.get8 (WArray128.set256_direct (WArray128.init8 (fun i_0 => (rp).[i_0])) (32 * i) (f0))); + i <- i + 1; + } + return (rp, a); + } + + proc _poly_decompress (rp:W16.t Array256.t, ap:W64.t) : W16.t Array256.t = { + var aux: int; + + var x16p:W16.t Array16.t; + var q:W256.t; + var x32p:W8.t Array32.t; + var shufbidx:W256.t; + var mask:W256.t; + var shift:W256.t; + var f:W256.t; + var i:int; + x16p <- witness; + x32p <- witness; + x16p <- jqx16; + q <- (get256 (WArray32.init16 (fun i_0 => (x16p).[i_0])) 0); + x32p <- pd_jshufbidx; + shufbidx <- (get256 (WArray32.init8 (fun i_0 => (x32p).[i_0])) 0); + mask <- VPBROADCAST_8u32 pd_mask_s; + shift <- VPBROADCAST_8u32 pd_shift_s; + f <- set0_256 ; + aux <- (256 %/ 16); + i <- 0; + while (i < aux) { + f <- + VPBROADCAST_2u128 (loadW128 Glob.mem (W64.to_uint (ap + (W64.of_int (8 * i))))); + f <- VPSHUFB_256 f shufbidx; + f <- VPAND_256 f mask; + f <- VPMULL_16u16 f shift; + f <- VPMULHRS_16u16 f q; + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) i (f))); + i <- i + 1; + } + return (rp); + } + + proc _poly_frombytes (rp:W16.t Array256.t, ap:W64.t) : W16.t Array256.t = { + var aux: int; + + var maskp:W16.t Array16.t; + var mask:W256.t; + var i:int; + var t0:W256.t; + var t1:W256.t; + var t2:W256.t; + var t3:W256.t; + var t4:W256.t; + var t5:W256.t; + var tt:W256.t; + var t6:W256.t; + var t7:W256.t; + var t8:W256.t; + var t9:W256.t; + var t10:W256.t; + var t11:W256.t; + maskp <- witness; + maskp <- maskx16; + mask <- (get256 (WArray32.init16 (fun i_0 => (maskp).[i_0])) 0); + i <- 0; + while (i < 2) { + t0 <- (loadW256 Glob.mem (W64.to_uint (ap + (W64.of_int (192 * i))))); + t1 <- + (loadW256 Glob.mem (W64.to_uint (ap + (W64.of_int ((192 * i) + 32))))); + t2 <- + (loadW256 Glob.mem (W64.to_uint (ap + (W64.of_int ((192 * i) + 64))))); + t3 <- + (loadW256 Glob.mem (W64.to_uint (ap + (W64.of_int ((192 * i) + 96))))); + t4 <- + (loadW256 Glob.mem (W64.to_uint (ap + (W64.of_int ((192 * i) + 128))))); + t5 <- + (loadW256 Glob.mem (W64.to_uint (ap + (W64.of_int ((192 * i) + 160))))); + (tt, t3) <@ __shuffle8 (t0, t3); + (t0, t4) <@ __shuffle8 (t1, t4); + (t1, t5) <@ __shuffle8 (t2, t5); + (t2, t4) <@ __shuffle4 (tt, t4); + (tt, t1) <@ __shuffle4 (t3, t1); + (t3, t5) <@ __shuffle4 (t0, t5); + (t0, t1) <@ __shuffle2 (t2, t1); + (t2, t3) <@ __shuffle2 (t4, t3); + (t4, t5) <@ __shuffle2 (tt, t5); + (t6, t3) <@ __shuffle1 (t0, t3); + (t0, t4) <@ __shuffle1 (t1, t4); + (t1, t5) <@ __shuffle1 (t2, t5); + t7 <- VPSRL_16u16 t6 (W8.of_int 12); + t8 <- VPSLL_16u16 t3 (W8.of_int 4); + t7 <- 
VPOR_256 t7 t8; + t6 <- VPAND_256 mask t6; + t7 <- VPAND_256 mask t7; + t8 <- VPSRL_16u16 t3 (W8.of_int 8); + t9 <- VPSLL_16u16 t0 (W8.of_int 8); + t8 <- VPOR_256 t8 t9; + t8 <- VPAND_256 mask t8; + t9 <- VPSRL_16u16 t0 (W8.of_int 4); + t9 <- VPAND_256 mask t9; + t10 <- VPSRL_16u16 t4 (W8.of_int 12); + t11 <- VPSLL_16u16 t1 (W8.of_int 4); + t10 <- VPOR_256 t10 t11; + t4 <- VPAND_256 mask t4; + t10 <- VPAND_256 mask t10; + t11 <- VPSRL_16u16 t1 (W8.of_int 8); + tt <- VPSLL_16u16 t5 (W8.of_int 8); + t11 <- VPOR_256 t11 tt; + t11 <- VPAND_256 mask t11; + tt <- VPSRL_16u16 t5 (W8.of_int 4); + tt <- VPAND_256 mask tt; + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) (8 * i) (t6))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((8 * i) + 1) (t7))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((8 * i) + 2) (t8))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((8 * i) + 3) (t9))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((8 * i) + 4) (t4))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((8 * i) + 5) (t10))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((8 * i) + 6) (t11))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((8 * i) + 7) (tt))); + i <- i + 1; + } + return (rp); + } + + proc _poly_frommont (rp:W16.t Array256.t) : W16.t Array256.t = { + var aux: int; + + var x16p:W16.t Array16.t; + var qx16:W256.t; + var qinvx16:W256.t; + var dmontx16:W256.t; + var i:int; + var t:W256.t; + x16p <- witness; + x16p <- jqx16; + qx16 <- (get256 (WArray32.init16 (fun i_0 => (x16p).[i_0])) 0); + x16p <- jqinvx16; + qinvx16 <- (get256 (WArray32.init16 (fun i_0 => (x16p).[i_0])) 0); + x16p <- jdmontx16; + dmontx16 <- (get256 (WArray32.init16 (fun i_0 => (x16p).[i_0])) 0); + aux <- (256 %/ 16); + i <- 0; + while (i < aux) { + t <- (get256 (WArray512.init16 (fun i_0 => (rp).[i_0])) i); + t <@ __fqmulx16 (t, dmontx16, qx16, qinvx16); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) i (t))); + i <- i + 1; + } + return (rp); + } + + proc _poly_frommsg_1 (rp:W16.t Array256.t, ap:W8.t Array32.t) : W16.t Array256.t = { + var aux: int; + + var x16p:W16.t Array16.t; + var hqs:W256.t; + var shift:W256.t; + var idx:W256.t; + var f:W256.t; + var i:int; + var g3:W256.t; + var g0:W256.t; + var g1:W256.t; + var g2:W256.t; + var h0:W256.t; + var h2:W256.t; + var h1:W256.t; + var h3:W256.t; + x16p <- witness; + x16p <- hqx16_p1; + hqs <- (get256 (WArray32.init16 (fun i_0 => (x16p).[i_0])) 0); + shift <- + VPBROADCAST_2u128 (get128 + (WArray16.init32 (fun i_0 => (pfm_shift_s).[i_0])) 0); + idx <- + VPBROADCAST_2u128 (get128 (WArray16.init8 (fun i_0 => (pfm_idx_s).[i_0])) + 0); + f <- (get256 (WArray32.init8 (fun i_0 => (ap).[i_0])) 0); + i <- 0; + while (i < 4) { + g3 <- VPSHUFD_256 f (W8.of_int (85 * i)); + g3 <- VPSLLV_8u32 g3 shift; + g3 <- VPSHUFB_256 g3 idx; + g0 <- VPSLL_16u16 g3 (W8.of_int 12); + g1 <- VPSLL_16u16 g3 (W8.of_int 8); + g2 <- VPSLL_16u16 g3 (W8.of_int 4); + g0 <- VPSRA_16u16 g0 (W8.of_int 15); + g1 <- VPSRA_16u16 g1 (W8.of_int 15); + g2 <- VPSRA_16u16 g2 (W8.of_int 15); + g3 <- VPSRA_16u16 
g3 (W8.of_int 15); + g0 <- VPAND_256 g0 hqs; + g1 <- VPAND_256 g1 hqs; + g2 <- VPAND_256 g2 hqs; + g3 <- VPAND_256 g3 hqs; + h0 <- VPUNPCKL_4u64 g0 g1; + h2 <- VPUNPCKH_4u64 g0 g1; + h1 <- VPUNPCKL_4u64 g2 g3; + h3 <- VPUNPCKH_4u64 g2 g3; + g0 <- VPERM2I128 h0 h1 (W8.of_int 32); + g2 <- VPERM2I128 h0 h1 (W8.of_int 49); + g1 <- VPERM2I128 h2 h3 (W8.of_int 32); + g3 <- VPERM2I128 h2 h3 (W8.of_int 49); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) (2 * i) (g0))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((2 * i) + 1) (g1))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((2 * i) + 8) (g2))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) (((2 * i) + 8) + 1) (g3))); + i <- i + 1; + } + return (rp); + } + + proc __cbd3 (rp:W16.t Array256.t, buf:W8.t Array128.t) : W16.t Array256.t = { + var aux: int; + + var mask249_s:W32.t; + var mask6DB_s:W32.t; + var mask07_s:W32.t; + var mask70_s:W32.t; + var mask3_s:W16.t; + var mask249:W256.t; + var mask6DB:W256.t; + var mask07:W256.t; + var mask70:W256.t; + var mask3:W256.t; + var shufbidx:W256.t; + var i:int; + var f0:W256.t; + var f1:W256.t; + var f2:W256.t; + var f3:W256.t; + + mask249_s <- (W32.of_int 2396745); + mask6DB_s <- (W32.of_int 7190235); + mask07_s <- (W32.of_int 7); + mask70_s <- (W32.of_int (7 `<<` 16)); + mask3_s <- (W16.of_int 3); + mask249 <- VPBROADCAST_8u32 mask249_s; + mask6DB <- VPBROADCAST_8u32 mask6DB_s; + mask07 <- VPBROADCAST_8u32 mask07_s; + mask70 <- VPBROADCAST_8u32 mask70_s; + mask3 <- VPBROADCAST_16u16 mask3_s; + shufbidx <- + (get256 (WArray32.init8 (fun i_0 => (cbd_jshufbidx).[i_0])) 0); + aux <- (256 %/ 32); + i <- 0; + while (i < aux) { + f0 <- + (get256_direct (WArray128.init8 (fun i_0 => (buf).[i_0])) (24 * i)); + f0 <- VPERMQ f0 (W8.of_int 148); + f0 <- VPSHUFB_256 f0 shufbidx; + f1 <- VPSRL_8u32 f0 (W8.of_int 1); + f2 <- VPSRL_8u32 f0 (W8.of_int 2); + f0 <- VPAND_256 mask249 f0; + f1 <- VPAND_256 mask249 f1; + f2 <- VPAND_256 mask249 f2; + f0 <- VPADD_8u32 f0 f1; + f0 <- VPADD_8u32 f0 f2; + f1 <- VPSRL_8u32 f0 (W8.of_int 3); + f0 <- VPADD_8u32 f0 mask6DB; + f0 <- VPSUB_8u32 f0 f1; + f1 <- VPSLL_8u32 f0 (W8.of_int 10); + f2 <- VPSRL_8u32 f0 (W8.of_int 12); + f3 <- VPSRL_8u32 f0 (W8.of_int 2); + f0 <- VPAND_256 f0 mask07; + f1 <- VPAND_256 f1 mask70; + f2 <- VPAND_256 f2 mask07; + f3 <- VPAND_256 f3 mask70; + f0 <- VPADD_16u16 f0 f1; + f1 <- VPADD_16u16 f2 f3; + f0 <- VPSUB_16u16 f0 mask3; + f1 <- VPSUB_16u16 f1 mask3; + f2 <- VPUNPCKL_8u32 f0 f1; + f3 <- VPUNPCKH_8u32 f0 f1; + f0 <- VPERM2I128 f2 f3 (W8.of_int 32); + f1 <- VPERM2I128 f2 f3 (W8.of_int 49); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) (2 * i) (f0))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((2 * i) + 1) (f1))); + i <- i + 1; + } + return (rp); + } + + proc __cbd2 (rp:W16.t Array256.t, buf:W8.t Array128.t) : W16.t Array256.t = { + var aux: int; + + var mask55_s:W32.t; + var mask33_s:W32.t; + var mask03_s:W32.t; + var mask0F_s:W32.t; + var mask55:W256.t; + var mask33:W256.t; + var mask03:W256.t; + var mask0F:W256.t; + var i:int; + var f0:W256.t; + var f1:W256.t; + var f2:W256.t; + var f3:W256.t; + var t:W128.t; + + mask55_s <- (W32.of_int 1431655765); + mask33_s <- (W32.of_int 858993459); + mask03_s <- 
(W32.of_int 50529027); + mask0F_s <- (W32.of_int 252645135); + mask55 <- VPBROADCAST_8u32 mask55_s; + mask33 <- VPBROADCAST_8u32 mask33_s; + mask03 <- VPBROADCAST_8u32 mask03_s; + mask0F <- VPBROADCAST_8u32 mask0F_s; + aux <- (256 %/ 64); + i <- 0; + while (i < aux) { + f0 <- (get256 (WArray128.init8 (fun i_0 => (buf).[i_0])) i); + f1 <- VPSRL_16u16 f0 (W8.of_int 1); + f0 <- VPAND_256 mask55 f0; + f1 <- VPAND_256 mask55 f1; + f0 <- VPADD_32u8 f0 f1; + f1 <- VPSRL_16u16 f0 (W8.of_int 2); + f0 <- VPAND_256 mask33 f0; + f1 <- VPAND_256 mask33 f1; + f0 <- VPADD_32u8 f0 mask33; + f0 <- VPSUB_32u8 f0 f1; + f1 <- VPSRL_16u16 f0 (W8.of_int 4); + f0 <- VPAND_256 mask0F f0; + f1 <- VPAND_256 mask0F f1; + f0 <- VPSUB_32u8 f0 mask03; + f1 <- VPSUB_32u8 f1 mask03; + f2 <- VPUNPCKL_32u8 f0 f1; + f3 <- VPUNPCKH_32u8 f0 f1; + t <- (truncateu128 f2); + f0 <- VPMOVSX_16u8_16u16 t; + t <- VEXTRACTI128 f2 (W8.of_int 1); + f1 <- VPMOVSX_16u8_16u16 t; + t <- (truncateu128 f3); + f2 <- VPMOVSX_16u8_16u16 t; + t <- VEXTRACTI128 f3 (W8.of_int 1); + f3 <- VPMOVSX_16u8_16u16 t; + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) (4 * i) (f0))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((4 * i) + 1) (f2))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((4 * i) + 2) (f1))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256 (WArray512.init16 (fun i_0 => (rp).[i_0])) ((4 * i) + 3) (f3))); + i <- i + 1; + } + return (rp); + } + + proc __poly_cbd_eta1 (rp:W16.t Array256.t, buf:W8.t Array128.t) : W16.t Array256.t = { + + + + if ((2 = 2)) { + rp <@ __cbd2 (rp, (Array128.init (fun i => buf.[0 + i]))); + } else { + rp <@ __cbd3 (rp, buf); + } + return (rp); + } + + proc __shake256_squeezenblocks4x (state:W256.t Array25.t, + buf0:W8.t Array136.t, + buf1:W8.t Array136.t, + buf2:W8.t Array136.t, + buf3:W8.t Array136.t) : W256.t Array25.t * + W8.t Array136.t * + W8.t Array136.t * + W8.t Array136.t * + W8.t Array136.t = { + var aux: int; + var aux_4: W8.t Array136.t; + var aux_3: W8.t Array136.t; + var aux_2: W8.t Array136.t; + var aux_1: W8.t Array136.t; + var aux_0: W256.t Array25.t; + + var i:int; + + aux <- (((((2 * 256) %/ 4) + 136) - 1) %/ 136); + i <- 0; + while (i < aux) { + (aux_0, aux_4, aux_3, aux_2, + aux_1) <@ __shake256_squeezeblock4x (state, + (Array136.init (fun i_0 => buf0.[(i * 136) + i_0])), + (Array136.init (fun i_0 => buf1.[(i * 136) + i_0])), + (Array136.init (fun i_0 => buf2.[(i * 136) + i_0])), + (Array136.init (fun i_0 => buf3.[(i * 136) + i_0]))); + state <- aux_0; + buf0 <- Array136.init + (fun i_0 => if (i * 136) <= i_0 < (i * 136) + 136 + then aux_4.[i_0-(i * 136)] else buf0.[i_0]); + buf1 <- Array136.init + (fun i_0 => if (i * 136) <= i_0 < (i * 136) + 136 + then aux_3.[i_0-(i * 136)] else buf1.[i_0]); + buf2 <- Array136.init + (fun i_0 => if (i * 136) <= i_0 < (i * 136) + 136 + then aux_2.[i_0-(i * 136)] else buf2.[i_0]); + buf3 <- Array136.init + (fun i_0 => if (i * 136) <= i_0 < (i * 136) + 136 + then aux_1.[i_0-(i * 136)] else buf3.[i_0]); + i <- i + 1; + } + return (state, buf0, buf1, buf2, buf3); + } + + proc _poly_getnoise_eta1_4x (r0:W16.t Array256.t, r1:W16.t Array256.t, + r2:W16.t Array256.t, r3:W16.t Array256.t, + seed:W8.t Array32.t, nonce:W8.t) : W16.t Array256.t * + W16.t Array256.t * + W16.t Array256.t * + W16.t Array256.t = { + + var f:W256.t; + var buf0:W8.t Array136.t; + var buf1:W8.t Array136.t; + var 
buf2:W8.t Array136.t; + var buf3:W8.t Array136.t; + var state:W256.t Array25.t; + buf0 <- witness; + buf1 <- witness; + buf2 <- witness; + buf3 <- witness; + state <- witness; + f <- (get256 (WArray32.init8 (fun i => (seed).[i])) 0); + buf0 <- + Array136.init + (WArray136.get8 (WArray136.set256 (WArray136.init8 (fun i => (buf0).[i])) 0 (f))); + buf1 <- + Array136.init + (WArray136.get8 (WArray136.set256 (WArray136.init8 (fun i => (buf1).[i])) 0 (f))); + buf2 <- + Array136.init + (WArray136.get8 (WArray136.set256 (WArray136.init8 (fun i => (buf2).[i])) 0 (f))); + buf3 <- + Array136.init + (WArray136.get8 (WArray136.set256 (WArray136.init8 (fun i => (buf3).[i])) 0 (f))); + buf0 <- + Array136.init + (WArray136.get8 (WArray136.set8_direct (WArray136.init8 (fun i => (buf0).[i])) 32 (nonce))); + nonce <- (nonce + (W8.of_int 1)); + buf1 <- + Array136.init + (WArray136.get8 (WArray136.set8_direct (WArray136.init8 (fun i => (buf1).[i])) 32 (nonce))); + nonce <- (nonce + (W8.of_int 1)); + buf2 <- + Array136.init + (WArray136.get8 (WArray136.set8_direct (WArray136.init8 (fun i => (buf2).[i])) 32 (nonce))); + nonce <- (nonce + (W8.of_int 1)); + buf3 <- + Array136.init + (WArray136.get8 (WArray136.set8_direct (WArray136.init8 (fun i => (buf3).[i])) 32 (nonce))); + state <@ _shake256_absorb4x_33 (state, + (Array33.init (fun i => buf0.[0 + i])), + (Array33.init (fun i => buf1.[0 + i])), + (Array33.init (fun i => buf2.[0 + i])), + (Array33.init (fun i => buf3.[0 + i]))); + (state, buf0, buf1, buf2, buf3) <@ __shake256_squeezenblocks4x (state, + buf0, buf1, buf2, buf3); + r0 <@ __poly_cbd_eta1 (r0, (Array128.init (fun i => buf0.[0 + i]))); + r1 <@ __poly_cbd_eta1 (r1, (Array128.init (fun i => buf1.[0 + i]))); + r2 <@ __poly_cbd_eta1 (r2, (Array128.init (fun i => buf2.[0 + i]))); + r3 <@ __poly_cbd_eta1 (r3, (Array128.init (fun i => buf3.[0 + i]))); + return (r0, r1, r2, r3); + } + + proc __invntt___butterfly64x (rl0:W256.t, rl1:W256.t, rl2:W256.t, + rl3:W256.t, rh0:W256.t, rh1:W256.t, + rh2:W256.t, rh3:W256.t, zl0:W256.t, + zl1:W256.t, zh0:W256.t, zh1:W256.t, + qx16:W256.t) : W256.t * W256.t * W256.t * + W256.t * W256.t * W256.t * + W256.t * W256.t = { + + var t0:W256.t; + var t1:W256.t; + var t2:W256.t; + var t3:W256.t; + + t0 <- VPSUB_16u16 rl0 rh0; + t1 <- VPSUB_16u16 rl1 rh1; + t2 <- VPSUB_16u16 rl2 rh2; + rl0 <- VPADD_16u16 rh0 rl0; + rl1 <- VPADD_16u16 rh1 rl1; + rh0 <- VPMULL_16u16 zl0 t0; + rl2 <- VPADD_16u16 rh2 rl2; + rh1 <- VPMULL_16u16 zl0 t1; + t3 <- VPSUB_16u16 rl3 rh3; + rl3 <- VPADD_16u16 rh3 rl3; + rh2 <- VPMULL_16u16 zl1 t2; + rh3 <- VPMULL_16u16 zl1 t3; + t0 <- VPMULH_16u16 zh0 t0; + t1 <- VPMULH_16u16 zh0 t1; + t2 <- VPMULH_16u16 zh1 t2; + t3 <- VPMULH_16u16 zh1 t3; + rh0 <- VPMULH_16u16 qx16 rh0; + rh1 <- VPMULH_16u16 qx16 rh1; + rh2 <- VPMULH_16u16 qx16 rh2; + rh3 <- VPMULH_16u16 qx16 rh3; + rh0 <- VPSUB_16u16 t0 rh0; + rh1 <- VPSUB_16u16 t1 rh1; + rh2 <- VPSUB_16u16 t2 rh2; + rh3 <- VPSUB_16u16 t3 rh3; + return (rl0, rl1, rl2, rl3, rh0, rh1, rh2, rh3); + } + + proc _poly_invntt (rp:W16.t Array256.t) : W16.t Array256.t = { + var aux: int; + + var zetasp:W16.t Array400.t; + var qx16:W256.t; + var i:int; + var zeta0:W256.t; + var zeta1:W256.t; + var zeta2:W256.t; + var zeta3:W256.t; + var r0:W256.t; + var r1:W256.t; + var r2:W256.t; + var r3:W256.t; + var r4:W256.t; + var r5:W256.t; + var r6:W256.t; + var r7:W256.t; + var vx16:W256.t; + var flox16:W256.t; + var fhix16:W256.t; + zetasp <- witness; + zetasp <- jzetas_inv_exp; + qx16 <- (get256 (WArray32.init16 (fun i_0 => (jqx16).[i_0])) 
0); + i <- 0; + while (i < 2) { + zeta0 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (0 + (392 * i))); + zeta1 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (64 + (392 * i))); + zeta2 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (32 + (392 * i))); + zeta3 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (96 + (392 * i))); + r0 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 0) + (256 * i))); + r1 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 1) + (256 * i))); + r2 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 2) + (256 * i))); + r3 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 3) + (256 * i))); + r4 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 4) + (256 * i))); + r5 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 5) + (256 * i))); + r6 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 6) + (256 * i))); + r7 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 7) + (256 * i))); + (r0, r1, r4, r5, r2, r3, r6, r7) <@ __invntt___butterfly64x (r0, r1, + r4, r5, r2, r3, r6, r7, zeta0, zeta1, zeta2, zeta3, qx16); + vx16 <- (get256 (WArray32.init16 (fun i_0 => (jvx16).[i_0])) 0); + zeta0 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (128 + (392 * i))); + zeta1 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (160 + (392 * i))); + r0 <@ __red16x (r0, qx16, vx16); + r1 <@ __red16x (r1, qx16, vx16); + r4 <@ __red16x (r4, qx16, vx16); + r5 <@ __red16x (r5, qx16, vx16); + (r0, r1, r2, r3, r4, r5, r6, r7) <@ __invntt___butterfly64x (r0, r1, + r2, r3, r4, r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + (r0, r1) <@ __shuffle1 (r0, r1); + (r2, r3) <@ __shuffle1 (r2, r3); + (r4, r5) <@ __shuffle1 (r4, r5); + (r6, r7) <@ __shuffle1 (r6, r7); + zeta0 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (192 + (392 * i))); + zeta1 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (224 + (392 * i))); + (r0, r2, r4, r6, r1, r3, r5, r7) <@ __invntt___butterfly64x (r0, r2, + r4, r6, r1, r3, r5, r7, zeta0, zeta0, zeta1, zeta1, qx16); + r0 <@ __red16x (r0, qx16, vx16); + (r0, r2) <@ __shuffle2 (r0, r2); + (r4, r6) <@ __shuffle2 (r4, r6); + (r1, r3) <@ __shuffle2 (r1, r3); + (r5, r7) <@ __shuffle2 (r5, r7); + zeta0 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (256 + (392 * i))); + zeta1 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (288 + (392 * i))); + (r0, r4, r1, r5, r2, r6, r3, r7) <@ __invntt___butterfly64x (r0, r4, + r1, r5, r2, r6, r3, r7, zeta0, zeta0, zeta1, zeta1, qx16); + r0 <@ __red16x (r0, qx16, vx16); + (r0, r4) <@ __shuffle4 (r0, r4); + (r1, r5) <@ __shuffle4 (r1, r5); + (r2, r6) <@ __shuffle4 (r2, r6); + (r3, r7) <@ __shuffle4 (r3, r7); + zeta0 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (320 + (392 * i))); + zeta1 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (352 + (392 * i))); + (r0, r1, r2, r3, r4, r5, r6, r7) <@ __invntt___butterfly64x (r0, r1, + r2, r3, r4, r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + r0 <@ __red16x (r0, qx16, vx16); + (r0, r1) <@ __shuffle8 (r0, r1); + (r2, r3) <@ __shuffle8 (r2, r3); + (r4, r5) <@ __shuffle8 (r4, r5); + (r6, r7) <@ __shuffle8 (r6, r7); + zeta0 <- + VPBROADCAST_8u32 (get32_direct + (WArray800.init16 (fun 
i_0 => (zetasp).[i_0])) + (384 + (392 * i))); + zeta1 <- + VPBROADCAST_8u32 (get32_direct + (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (388 + (392 * i))); + (r0, r2, r4, r6, r1, r3, r5, r7) <@ __invntt___butterfly64x (r0, r2, + r4, r6, r1, r3, r5, r7, zeta0, zeta0, zeta1, zeta1, qx16); + r0 <@ __red16x (r0, qx16, vx16); + if ((i = 0)) { + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 0) + (256 * i)) (r0))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 1) + (256 * i)) (r2))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 2) + (256 * i)) (r4))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 3) + (256 * i)) (r6))); + } else { + + } + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 4) + (256 * i)) (r1))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 5) + (256 * i)) (r3))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 6) + (256 * i)) (r5))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 7) + (256 * i)) (r7))); + i <- i + 1; + } + zeta0 <- + VPBROADCAST_8u32 (get32_direct + (WArray800.init16 (fun i_0 => (zetasp).[i_0])) 784); + zeta1 <- + VPBROADCAST_8u32 (get32_direct + (WArray800.init16 (fun i_0 => (zetasp).[i_0])) 788); + i <- 0; + while (i < 2) { + if ((i = 0)) { + r7 <- r6; + r6 <- r4; + r5 <- r2; + r4 <- r0; + } else { + r4 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 8) + (128 * i))); + r5 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 9) + (128 * i))); + r6 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 10) + (128 * i))); + r7 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 11) + (128 * i))); + } + r0 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 0) + (128 * i))); + r1 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 1) + (128 * i))); + r2 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 2) + (128 * i))); + r3 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 3) + (128 * i))); + (r0, r1, r2, r3, r4, r5, r6, r7) <@ __invntt___butterfly64x (r0, r1, + r2, r3, r4, r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + flox16 <- (get256 (WArray32.init16 (fun i_0 => (jflox16).[i_0])) 0); + fhix16 <- (get256 (WArray32.init16 (fun i_0 => (jfhix16).[i_0])) 0); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 8) + (128 * i)) (r4))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 9) + (128 * i)) (r5))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 10) + (128 * i)) (r6))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 11) + (128 * i)) (r7))); + r0 <@ __fqmulprecomp16x (r0, flox16, fhix16, qx16); + r1 <@ __fqmulprecomp16x (r1, flox16, fhix16, qx16); + r2 <@ 
__fqmulprecomp16x (r2, flox16, fhix16, qx16); + r3 <@ __fqmulprecomp16x (r3, flox16, fhix16, qx16); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 0) + (128 * i)) (r0))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 1) + (128 * i)) (r1))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 2) + (128 * i)) (r2))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 3) + (128 * i)) (r3))); + i <- i + 1; + } + return (rp); + } + + proc __butterfly64x (rl0:W256.t, rl1:W256.t, rl2:W256.t, rl3:W256.t, + rh0:W256.t, rh1:W256.t, rh2:W256.t, rh3:W256.t, + zl0:W256.t, zl1:W256.t, zh0:W256.t, zh1:W256.t, + qx16:W256.t) : W256.t * W256.t * W256.t * W256.t * + W256.t * W256.t * W256.t * W256.t = { + + var t0:W256.t; + var t1:W256.t; + var t2:W256.t; + var t3:W256.t; + var t4:W256.t; + var t5:W256.t; + var t6:W256.t; + var t7:W256.t; + + t0 <- VPMULL_16u16 zl0 rh0; + t1 <- VPMULH_16u16 zh0 rh0; + t2 <- VPMULL_16u16 zl0 rh1; + t3 <- VPMULH_16u16 zh0 rh1; + t4 <- VPMULL_16u16 zl1 rh2; + t5 <- VPMULH_16u16 zh1 rh2; + t6 <- VPMULL_16u16 zl1 rh3; + t7 <- VPMULH_16u16 zh1 rh3; + t0 <- VPMULH_16u16 t0 qx16; + t2 <- VPMULH_16u16 t2 qx16; + t4 <- VPMULH_16u16 t4 qx16; + t6 <- VPMULH_16u16 t6 qx16; + rh1 <- VPSUB_16u16 rl1 t3; + rl1 <- VPADD_16u16 t3 rl1; + rh0 <- VPSUB_16u16 rl0 t1; + rl0 <- VPADD_16u16 t1 rl0; + rh3 <- VPSUB_16u16 rl3 t7; + rl3 <- VPADD_16u16 t7 rl3; + rh2 <- VPSUB_16u16 rl2 t5; + rl2 <- VPADD_16u16 t5 rl2; + rh0 <- VPADD_16u16 t0 rh0; + rl0 <- VPSUB_16u16 rl0 t0; + rh1 <- VPADD_16u16 t2 rh1; + rl1 <- VPSUB_16u16 rl1 t2; + rh2 <- VPADD_16u16 t4 rh2; + rl2 <- VPSUB_16u16 rl2 t4; + rh3 <- VPADD_16u16 t6 rh3; + rl3 <- VPSUB_16u16 rl3 t6; + return (rl0, rl1, rl2, rl3, rh0, rh1, rh2, rh3); + } + + proc _poly_ntt (rp:W16.t Array256.t) : W16.t Array256.t = { + var aux: int; + + var zetasp:W16.t Array400.t; + var qx16:W256.t; + var zeta0:W256.t; + var zeta1:W256.t; + var r0:W256.t; + var r1:W256.t; + var r2:W256.t; + var r3:W256.t; + var r4:W256.t; + var r5:W256.t; + var r6:W256.t; + var r7:W256.t; + var i:int; + var zeta2:W256.t; + var zeta3:W256.t; + var vx16:W256.t; + zetasp <- witness; + zetasp <- jzetas_exp; + qx16 <- (get256 (WArray32.init16 (fun i_0 => (jqx16).[i_0])) 0); + zeta0 <- + VPBROADCAST_8u32 (get32 (WArray800.init16 (fun i_0 => (zetasp).[i_0])) 0); + zeta1 <- + VPBROADCAST_8u32 (get32 (WArray800.init16 (fun i_0 => (zetasp).[i_0])) 1); + r0 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 0)); + r1 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 1)); + r2 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 2)); + r3 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 3)); + r4 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 8)); + r5 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 9)); + r6 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 10)); + r7 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 11)); + (r0, r1, r2, r3, r4, r5, r6, r7) <@ __butterfly64x (r0, r1, r2, r3, r4, + r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 0) (r0))); + rp <- + 
Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 1) (r1))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 2) (r2))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 3) (r3))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 8) (r4))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 9) (r5))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 10) (r6))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 11) (r7))); + r0 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 4)); + r1 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 5)); + r2 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 6)); + r3 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 7)); + r4 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 12)); + r5 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 13)); + r6 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 14)); + r7 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 15)); + (r0, r1, r2, r3, r4, r5, r6, r7) <@ __butterfly64x (r0, r1, r2, r3, r4, + r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 12) (r4))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 13) (r5))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 14) (r6))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * 15) (r7))); + i <- 0; + while (i < 2) { + zeta0 <- + VPBROADCAST_8u32 (get32_direct + (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (8 + (392 * i))); + zeta1 <- + VPBROADCAST_8u32 (get32_direct + (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (12 + (392 * i))); + if ((i = 0)) { + r4 <- r0; + r5 <- r1; + r6 <- r2; + r7 <- r3; + } else { + r4 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 4) + (256 * i))); + r5 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 5) + (256 * i))); + r6 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 6) + (256 * i))); + r7 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 7) + (256 * i))); + } + r0 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 0) + (256 * i))); + r1 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 1) + (256 * i))); + r2 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 2) + (256 * i))); + r3 <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) + ((32 * 3) + (256 * i))); + (r0, r1, r2, r3, r4, r5, r6, r7) <@ __butterfly64x (r0, r1, r2, r3, r4, + r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + zeta0 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (16 + (392 * i))); + zeta1 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + 
(48 + (392 * i))); + (r0, r4) <@ __shuffle8 (r0, r4); + (r1, r5) <@ __shuffle8 (r1, r5); + (r2, r6) <@ __shuffle8 (r2, r6); + (r3, r7) <@ __shuffle8 (r3, r7); + (r0, r4, r1, r5, r2, r6, r3, r7) <@ __butterfly64x (r0, r4, r1, r5, r2, + r6, r3, r7, zeta0, zeta0, zeta1, zeta1, qx16); + zeta0 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (80 + (392 * i))); + zeta1 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (112 + (392 * i))); + (r0, r2) <@ __shuffle4 (r0, r2); + (r4, r6) <@ __shuffle4 (r4, r6); + (r1, r3) <@ __shuffle4 (r1, r3); + (r5, r7) <@ __shuffle4 (r5, r7); + (r0, r2, r4, r6, r1, r3, r5, r7) <@ __butterfly64x (r0, r2, r4, r6, r1, + r3, r5, r7, zeta0, zeta0, zeta1, zeta1, qx16); + zeta0 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (144 + (392 * i))); + zeta1 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (176 + (392 * i))); + (r0, r1) <@ __shuffle2 (r0, r1); + (r2, r3) <@ __shuffle2 (r2, r3); + (r4, r5) <@ __shuffle2 (r4, r5); + (r6, r7) <@ __shuffle2 (r6, r7); + (r0, r1, r2, r3, r4, r5, r6, r7) <@ __butterfly64x (r0, r1, r2, r3, r4, + r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + zeta0 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (208 + (392 * i))); + zeta1 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (240 + (392 * i))); + (r0, r4) <@ __shuffle1 (r0, r4); + (r1, r5) <@ __shuffle1 (r1, r5); + (r2, r6) <@ __shuffle1 (r2, r6); + (r3, r7) <@ __shuffle1 (r3, r7); + (r0, r4, r1, r5, r2, r6, r3, r7) <@ __butterfly64x (r0, r4, r1, r5, r2, + r6, r3, r7, zeta0, zeta0, zeta1, zeta1, qx16); + zeta0 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (272 + (392 * i))); + zeta2 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (304 + (392 * i))); + zeta1 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (336 + (392 * i))); + zeta3 <- + (get256_direct (WArray800.init16 (fun i_0 => (zetasp).[i_0])) + (368 + (392 * i))); + (r0, r4, r2, r6, r1, r5, r3, r7) <@ __butterfly64x (r0, r4, r2, r6, r1, + r5, r3, r7, zeta0, zeta1, zeta2, zeta3, qx16); + vx16 <- (get256 (WArray32.init16 (fun i_0 => (jvx16).[i_0])) 0); + r0 <@ __red16x (r0, qx16, vx16); + r4 <@ __red16x (r4, qx16, vx16); + r2 <@ __red16x (r2, qx16, vx16); + r6 <@ __red16x (r6, qx16, vx16); + r1 <@ __red16x (r1, qx16, vx16); + r5 <@ __red16x (r5, qx16, vx16); + r3 <@ __red16x (r3, qx16, vx16); + r7 <@ __red16x (r7, qx16, vx16); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 0) + (256 * i)) (r0))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 1) + (256 * i)) (r4))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 2) + (256 * i)) (r1))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 3) + (256 * i)) (r5))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 4) + (256 * i)) (r2))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 5) + (256 * i)) (r6))); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 6) + (256 * i)) (r3))); + rp <- + Array256.init + (WArray512.get16 
(WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) ((32 * 7) + (256 * i)) (r7))); + i <- i + 1; + } + return (rp); + } + + proc __poly_reduce (rp:W16.t Array256.t) : W16.t Array256.t = { + var aux: int; + + var qx16:W256.t; + var vx16:W256.t; + var i:int; + var r:W256.t; + + qx16 <- (get256 (WArray32.init16 (fun i_0 => (jqx16).[i_0])) 0); + vx16 <- (get256 (WArray32.init16 (fun i_0 => (jvx16).[i_0])) 0); + i <- 0; + while (i < 16) { + r <- + (get256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * i)); + r <@ __red16x (r, qx16, vx16); + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * i) (r))); + i <- i + 1; + } + return (rp); + } + + proc _poly_sub (rp:W16.t Array256.t, ap:W16.t Array256.t, + bp:W16.t Array256.t) : W16.t Array256.t = { + var aux: int; + + var i:int; + var a:W256.t; + var b:W256.t; + var r:W256.t; + + i <- 0; + while (i < 16) { + a <- + (get256_direct (WArray512.init16 (fun i_0 => (ap).[i_0])) (32 * i)); + b <- + (get256_direct (WArray512.init16 (fun i_0 => (bp).[i_0])) (32 * i)); + r <- VPSUB_16u16 a b; + rp <- + Array256.init + (WArray512.get16 (WArray512.set256_direct (WArray512.init16 (fun i_0 => (rp).[i_0])) (32 * i) (r))); + i <- i + 1; + } + return (rp); + } + + proc _poly_tobytes (rp:W64.t, a:W16.t Array256.t) : W16.t Array256.t = { + var aux: int; + + var jqx16_p:W16.t Array16.t; + var qx16:W256.t; + var i:int; + var t0:W256.t; + var t1:W256.t; + var t2:W256.t; + var t3:W256.t; + var t4:W256.t; + var t5:W256.t; + var t6:W256.t; + var t7:W256.t; + var tt:W256.t; + var ttt:W256.t; + jqx16_p <- witness; + jqx16_p <- jqx16; + qx16 <- (get256 (WArray32.init16 (fun i_0 => (jqx16_p).[i_0])) 0); + a <@ _poly_csubq (a); + i <- 0; + while (i < 2) { + t0 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) (8 * i)); + t1 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((8 * i) + 1)); + t2 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((8 * i) + 2)); + t3 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((8 * i) + 3)); + t4 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((8 * i) + 4)); + t5 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((8 * i) + 5)); + t6 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((8 * i) + 6)); + t7 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((8 * i) + 7)); + tt <- VPSLL_16u16 t1 (W8.of_int 12); + tt <- (tt `|` t0); + t0 <- VPSRL_16u16 t1 (W8.of_int 4); + t1 <- VPSLL_16u16 t2 (W8.of_int 8); + t0 <- (t0 `|` t1); + t1 <- VPSRL_16u16 t2 (W8.of_int 8); + t2 <- VPSLL_16u16 t3 (W8.of_int 4); + t1 <- (t1 `|` t2); + t2 <- VPSLL_16u16 t5 (W8.of_int 12); + t2 <- (t2 `|` t4); + t3 <- VPSRL_16u16 t5 (W8.of_int 4); + t4 <- VPSLL_16u16 t6 (W8.of_int 8); + t3 <- (t3 `|` t4); + t4 <- VPSRL_16u16 t6 (W8.of_int 8); + t5 <- VPSLL_16u16 t7 (W8.of_int 4); + t4 <- (t4 `|` t5); + (ttt, t0) <@ __shuffle1 (tt, t0); + (tt, t2) <@ __shuffle1 (t1, t2); + (t1, t4) <@ __shuffle1 (t3, t4); + (t3, tt) <@ __shuffle2 (ttt, tt); + (ttt, t0) <@ __shuffle2 (t1, t0); + (t1, t4) <@ __shuffle2 (t2, t4); + (t2, ttt) <@ __shuffle4 (t3, ttt); + (t3, tt) <@ __shuffle4 (t1, tt); + (t1, t4) <@ __shuffle4 (t0, t4); + (t0, t3) <@ __shuffle8 (t2, t3); + (t2, ttt) <@ __shuffle8 (t1, ttt); + (t1, t4) <@ __shuffle8 (tt, t4); + Glob.mem <- + storeW256 Glob.mem (W64.to_uint (rp + (W64.of_int (192 * i)))) (t0); + Glob.mem <- + storeW256 Glob.mem (W64.to_uint (rp + (W64.of_int ((192 * i) + 32)))) (t2); + Glob.mem <- + storeW256 Glob.mem (W64.to_uint (rp + (W64.of_int ((192 * 
i) + 64)))) (t1); + Glob.mem <- + storeW256 Glob.mem (W64.to_uint (rp + (W64.of_int ((192 * i) + 96)))) (t3); + Glob.mem <- + storeW256 Glob.mem (W64.to_uint (rp + (W64.of_int ((192 * i) + 128)))) (ttt); + Glob.mem <- + storeW256 Glob.mem (W64.to_uint (rp + (W64.of_int ((192 * i) + 160)))) (t4); + i <- i + 1; + } + return (a); + } + + proc _poly_tomsg_1 (rp:W8.t Array32.t, a:W16.t Array256.t) : W8.t Array32.t * + W16.t Array256.t = { + var aux: int; + + var px16:W16.t Array16.t; + var hq:W256.t; + var hhq:W256.t; + var i:int; + var f0:W256.t; + var f1:W256.t; + var g0:W256.t; + var g1:W256.t; + var c:W32.t; + px16 <- witness; + a <@ _poly_csubq (a); + px16 <- hqx16_m1; + hq <- (get256 (WArray32.init16 (fun i_0 => (px16).[i_0])) 0); + px16 <- hhqx16; + hhq <- (get256 (WArray32.init16 (fun i_0 => (px16).[i_0])) 0); + aux <- (256 %/ 32); + i <- 0; + while (i < aux) { + f0 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) (2 * i)); + f1 <- (get256 (WArray512.init16 (fun i_0 => (a).[i_0])) ((2 * i) + 1)); + f0 <- VPSUB_16u16 hq f0; + f1 <- VPSUB_16u16 hq f1; + g0 <- VPSRA_16u16 f0 (W8.of_int 15); + g1 <- VPSRA_16u16 f1 (W8.of_int 15); + f0 <- VPXOR_256 f0 g0; + f1 <- VPXOR_256 f1 g1; + f0 <- VPSUB_16u16 f0 hhq; + f1 <- VPSUB_16u16 f1 hhq; + f0 <- VPACKSS_16u16 f0 f1; + f0 <- VPERMQ f0 (W8.of_int 216); + c <- VPMOVMSKB_u256u32 f0; + rp <- + Array32.init + (WArray32.get8 (WArray32.set32 (WArray32.init8 (fun i_0 => (rp).[i_0])) i (c))); + i <- i + 1; + } + return (rp, a); + } + + proc __polyvec_add2 (r:W16.t Array768.t, b:W16.t Array768.t) : W16.t Array768.t = { + var aux: W16.t Array256.t; + + + + aux <@ _poly_add2 ((Array256.init (fun i => r.[0 + i])), + (Array256.init (fun i => b.[0 + i]))); + r <- Array768.init + (fun i => if 0 <= i < 0 + 256 then aux.[i-0] else r.[i]); + aux <@ _poly_add2 ((Array256.init (fun i => r.[256 + i])), + (Array256.init (fun i => b.[256 + i]))); + r <- Array768.init + (fun i => if 256 <= i < 256 + 256 then aux.[i-256] else r.[i]); + aux <@ _poly_add2 ((Array256.init (fun i => r.[(2 * 256) + i])), + (Array256.init (fun i => b.[(2 * 256) + i]))); + r <- Array768.init + (fun i => if (2 * 256) <= i < (2 * 256) + 256 then aux.[i-(2 * 256)] + else r.[i]); + return (r); + } + + proc __polyvec_csubq (r:W16.t Array768.t) : W16.t Array768.t = { + var aux: W16.t Array256.t; + + + + aux <@ _poly_csubq ((Array256.init (fun i => r.[0 + i]))); + r <- Array768.init + (fun i => if 0 <= i < 0 + 256 then aux.[i-0] else r.[i]); + aux <@ _poly_csubq ((Array256.init (fun i => r.[256 + i]))); + r <- Array768.init + (fun i => if 256 <= i < 256 + 256 then aux.[i-256] else r.[i]); + aux <@ _poly_csubq ((Array256.init (fun i => r.[(2 * 256) + i]))); + r <- Array768.init + (fun i => if (2 * 256) <= i < (2 * 256) + 256 then aux.[i-(2 * 256)] + else r.[i]); + return (r); + } + + proc __polyvec_decompress (rp:W64.t) : W16.t Array768.t = { + var aux: int; + + var r:W16.t Array768.t; + var q:W256.t; + var shufbidx:W256.t; + var sllvdidx:W256.t; + var mask:W256.t; + var i:int; + var k:int; + var f:W256.t; + r <- witness; + q <- VPBROADCAST_8u32 pvd_q_s; + shufbidx <- + (get256 (WArray32.init8 (fun i_0 => (pvd_shufbdidx_s).[i_0])) 0); + sllvdidx <- VPBROADCAST_4u64 pvd_sllvdidx_s; + mask <- VPBROADCAST_8u32 pvd_mask_s; + k <- 0; + while (k < 3) { + aux <- (256 %/ 16); + i <- 0; + while (i < aux) { + f <- + (loadW256 Glob.mem (W64.to_uint (rp + (W64.of_int ((320 * k) + (20 * i)))))); + f <- VPERMQ f (W8.of_int 148); + f <- VPSHUFB_256 f shufbidx; + f <- VPSLLV_8u32 f sllvdidx; + f <- VPSRL_16u16 f 
(W8.of_int 1); + f <- VPAND_256 f mask; + f <- VPMULHRS_16u16 f q; + r <- + Array768.init + (WArray1536.get16 (WArray1536.set256 (WArray1536.init16 (fun i_0 => (r).[i_0])) ((16 * k) + i) (f))); + i <- i + 1; + } + k <- k + 1; + } + return (r); + } + + proc __polyvec_compress (rp:W64.t, a:W16.t Array768.t) : unit = { + var aux: int; + + var x16p:W16.t Array16.t; + var v:W256.t; + var v8:W256.t; + var off:W256.t; + var shift1:W256.t; + var mask:W256.t; + var shift2:W256.t; + var sllvdidx:W256.t; + var shufbidx:W256.t; + var i:int; + var f0:W256.t; + var f1:W256.t; + var f2:W256.t; + var t0:W128.t; + var t1:W128.t; + x16p <- witness; + a <@ __polyvec_csubq (a); + x16p <- jvx16; + v <- (get256 (WArray32.init16 (fun i_0 => (x16p).[i_0])) 0); + v8 <- VPSLL_16u16 v (W8.of_int 3); + off <- VPBROADCAST_16u16 pvc_off_s; + shift1 <- VPBROADCAST_16u16 pvc_shift1_s; + mask <- VPBROADCAST_16u16 pvc_mask_s; + shift2 <- VPBROADCAST_4u64 pvc_shift2_s; + sllvdidx <- VPBROADCAST_4u64 pvc_sllvdidx_s; + shufbidx <- + (get256 (WArray32.init8 (fun i_0 => (pvc_shufbidx_s).[i_0])) 0); + aux <- ((3 * 256) %/ 16); + i <- 0; + while (i < aux) { + f0 <- (get256 (WArray1536.init16 (fun i_0 => (a).[i_0])) i); + f1 <- VPMULL_16u16 f0 v8; + f2 <- VPADD_16u16 f0 off; + f0 <- VPSLL_16u16 f0 (W8.of_int 3); + f0 <- VPMULH_16u16 f0 v; + f2 <- VPSUB_16u16 f1 f2; + f1 <- VPANDN_256 f1 f2; + f1 <- VPSRL_16u16 f1 (W8.of_int 15); + f0 <- VPSUB_16u16 f0 f1; + f0 <- VPMULHRS_16u16 f0 shift1; + f0 <- VPAND_256 f0 mask; + f0 <- VPMADDWD_256 f0 shift2; + f0 <- VPSLLV_8u32 f0 sllvdidx; + f0 <- VPSRL_4u64 f0 (W8.of_int 12); + f0 <- VPSHUFB_256 f0 shufbidx; + t0 <- (truncateu128 f0); + t1 <- VEXTRACTI128 f0 (W8.of_int 1); + t0 <- VPBLEND_8u16 t0 t1 (W8.of_int 224); + Glob.mem <- + storeW128 Glob.mem (W64.to_uint (rp + (W64.of_int (20 * i)))) (t0); + Glob.mem <- + storeW32 Glob.mem (W64.to_uint (rp + (W64.of_int ((20 * i) + 16)))) (VPEXTR_32 t1 + (W8.of_int 0)); + i <- i + 1; + } + return (); + } + + proc __polyvec_compress_1 (rp:W8.t Array960.t, a:W16.t Array768.t) : + W8.t Array960.t = { + var aux: int; + + var x16p:W16.t Array16.t; + var v:W256.t; + var v8:W256.t; + var off:W256.t; + var shift1:W256.t; + var mask:W256.t; + var shift2:W256.t; + var sllvdidx:W256.t; + var shufbidx:W256.t; + var i:int; + var f0:W256.t; + var f1:W256.t; + var f2:W256.t; + var t0:W128.t; + var t1:W128.t; + x16p <- witness; + a <@ __polyvec_csubq (a); + x16p <- jvx16; + v <- (get256 (WArray32.init16 (fun i_0 => (x16p).[i_0])) 0); + v8 <- VPSLL_16u16 v (W8.of_int 3); + off <- VPBROADCAST_16u16 pvc_off_s; + shift1 <- VPBROADCAST_16u16 pvc_shift1_s; + mask <- VPBROADCAST_16u16 pvc_mask_s; + shift2 <- VPBROADCAST_4u64 pvc_shift2_s; + sllvdidx <- VPBROADCAST_4u64 pvc_sllvdidx_s; + shufbidx <- + (get256 (WArray32.init8 (fun i_0 => (pvc_shufbidx_s).[i_0])) 0); + aux <- ((3 * 256) %/ 16); + i <- 0; + while (i < aux) { + f0 <- (get256 (WArray1536.init16 (fun i_0 => (a).[i_0])) i); + f1 <- VPMULL_16u16 f0 v8; + f2 <- VPADD_16u16 f0 off; + f0 <- VPSLL_16u16 f0 (W8.of_int 3); + f0 <- VPMULH_16u16 f0 v; + f2 <- VPSUB_16u16 f1 f2; + f1 <- VPANDN_256 f1 f2; + f1 <- VPSRL_16u16 f1 (W8.of_int 15); + f0 <- VPSUB_16u16 f0 f1; + f0 <- VPMULHRS_16u16 f0 shift1; + f0 <- VPAND_256 f0 mask; + f0 <- VPMADDWD_256 f0 shift2; + f0 <- VPSLLV_8u32 f0 sllvdidx; + f0 <- VPSRL_4u64 f0 (W8.of_int 12); + f0 <- VPSHUFB_256 f0 shufbidx; + t0 <- (truncateu128 f0); + t1 <- VEXTRACTI128 f0 (W8.of_int 1); + t0 <- VPBLEND_8u16 t0 t1 (W8.of_int 224); + rp <- + Array960.init + (WArray960.get8 
(WArray960.set128_direct (WArray960.init8 (fun i_0 => (rp).[i_0])) (20 * i) (t0))); + rp <- + Array960.init + (WArray960.get8 (WArray960.set32_direct (WArray960.init8 (fun i_0 => (rp).[i_0])) ((20 * i) + 16) (VPEXTR_32 t1 + (W8.of_int 0)))); + i <- i + 1; + } + return (rp); + } + + proc __polyvec_frombytes (ap:W64.t) : W16.t Array768.t = { + var aux: W16.t Array256.t; + + var r:W16.t Array768.t; + var pp:W64.t; + r <- witness; + pp <- ap; + aux <@ _poly_frombytes ((Array256.init (fun i => r.[0 + i])), pp); + r <- Array768.init + (fun i => if 0 <= i < 0 + 256 then aux.[i-0] else r.[i]); + pp <- (pp + (W64.of_int 384)); + aux <@ _poly_frombytes ((Array256.init (fun i => r.[256 + i])), pp); + r <- Array768.init + (fun i => if 256 <= i < 256 + 256 then aux.[i-256] else r.[i]); + pp <- (pp + (W64.of_int 384)); + aux <@ _poly_frombytes ((Array256.init (fun i => r.[(2 * 256) + i])), + pp); + r <- Array768.init + (fun i => if (2 * 256) <= i < (2 * 256) + 256 then aux.[i-(2 * 256)] + else r.[i]); + return (r); + } + + proc __polyvec_invntt (r:W16.t Array768.t) : W16.t Array768.t = { + var aux: W16.t Array256.t; + + + + aux <@ _poly_invntt ((Array256.init (fun i => r.[0 + i]))); + r <- Array768.init + (fun i => if 0 <= i < 0 + 256 then aux.[i-0] else r.[i]); + aux <@ _poly_invntt ((Array256.init (fun i => r.[256 + i]))); + r <- Array768.init + (fun i => if 256 <= i < 256 + 256 then aux.[i-256] else r.[i]); + aux <@ _poly_invntt ((Array256.init (fun i => r.[(2 * 256) + i]))); + r <- Array768.init + (fun i => if (2 * 256) <= i < (2 * 256) + 256 then aux.[i-(2 * 256)] + else r.[i]); + return (r); + } + + proc __polyvec_ntt (r:W16.t Array768.t) : W16.t Array768.t = { + var aux: W16.t Array256.t; + + + + aux <@ _poly_ntt ((Array256.init (fun i => r.[0 + i]))); + r <- Array768.init + (fun i => if 0 <= i < 0 + 256 then aux.[i-0] else r.[i]); + aux <@ _poly_ntt ((Array256.init (fun i => r.[256 + i]))); + r <- Array768.init + (fun i => if 256 <= i < 256 + 256 then aux.[i-256] else r.[i]); + aux <@ _poly_ntt ((Array256.init (fun i => r.[(2 * 256) + i]))); + r <- Array768.init + (fun i => if (2 * 256) <= i < (2 * 256) + 256 then aux.[i-(2 * 256)] + else r.[i]); + return (r); + } + + proc __polyvec_reduce (r:W16.t Array768.t) : W16.t Array768.t = { + var aux: W16.t Array256.t; + + + + aux <@ __poly_reduce ((Array256.init (fun i => r.[0 + i]))); + r <- Array768.init + (fun i => if 0 <= i < 0 + 256 then aux.[i-0] else r.[i]); + aux <@ __poly_reduce ((Array256.init (fun i => r.[256 + i]))); + r <- Array768.init + (fun i => if 256 <= i < 256 + 256 then aux.[i-256] else r.[i]); + aux <@ __poly_reduce ((Array256.init (fun i => r.[(2 * 256) + i]))); + r <- Array768.init + (fun i => if (2 * 256) <= i < (2 * 256) + 256 then aux.[i-(2 * 256)] + else r.[i]); + return (r); + } + + proc __polyvec_pointwise_acc (r:W16.t Array256.t, a:W16.t Array768.t, + b:W16.t Array768.t) : W16.t Array256.t = { + + var t:W16.t Array256.t; + t <- witness; + r <@ _poly_basemul (r, (Array256.init (fun i => a.[0 + i])), + (Array256.init (fun i => b.[0 + i]))); + t <@ _poly_basemul (t, (Array256.init (fun i => a.[256 + i])), + (Array256.init (fun i => b.[256 + i]))); + r <@ _poly_add2 (r, t); + t <@ _poly_basemul (t, (Array256.init (fun i => a.[(2 * 256) + i])), + (Array256.init (fun i => b.[(2 * 256) + i]))); + r <@ _poly_add2 (r, t); + return (r); + } + + proc __polyvec_tobytes (rp:W64.t, a:W16.t Array768.t) : unit = { + var aux: W16.t Array256.t; + + var pp:W64.t; + + pp <- rp; + aux <@ _poly_tobytes (pp, (Array256.init (fun i => a.[0 + 
i]))); + a <- Array768.init + (fun i => if 0 <= i < 0 + 256 then aux.[i-0] else a.[i]); + pp <- (pp + (W64.of_int 384)); + aux <@ _poly_tobytes (pp, (Array256.init (fun i => a.[256 + i]))); + a <- Array768.init + (fun i => if 256 <= i < 256 + 256 then aux.[i-256] else a.[i]); + pp <- (pp + (W64.of_int 384)); + aux <@ _poly_tobytes (pp, (Array256.init (fun i => a.[(2 * 256) + i]))); + a <- Array768.init + (fun i => if (2 * 256) <= i < (2 * 256) + 256 then aux.[i-(2 * 256)] + else a.[i]); + return (); + } + + proc __rej_uniform (rp:W16.t Array256.t, offset:W64.t, buf:W8.t Array168.t) : + W64.t * W16.t Array256.t = { + + var ctr:W64.t; + var pos:W64.t; + var exit:W64.t; + var val1:W16.t; + var t:W16.t; + var val2:W16.t; + var cnd0:W64.t; + var cnd1:W64.t; + + ctr <- offset; + pos <- (W64.of_int 0); + exit <- (W64.of_int 0); + + while ((exit = (W64.of_int 0))) { + val1 <- (zeroextu16 buf.[(W64.to_uint pos)]); + pos <- (pos + (W64.of_int 1)); + t <- (zeroextu16 buf.[(W64.to_uint pos)]); + val2 <- t; + val2 <- (val2 `>>` (W8.of_int 4)); + t <- (t `&` (W16.of_int 15)); + t <- (t `<<` (W8.of_int 8)); + val1 <- (val1 `|` t); + pos <- (pos + (W64.of_int 1)); + t <- (zeroextu16 buf.[(W64.to_uint pos)]); + t <- (t `<<` (W8.of_int 4)); + val2 <- (val2 `|` t); + pos <- (pos + (W64.of_int 1)); + if ((val1 \ult (W16.of_int 3329))) { + rp.[(W64.to_uint ctr)] <- val1; + ctr <- (ctr + (W64.of_int 1)); + } else { + + } + if ((val2 \ult (W16.of_int 3329))) { + if ((ctr \ult (W64.of_int 256))) { + rp.[(W64.to_uint ctr)] <- val2; + ctr <- (ctr + (W64.of_int 1)); + } else { + + } + } else { + + } + cnd0 <- (W64.of_int 256); + cnd0 <- (cnd0 - ctr); + cnd0 <- (cnd0 - (W64.of_int 1)); + cnd1 <- (W64.of_int 168); + cnd1 <- (cnd1 - pos); + cnd1 <- (cnd1 - (W64.of_int 3)); + exit <- (cnd0 `|` cnd1); + exit <- (exit `>>` (W8.of_int 63)); + } + return (ctr, rp); + } + + proc __gen_matrix (seed:W8.t Array32.t, transposed:W64.t) : W16.t Array2304.t = { + var aux: int; + var aux_0: W16.t Array256.t; + + var r:W16.t Array2304.t; + var stransposed:W64.t; + var j:int; + var c:W8.t; + var extseed:W8.t Array34.t; + var i:int; + var state:W64.t Array25.t; + var ctr:W64.t; + var sctr:W64.t; + var buf:W8.t Array168.t; + var poly:W16.t Array256.t; + var k:W64.t; + var l:W64.t; + var t:W16.t; + buf <- witness; + extseed <- witness; + poly <- witness; + r <- witness; + state <- witness; + stransposed <- transposed; + j <- 0; + while (j < 32) { + c <- seed.[j]; + extseed.[j] <- c; + j <- j + 1; + } + i <- 0; + while (i < 3) { + j <- 0; + while (j < 3) { + transposed <- stransposed; + if ((transposed = (W64.of_int 0))) { + extseed.[32] <- (W8.of_int j); + extseed.[(32 + 1)] <- (W8.of_int i); + } else { + extseed.[32] <- (W8.of_int i); + extseed.[(32 + 1)] <- (W8.of_int j); + } + state <@ _shake128_absorb34 (state, extseed); + ctr <- (W64.of_int 0); + + while ((ctr \ult (W64.of_int 256))) { + sctr <- ctr; + (state, buf) <@ _shake128_squeezeblock (state, buf); + ctr <- sctr; + (ctr, poly) <@ __rej_uniform (poly, ctr, buf); + } + k <- (W64.of_int 0); + l <- (W64.of_int ((i * (3 * 256)) + (j * 256))); + + while ((k \ult (W64.of_int 256))) { + t <- poly.[(W64.to_uint k)]; + r.[(W64.to_uint l)] <- t; + k <- (k + (W64.of_int 1)); + l <- (l + (W64.of_int 1)); + } + j <- j + 1; + } + i <- i + 1; + } + i <- 0; + while (i < 3) { + j <- 0; + while (j < 3) { + aux_0 <@ _nttunpack ((Array256.init (fun i_0 => r.[((i * (3 * 256)) + (j * 256)) + i_0]))); + r <- Array2304.init + (fun i_0 => if ((i * (3 * 256)) + (j * 256)) <= i_0 < ((i * (3 * 256)) + (j 
* 256)) + 256 + then aux_0.[i_0-((i * (3 * 256)) + (j * 256))] else r.[i_0]); + j <- j + 1; + } + i <- i + 1; + } + return (r); + } + + proc __indcpa_keypair (pkp:W64.t, skp:W64.t, randomnessp:W8.t Array32.t) : unit = { + var aux: int; + var aux_3: W16.t Array256.t; + var aux_2: W16.t Array256.t; + var aux_1: W16.t Array256.t; + var aux_0: W16.t Array256.t; + + var spkp:W64.t; + var sskp:W64.t; + var i:int; + var t64:W64.t; + var inbuf:W8.t Array32.t; + var buf:W8.t Array64.t; + var publicseed:W8.t Array32.t; + var noiseseed:W8.t Array32.t; + var aa:W16.t Array2304.t; + var nonce:W8.t; + var skpv:W16.t Array768.t; + var e:W16.t Array768.t; + var pkpv:W16.t Array768.t; + aa <- witness; + buf <- witness; + e <- witness; + inbuf <- witness; + noiseseed <- witness; + pkpv <- witness; + publicseed <- witness; + skpv <- witness; + spkp <- pkp; + sskp <- skp; + aux <- (32 %/ 8); + i <- 0; + while (i < aux) { + t64 <- (get64 (WArray32.init8 (fun i_0 => (randomnessp).[i_0])) i); + inbuf <- + Array32.init + (WArray32.get8 (WArray32.set64 (WArray32.init8 (fun i_0 => (inbuf).[i_0])) i (t64))); + i <- i + 1; + } + buf <@ _sha3_512_32 (buf, inbuf); + aux <- (32 %/ 8); + i <- 0; + while (i < aux) { + t64 <- (get64 (WArray64.init8 (fun i_0 => (buf).[i_0])) i); + publicseed <- + Array32.init + (WArray32.get8 (WArray32.set64 (WArray32.init8 (fun i_0 => (publicseed).[i_0])) i (t64))); + t64 <- + (get64 (WArray64.init8 (fun i_0 => (buf).[i_0])) (i + (32 %/ 8))); + noiseseed <- + Array32.init + (WArray32.get8 (WArray32.set64 (WArray32.init8 (fun i_0 => (noiseseed).[i_0])) i (t64))); + i <- i + 1; + } + aa <@ __gen_matrix (publicseed, (W64.of_int 0)); + nonce <- (W8.of_int 0); + (aux_3, aux_2, aux_1, + aux_0) <@ _poly_getnoise_eta1_4x ((Array256.init (fun i_0 => skpv.[0 + i_0])), + (Array256.init (fun i_0 => skpv.[256 + i_0])), + (Array256.init (fun i_0 => skpv.[(2 * 256) + i_0])), + (Array256.init (fun i_0 => e.[0 + i_0])), noiseseed, nonce); + skpv <- Array768.init + (fun i_0 => if 0 <= i_0 < 0 + 256 then aux_3.[i_0-0] + else skpv.[i_0]); + skpv <- Array768.init + (fun i_0 => if 256 <= i_0 < 256 + 256 then aux_2.[i_0-256] + else skpv.[i_0]); + skpv <- Array768.init + (fun i_0 => if (2 * 256) <= i_0 < (2 * 256) + 256 + then aux_1.[i_0-(2 * 256)] else skpv.[i_0]); + e <- Array768.init + (fun i_0 => if 0 <= i_0 < 0 + 256 then aux_0.[i_0-0] else e.[i_0]); + nonce <- (W8.of_int 4); + (aux_3, aux_2, aux_1, + aux_0) <@ _poly_getnoise_eta1_4x ((Array256.init (fun i_0 => e.[256 + i_0])), + (Array256.init (fun i_0 => e.[(2 * 256) + i_0])), + (Array256.init (fun i_0 => pkpv.[0 + i_0])), + (Array256.init (fun i_0 => pkpv.[256 + i_0])), noiseseed, nonce); + e <- Array768.init + (fun i_0 => if 256 <= i_0 < 256 + 256 then aux_3.[i_0-256] + else e.[i_0]); + e <- Array768.init + (fun i_0 => if (2 * 256) <= i_0 < (2 * 256) + 256 + then aux_2.[i_0-(2 * 256)] else e.[i_0]); + pkpv <- Array768.init + (fun i_0 => if 0 <= i_0 < 0 + 256 then aux_1.[i_0-0] + else pkpv.[i_0]); + pkpv <- Array768.init + (fun i_0 => if 256 <= i_0 < 256 + 256 then aux_0.[i_0-256] + else pkpv.[i_0]); + skpv <@ __polyvec_ntt (skpv); + e <@ __polyvec_ntt (e); + i <- 0; + while (i < 3) { + aux_3 <@ __polyvec_pointwise_acc ((Array256.init (fun i_0 => pkpv.[(i * 256) + i_0])), + (Array768.init (fun i_0 => aa.[(i * (3 * 256)) + i_0])), skpv); + pkpv <- Array768.init + (fun i_0 => if (i * 256) <= i_0 < (i * 256) + 256 + then aux_3.[i_0-(i * 256)] else pkpv.[i_0]); + aux_3 <@ _poly_frommont ((Array256.init (fun i_0 => pkpv.[(i * 256) + i_0]))); + pkpv <- 
Array768.init + (fun i_0 => if (i * 256) <= i_0 < (i * 256) + 256 + then aux_3.[i_0-(i * 256)] else pkpv.[i_0]); + i <- i + 1; + } + pkpv <@ __polyvec_add2 (pkpv, e); + pkpv <@ __polyvec_reduce (pkpv); + pkp <- spkp; + skp <- sskp; + __polyvec_tobytes (skp, skpv); + __polyvec_tobytes (pkp, pkpv); + pkp <- (pkp + (W64.of_int (3 * 384))); + aux <- (32 %/ 8); + i <- 0; + while (i < aux) { + t64 <- (get64 (WArray32.init8 (fun i_0 => (publicseed).[i_0])) i); + Glob.mem <- + storeW64 Glob.mem (W64.to_uint (pkp + (W64.of_int 0))) (t64); + pkp <- (pkp + (W64.of_int 8)); + i <- i + 1; + } + return (); + } + + proc __indcpa_enc_0 (sctp:W64.t, msgp:W8.t Array32.t, pkp:W64.t, + noiseseed:W8.t Array32.t) : unit = { + var aux_3: int; + var aux_2: W16.t Array256.t; + var aux_1: W16.t Array256.t; + var aux_0: W16.t Array256.t; + var aux: W16.t Array256.t; + + var pkpv:W16.t Array768.t; + var i:W64.t; + var t64:W64.t; + var publicseed:W8.t Array32.t; + var k:W16.t Array256.t; + var s_noiseseed:W8.t Array32.t; + var aat:W16.t Array2304.t; + var lnoiseseed:W8.t Array32.t; + var nonce:W8.t; + var sp_0:W16.t Array768.t; + var ep:W16.t Array768.t; + var epp:W16.t Array256.t; + var bp:W16.t Array768.t; + var w:int; + var v:W16.t Array256.t; + var ctp:W64.t; + aat <- witness; + bp <- witness; + ep <- witness; + epp <- witness; + k <- witness; + lnoiseseed <- witness; + pkpv <- witness; + publicseed <- witness; + s_noiseseed <- witness; + sp_0 <- witness; + v <- witness; + pkpv <@ __polyvec_frombytes (pkp); + i <- (W64.of_int 0); + pkp <- (pkp + (W64.of_int (3 * 384))); + + while ((i \ult (W64.of_int (32 %/ 8)))) { + t64 <- (loadW64 Glob.mem (W64.to_uint (pkp + (W64.of_int 0)))); + publicseed <- + Array32.init + (WArray32.get8 (WArray32.set64_direct (WArray32.init8 (fun i_0 => (publicseed).[i_0])) (8 * (W64.to_uint i)) (t64))); + pkp <- (pkp + (W64.of_int 8)); + i <- (i + (W64.of_int 1)); + } + k <@ _poly_frommsg_1 (k, msgp); + s_noiseseed <- noiseseed; + aat <@ __gen_matrix (publicseed, (W64.of_int 1)); + lnoiseseed <- s_noiseseed; + nonce <- (W8.of_int 0); + (aux_2, aux_1, aux_0, + aux) <@ _poly_getnoise_eta1_4x ((Array256.init (fun i_0 => sp_0.[0 + i_0])), + (Array256.init (fun i_0 => sp_0.[256 + i_0])), + (Array256.init (fun i_0 => sp_0.[(2 * 256) + i_0])), + (Array256.init (fun i_0 => ep.[0 + i_0])), lnoiseseed, nonce); + sp_0 <- Array768.init + (fun i_0 => if 0 <= i_0 < 0 + 256 then aux_2.[i_0-0] + else sp_0.[i_0]); + sp_0 <- Array768.init + (fun i_0 => if 256 <= i_0 < 256 + 256 then aux_1.[i_0-256] + else sp_0.[i_0]); + sp_0 <- Array768.init + (fun i_0 => if (2 * 256) <= i_0 < (2 * 256) + 256 + then aux_0.[i_0-(2 * 256)] else sp_0.[i_0]); + ep <- Array768.init + (fun i_0 => if 0 <= i_0 < 0 + 256 then aux.[i_0-0] else ep.[i_0]); + nonce <- (W8.of_int 4); + (aux_2, aux_1, aux_0, + aux) <@ _poly_getnoise_eta1_4x ((Array256.init (fun i_0 => ep.[256 + i_0])), + (Array256.init (fun i_0 => ep.[(2 * 256) + i_0])), epp, + (Array256.init (fun i_0 => bp.[0 + i_0])), lnoiseseed, nonce); + ep <- Array768.init + (fun i_0 => if 256 <= i_0 < 256 + 256 then aux_2.[i_0-256] + else ep.[i_0]); + ep <- Array768.init + (fun i_0 => if (2 * 256) <= i_0 < (2 * 256) + 256 + then aux_1.[i_0-(2 * 256)] else ep.[i_0]); + epp <- aux_0; + bp <- Array768.init + (fun i_0 => if 0 <= i_0 < 0 + 256 then aux.[i_0-0] else bp.[i_0]); + sp_0 <@ __polyvec_ntt (sp_0); + w <- 0; + while (w < 3) { + aux_2 <@ __polyvec_pointwise_acc ((Array256.init (fun i_0 => bp.[(w * 256) + i_0])), + (Array768.init (fun i_0 => aat.[(w * (3 * 256)) + i_0])), sp_0); + 
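(* Each iteration of this loop computes one NTT-domain inner product: row w of the (transposed) matrix aat times the secret vector sp_0, via __polyvec_pointwise_acc; the Array768.init update below splices the resulting 256-coefficient polynomial back into row w of bp. *) +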
bp <- Array768.init + (fun i_0 => if (w * 256) <= i_0 < (w * 256) + 256 + then aux_2.[i_0-(w * 256)] else bp.[i_0]); + w <- w + 1; + } + v <@ __polyvec_pointwise_acc (v, pkpv, sp_0); + bp <@ __polyvec_invntt (bp); + v <@ _poly_invntt (v); + bp <@ __polyvec_add2 (bp, ep); + v <@ _poly_add2 (v, epp); + v <@ _poly_add2 (v, k); + bp <@ __polyvec_reduce (bp); + v <@ __poly_reduce (v); + ctp <- sctp; + __polyvec_compress (ctp, bp); + ctp <- (ctp + (W64.of_int (3 * 320))); + v <@ _poly_compress (ctp, v); + return (); + } + + proc __indcpa_enc_1 (ctp:W8.t Array1088.t, msgp:W8.t Array32.t, pkp:W64.t, + noiseseed:W8.t Array32.t) : W8.t Array1088.t = { + var aux_3: int; + var aux_5: W8.t Array128.t; + var aux_4: W8.t Array960.t; + var aux_2: W16.t Array256.t; + var aux_1: W16.t Array256.t; + var aux_0: W16.t Array256.t; + var aux: W16.t Array256.t; + + var sctp:W8.t Array1088.t; + var pkpv:W16.t Array768.t; + var i:W64.t; + var t64:W64.t; + var publicseed:W8.t Array32.t; + var k:W16.t Array256.t; + var s_noiseseed:W8.t Array32.t; + var aat:W16.t Array2304.t; + var lnoiseseed:W8.t Array32.t; + var nonce:W8.t; + var sp_0:W16.t Array768.t; + var ep:W16.t Array768.t; + var epp:W16.t Array256.t; + var bp:W16.t Array768.t; + var w:int; + var v:W16.t Array256.t; + aat <- witness; + bp <- witness; + ep <- witness; + epp <- witness; + k <- witness; + lnoiseseed <- witness; + pkpv <- witness; + publicseed <- witness; + s_noiseseed <- witness; + sctp <- witness; + sp_0 <- witness; + v <- witness; + sctp <- ctp; + pkpv <@ __polyvec_frombytes (pkp); + i <- (W64.of_int 0); + pkp <- (pkp + (W64.of_int (3 * 384))); + + while ((i \ult (W64.of_int (32 %/ 8)))) { + t64 <- (loadW64 Glob.mem (W64.to_uint (pkp + (W64.of_int 0)))); + publicseed <- + Array32.init + (WArray32.get8 (WArray32.set64_direct (WArray32.init8 (fun i_0 => (publicseed).[i_0])) (8 * (W64.to_uint i)) (t64))); + pkp <- (pkp + (W64.of_int 8)); + i <- (i + (W64.of_int 1)); + } + k <@ _poly_frommsg_1 (k, msgp); + s_noiseseed <- noiseseed; + aat <@ __gen_matrix (publicseed, (W64.of_int 1)); + lnoiseseed <- s_noiseseed; + nonce <- (W8.of_int 0); + (aux_2, aux_1, aux_0, + aux) <@ _poly_getnoise_eta1_4x ((Array256.init (fun i_0 => sp_0.[0 + i_0])), + (Array256.init (fun i_0 => sp_0.[256 + i_0])), + (Array256.init (fun i_0 => sp_0.[(2 * 256) + i_0])), + (Array256.init (fun i_0 => ep.[0 + i_0])), lnoiseseed, nonce); + sp_0 <- Array768.init + (fun i_0 => if 0 <= i_0 < 0 + 256 then aux_2.[i_0-0] + else sp_0.[i_0]); + sp_0 <- Array768.init + (fun i_0 => if 256 <= i_0 < 256 + 256 then aux_1.[i_0-256] + else sp_0.[i_0]); + sp_0 <- Array768.init + (fun i_0 => if (2 * 256) <= i_0 < (2 * 256) + 256 + then aux_0.[i_0-(2 * 256)] else sp_0.[i_0]); + ep <- Array768.init + (fun i_0 => if 0 <= i_0 < 0 + 256 then aux.[i_0-0] else ep.[i_0]); + nonce <- (W8.of_int 4); + (aux_2, aux_1, aux_0, + aux) <@ _poly_getnoise_eta1_4x ((Array256.init (fun i_0 => ep.[256 + i_0])), + (Array256.init (fun i_0 => ep.[(2 * 256) + i_0])), epp, + (Array256.init (fun i_0 => bp.[0 + i_0])), lnoiseseed, nonce); + ep <- Array768.init + (fun i_0 => if 256 <= i_0 < 256 + 256 then aux_2.[i_0-256] + else ep.[i_0]); + ep <- Array768.init + (fun i_0 => if (2 * 256) <= i_0 < (2 * 256) + 256 + then aux_1.[i_0-(2 * 256)] else ep.[i_0]); + epp <- aux_0; + bp <- Array768.init + (fun i_0 => if 0 <= i_0 < 0 + 256 then aux.[i_0-0] else bp.[i_0]); + sp_0 <@ __polyvec_ntt (sp_0); + w <- 0; + while (w < 3) { + aux_2 <@ __polyvec_pointwise_acc ((Array256.init (fun i_0 => bp.[(w * 256) + i_0])), + (Array768.init (fun 
i_0 => aat.[(w * (3 * 256)) + i_0])), sp_0); + bp <- Array768.init + (fun i_0 => if (w * 256) <= i_0 < (w * 256) + 256 + then aux_2.[i_0-(w * 256)] else bp.[i_0]); + w <- w + 1; + } + v <@ __polyvec_pointwise_acc (v, pkpv, sp_0); + bp <@ __polyvec_invntt (bp); + v <@ _poly_invntt (v); + bp <@ __polyvec_add2 (bp, ep); + v <@ _poly_add2 (v, epp); + v <@ _poly_add2 (v, k); + bp <@ __polyvec_reduce (bp); + v <@ __poly_reduce (v); + ctp <- sctp; + aux_4 <@ __polyvec_compress_1 ((Array960.init (fun i_0 => ctp.[0 + i_0])), + bp); + ctp <- Array1088.init + (fun i_0 => if 0 <= i_0 < 0 + 960 then aux_4.[i_0-0] + else ctp.[i_0]); + (aux_5, + aux_2) <@ _poly_compress_1 ((Array128.init (fun i_0 => ctp.[(3 * 320) + i_0])), + v); + ctp <- Array1088.init + (fun i_0 => if (3 * 320) <= i_0 < (3 * 320) + 128 + then aux_5.[i_0-(3 * 320)] else ctp.[i_0]); + v <- aux_2; + return (ctp); + } + + proc __indcpa_dec_1 (msgp:W8.t Array32.t, ctp:W64.t, skp:W64.t) : W8.t Array32.t = { + + var bp:W16.t Array768.t; + var v:W16.t Array256.t; + var skpv:W16.t Array768.t; + var t:W16.t Array256.t; + var mp:W16.t Array256.t; + bp <- witness; + mp <- witness; + skpv <- witness; + t <- witness; + v <- witness; + bp <@ __polyvec_decompress (ctp); + ctp <- (ctp + (W64.of_int (3 * 320))); + v <@ _poly_decompress (v, ctp); + skpv <@ __polyvec_frombytes (skp); + bp <@ __polyvec_ntt (bp); + t <@ __polyvec_pointwise_acc (t, skpv, bp); + t <@ _poly_invntt (t); + mp <@ _poly_sub (mp, v, t); + mp <@ __poly_reduce (mp); + (msgp, mp) <@ _poly_tomsg_1 (msgp, mp); + return (msgp); + } + + proc __verify (ctp:W64.t, ctpc:W8.t Array1088.t) : W64.t = { + var aux: int; + + var cnd:W64.t; + var t64:W64.t; + var h:W256.t; + var i:int; + var f:W256.t; + var g:W256.t; + var zf:bool; + var off:int; + var t1:W8.t; + var t2:W8.t; + var _0:bool; + var _1:bool; + var _2:bool; + var _3:bool; + + cnd <- (W64.of_int 0); + t64 <- (W64.of_int 1); + h <- set0_256 ; + aux <- (((3 * 320) + 128) %/ 32); + i <- 0; + while (i < aux) { + f <- + (get256_direct (WArray1088.init8 (fun i_0 => (ctpc).[i_0])) (32 * i)); + g <- (loadW256 Glob.mem (W64.to_uint (ctp + (W64.of_int (32 * i))))); + f <- VPXOR_256 f g; + h <- VPOR_256 h f; + i <- i + 1; + } + ( _0, _1, _2, _3, zf) <- VPTEST_256 h h; + cnd <- ((! zf) ? 
t64 : cnd); + off <- ((((3 * 320) + 128) %/ 32) * 32); + aux <- ((3 * 320) + 128); + i <- off; + while (i < aux) { + t1 <- (get8_direct (WArray1088.init8 (fun i_0 => (ctpc).[i_0])) i); + t2 <- (loadW8 Glob.mem (W64.to_uint (ctp + (W64.of_int i)))); + t1 <- (t1 `^` t2); + t64 <- (zeroextu64 t1); + cnd <- (cnd `|` t64); + i <- i + 1; + } + cnd <- (- cnd); + cnd <- (cnd `>>` (W8.of_int 63)); + return (cnd); + } + + proc __cmov (dst:W8.t Array32.t, src:W64.t, cnd:W64.t) : W8.t Array32.t = { + var aux: int; + + var scnd:W64.t; + var m:W256.t; + var i:int; + var f:W256.t; + var g:W256.t; + var off:int; + var bcond:W8.t; + var t1:W8.t; + var t2:W8.t; + + cnd <- (- cnd); + scnd <- cnd; + m <- VPBROADCAST_4u64 scnd; + aux <- (32 %/ 32); + i <- 0; + while (i < aux) { + f <- + (get256_direct (WArray32.init8 (fun i_0 => (dst).[i_0])) (32 * i)); + g <- (loadW256 Glob.mem (W64.to_uint (src + (W64.of_int (32 * i))))); + f <- VPBLENDVB_256 f g m; + dst <- + Array32.init + (WArray32.get8 (WArray32.set256_direct (WArray32.init8 (fun i_0 => (dst).[i_0])) (32 * i) (f))); + i <- i + 1; + } + off <- ((32 %/ 32) * 32); + bcond <- (truncateu8 cnd); + i <- off; + while (i < 32) { + t1 <- (get8_direct (WArray32.init8 (fun i_0 => (dst).[i_0])) i); + t2 <- (loadW8 Glob.mem (W64.to_uint (src + (W64.of_int i)))); + t2 <- (t2 `^` t1); + t2 <- (t2 `&` (truncateu8 cnd)); + t1 <- (t1 `^` t2); + dst <- + Array32.init + (WArray32.get8 (WArray32.set8_direct (WArray32.init8 (fun i_0 => (dst).[i_0])) i (t1))); + i <- i + 1; + } + return (dst); + } + + proc __crypto_kem_keypair_jazz (pkp:W64.t, skp:W64.t, + randomnessp:W8.t Array64.t) : unit = { + var aux: int; + + var s_randomnessp:W8.t Array64.t; + var s_pkp:W64.t; + var s_skp:W64.t; + var randomnessp1:W8.t Array32.t; + var i:int; + var t64:W64.t; + var h_pk:W8.t Array32.t; + var randomnessp2:W8.t Array32.t; + h_pk <- witness; + randomnessp1 <- witness; + randomnessp2 <- witness; + s_randomnessp <- witness; + s_randomnessp <- randomnessp; + s_pkp <- pkp; + s_skp <- skp; + randomnessp1 <- (Array32.init (fun i_0 => randomnessp.[0 + i_0])); + __indcpa_keypair (pkp, skp, randomnessp1); + skp <- s_skp; + skp <- (skp + (W64.of_int (3 * 384))); + pkp <- s_pkp; + aux <- (((3 * 384) + 32) %/ 8); + i <- 0; + while (i < aux) { + t64 <- (loadW64 Glob.mem (W64.to_uint (pkp + (W64.of_int (8 * i))))); + Glob.mem <- + storeW64 Glob.mem (W64.to_uint (skp + (W64.of_int 0))) (t64); + skp <- (skp + (W64.of_int 8)); + i <- i + 1; + } + s_skp <- skp; + pkp <- s_pkp; + t64 <- (W64.of_int ((3 * 384) + 32)); + h_pk <@ _isha3_256 (h_pk, pkp, t64); + skp <- s_skp; + i <- 0; + while (i < 4) { + t64 <- (get64 (WArray32.init8 (fun i_0 => (h_pk).[i_0])) i); + Glob.mem <- + storeW64 Glob.mem (W64.to_uint (skp + (W64.of_int 0))) (t64); + skp <- (skp + (W64.of_int 8)); + i <- i + 1; + } + randomnessp <- s_randomnessp; + randomnessp2 <- (Array32.init (fun i_0 => randomnessp.[32 + i_0])); + aux <- (32 %/ 8); + i <- 0; + while (i < aux) { + t64 <- (get64 (WArray32.init8 (fun i_0 => (randomnessp2).[i_0])) i); + Glob.mem <- + storeW64 Glob.mem (W64.to_uint (skp + (W64.of_int 0))) (t64); + skp <- (skp + (W64.of_int 8)); + i <- i + 1; + } + return (); + } + + proc __crypto_kem_enc_jazz (ctp:W64.t, shkp:W64.t, pkp:W64.t, + randomnessp:W8.t Array32.t) : unit = { + var aux: int; + var aux_0: W8.t Array32.t; + + var s_pkp:W64.t; + var s_ctp:W64.t; + var s_shkp:W64.t; + var i:int; + var t64:W64.t; + var kr:W8.t Array64.t; + var buf:W8.t Array64.t; + buf <- witness; + kr <- witness; + s_pkp <- pkp; + s_ctp <- ctp; + s_shkp 
<- shkp; + aux <- (32 %/ 8); + i <- 0; + while (i < aux) { + t64 <- (get64 (WArray32.init8 (fun i_0 => (randomnessp).[i_0])) i); + kr <- + Array64.init + (WArray64.get8 (WArray64.set64 (WArray64.init8 (fun i_0 => (kr).[i_0])) i (t64))); + i <- i + 1; + } + t64 <- (W64.of_int 32); + aux_0 <@ _isha3_256_32 ((Array32.init (fun i_0 => buf.[0 + i_0])), + (Array32.init (fun i_0 => kr.[0 + i_0]))); + buf <- Array64.init + (fun i_0 => if 0 <= i_0 < 0 + 32 then aux_0.[i_0-0] + else buf.[i_0]); + pkp <- s_pkp; + t64 <- (W64.of_int ((3 * 384) + 32)); + aux_0 <@ _isha3_256 ((Array32.init (fun i_0 => buf.[32 + i_0])), pkp, + t64); + buf <- Array64.init + (fun i_0 => if 32 <= i_0 < 32 + 32 then aux_0.[i_0-32] + else buf.[i_0]); + kr <@ _sha3_512_64 (kr, buf); + pkp <- s_pkp; + __indcpa_enc_0 (s_ctp, (Array32.init (fun i_0 => buf.[0 + i_0])), pkp, + (Array32.init (fun i_0 => kr.[32 + i_0]))); + ctp <- s_ctp; + t64 <- (W64.of_int ((3 * 320) + 128)); + aux_0 <@ _isha3_256 ((Array32.init (fun i_0 => kr.[32 + i_0])), ctp, + t64); + kr <- Array64.init + (fun i_0 => if 32 <= i_0 < 32 + 32 then aux_0.[i_0-32] + else kr.[i_0]); + shkp <- s_shkp; + t64 <- (W64.of_int 32); + _shake256_64 (shkp, t64, kr); + return (); + } + + proc __crypto_kem_dec_jazz (shkp:W64.t, ctp:W64.t, skp:W64.t) : unit = { + var aux_0: int; + var aux: W8.t Array32.t; + + var s_shkp:W64.t; + var s_ctp:W64.t; + var buf:W8.t Array64.t; + var hp:W64.t; + var i:int; + var t64:W64.t; + var s_skp:W64.t; + var kr:W8.t Array64.t; + var pkp:W64.t; + var ctpc:W8.t Array1088.t; + var cnd:W64.t; + var zp:W64.t; + buf <- witness; + ctpc <- witness; + kr <- witness; + s_shkp <- shkp; + s_ctp <- ctp; + aux <@ __indcpa_dec_1 ((Array32.init (fun i_0 => buf.[0 + i_0])), ctp, + skp); + buf <- Array64.init + (fun i_0 => if 0 <= i_0 < 0 + 32 then aux.[i_0-0] else buf.[i_0]); + hp <- (skp + (W64.of_int 32)); + hp <- (hp + (W64.of_int (((24 * 3) * 256) `|>>` 3))); + aux_0 <- (32 %/ 8); + i <- 0; + while (i < aux_0) { + t64 <- (loadW64 Glob.mem (W64.to_uint (hp + (W64.of_int (8 * i))))); + buf <- + Array64.init + (WArray64.get8 (WArray64.set64_direct (WArray64.init8 (fun i_0 => (buf).[i_0])) (32 + (8 * i)) (t64))); + i <- i + 1; + } + s_skp <- skp; + kr <@ _sha3_512_64 (kr, buf); + pkp <- s_skp; + pkp <- (pkp + (W64.of_int (((12 * 3) * 256) `|>>` 3))); + ctpc <@ __indcpa_enc_1 (ctpc, (Array32.init (fun i_0 => buf.[0 + i_0])), + pkp, (Array32.init (fun i_0 => kr.[32 + i_0]))); + ctp <- s_ctp; + cnd <@ __verify (ctp, ctpc); + zp <- s_skp; + zp <- (zp + (W64.of_int 64)); + zp <- (zp + (W64.of_int (((24 * 3) * 256) `|>>` 3))); + aux <@ __cmov ((Array32.init (fun i_0 => kr.[0 + i_0])), zp, cnd); + kr <- Array64.init + (fun i_0 => if 0 <= i_0 < 0 + 32 then aux.[i_0-0] else kr.[i_0]); + t64 <- (W64.of_int ((3 * 320) + 128)); + aux <@ _isha3_256 ((Array32.init (fun i_0 => kr.[32 + i_0])), ctp, t64); + kr <- Array64.init + (fun i_0 => if 32 <= i_0 < 32 + 32 then aux.[i_0-32] + else kr.[i_0]); + shkp <- s_shkp; + t64 <- (W64.of_int 32); + _shake256_64 (shkp, t64, kr); + return (); + } + + proc jade_kem_kyber_kyber768_amd64_avx2v_keypair (public_key:W64.t, + secret_key:W64.t) : + W64.t = { + + var r:W64.t; + var randomness:W8.t Array64.t; + var randomnessp:W8.t Array64.t; + var _of_:bool; + var _cf_:bool; + var _sf_:bool; + var _zf_:bool; + var _0:bool; + randomness <- witness; + randomnessp <- witness; + public_key <- public_key; + secret_key <- secret_key; + randomnessp <- randomness; + randomnessp <@ SC.randombytes_64 (randomnessp); + __crypto_kem_keypair_jazz (public_key, 
secret_key, randomnessp); + (_of_, _cf_, _sf_, _0, _zf_, r) <- set0_64 ; + return (r); + } + + proc jade_kem_kyber_kyber768_amd64_avx2v_enc (ciphertext:W64.t, + shared_secret:W64.t, + public_key:W64.t) : W64.t = { + + var r:W64.t; + var randomness:W8.t Array32.t; + var randomnessp:W8.t Array32.t; + var _of_:bool; + var _cf_:bool; + var _sf_:bool; + var _zf_:bool; + var _0:bool; + randomness <- witness; + randomnessp <- witness; + ciphertext <- ciphertext; + shared_secret <- shared_secret; + public_key <- public_key; + randomnessp <- randomness; + randomnessp <@ SC.randombytes_32 (randomnessp); + __crypto_kem_enc_jazz (ciphertext, shared_secret, public_key, + randomnessp); + (_of_, _cf_, _sf_, _0, _zf_, r) <- set0_64 ; + return (r); + } + + proc jade_kem_kyber_kyber768_amd64_avx2v_dec (shared_secret:W64.t, + ciphertext:W64.t, + secret_key:W64.t) : W64.t = { + + var r:W64.t; + var _of_:bool; + var _cf_:bool; + var _sf_:bool; + var _zf_:bool; + var _0:bool; + + __crypto_kem_dec_jazz (shared_secret, ciphertext, secret_key); + (_of_, _cf_, _sf_, _0, _zf_, r) <- set0_64 ; + return (r); + } +}. + diff --git a/code/jasmin/mlkem_avx2/fips202.c b/code/jasmin/mlkem_avx2/fips202.c new file mode 100644 index 00000000..d300328b --- /dev/null +++ b/code/jasmin/mlkem_avx2/fips202.c @@ -0,0 +1,549 @@ +/* Based on the public domain implementation in + * crypto_hash/keccakc512/simple/ from http://bench.cr.yp.to/supercop.html + * by Ronny Van Keer + * and the public domain "TweetFips202" implementation + * from https://twitter.com/tweetfips202 + * by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe */ + +#include <stdint.h> +#include <stddef.h> +#include "fips202.h" + +#define NROUNDS 24 +#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset))) + +/************************************************* +* Name: load64 +* +* Description: Load 8 bytes into uint64_t in little-endian order +* +* Arguments: - const unsigned char *x: pointer to input byte array +* +* Returns the loaded 64-bit unsigned integer +**************************************************/ +static uint64_t load64(const unsigned char *x) +{ + unsigned long long r = 0, i; + + for (i = 0; i < 8; ++i) { + r |= (unsigned long long)x[i] << 8 * i; + } + return r; +} + +/************************************************* +* Name: store64 +* +* Description: Store a 64-bit integer to a byte array in little-endian order +* +* Arguments: - uint8_t *x: pointer to the output byte array +* - uint64_t u: input 64-bit unsigned integer +**************************************************/ +static void store64(uint8_t *x, uint64_t u) +{ + unsigned int i; + + for(i=0; i<8; ++i) { + x[i] = u; + u >>= 8; + } +} + +/* Keccak round constants */ +static const uint64_t KeccakF_RoundConstants[NROUNDS] = +{ + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, +
(uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; + +/************************************************* +* Name: KeccakF1600_StatePermute +* +* Description: The Keccak F1600 Permutation +* +* Arguments: - uint64_t * state: pointer to in/output Keccak state +**************************************************/ +static void KeccakF1600_StatePermute(uint64_t * state) +{ + int round; + + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + uint64_t Eba, Ebe, Ebi, Ebo, Ebu; + uint64_t Ega, Ege, Egi, Ego, Egu; + uint64_t Eka, Eke, Eki, Eko, Eku; + uint64_t Ema, Eme, Emi, Emo, Emu; + uint64_t Esa, Ese, Esi, Eso, Esu; + + //copyFromState(A, state) + Aba = state[ 0]; + Abe = state[ 1]; + Abi = state[ 2]; + Abo = state[ 3]; + Abu = state[ 4]; + Aga = state[ 5]; + Age = state[ 6]; + Agi = state[ 7]; + Ago = state[ 8]; + Agu = state[ 9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for( round = 0; round < NROUNDS; round += 2 ) + { + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (uint64_t)KeccakF_RoundConstants[round]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + 
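// Second half of the unrolled double round: the same theta/rho/pi/chi/iota +// sequence below reads the E-state computed above and writes back into the +// A-state, so each loop iteration performs two full rounds of Keccak-f[1600]. +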
//thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (uint64_t)KeccakF_RoundConstants[round+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + //copyToState(state, A) + state[ 0] = Aba; + state[ 1] = Abe; + state[ 2] = Abi; + state[ 3] = Abo; + state[ 4] = Abu; + state[ 5] = Aga; + state[ 6] = Age; + state[ 7] = Agi; + state[ 8] = Ago; + state[ 9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; + + #undef round +} + +#include <string.h> +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + + +/************************************************* +* Name: keccak_absorb +* +* Description: Absorb step of Keccak; +* non-incremental, starts by zeroing the state.
+* +* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state +* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) +* - const unsigned char *m: pointer to input to be absorbed into s +* - unsigned long long mlen: length of input in bytes +* - unsigned char p: domain-separation byte for different Keccak-derived functions +**************************************************/ +static void keccak_absorb(uint64_t *s, + unsigned int r, + const unsigned char *m, unsigned long long int mlen, + unsigned char p) +{ + unsigned long long i; + unsigned char t[200]; + + // Zero state + for (i = 0; i < 25; ++i) + s[i] = 0; + + while (mlen >= r) + { + for (i = 0; i < r / 8; ++i) + s[i] ^= load64(m + 8 * i); + + KeccakF1600_StatePermute(s); + mlen -= r; + m += r; + } + + for (i = 0; i < r; ++i) + t[i] = 0; + for (i = 0; i < mlen; ++i) + t[i] = m[i]; + t[i] = p; + t[r - 1] |= 128; + for (i = 0; i < r / 8; ++i) + s[i] ^= load64(t + 8 * i); +} + + +/************************************************* +* Name: keccak_squeezeblocks +* +* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. +* Modifies the state. Can be called multiple times to keep squeezing, +* i.e., is incremental. +* +* Arguments: - unsigned char *h: pointer to output blocks +* - unsigned long long int nblocks: number of blocks to be squeezed (written to h) +* - uint64_t *s: pointer to in/output Keccak state +* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) +**************************************************/ +static void keccak_squeezeblocks(unsigned char *h, unsigned long long int nblocks, + uint64_t *s, + unsigned int r) +{ + unsigned int i; + while(nblocks > 0) + { + KeccakF1600_StatePermute(s); + for(i=0;i<(r>>3);i++) + { + store64(h+8*i, s[i]); + } + h += r; + nblocks--; + } +} + + +/************************************************* +* Name: shake128_absorb +* +* Description: Absorb step of the SHAKE128 XOF. +* Non-incremental, starts by zeroing the state. +* +* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state +* - const unsigned char *input: pointer to input to be absorbed into s +* - unsigned int inputByteLen: length of input in bytes +**************************************************/ +void shake128_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen) +{ + keccak_absorb(s, SHAKE128_RATE, input, inputByteLen, 0x1F); +} + +/************************************************* +* Name: shake128_squeezeblocks +* +* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each. +* Modifies the state. Can be called multiple times to keep squeezing, +* i.e., is incremental.
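+* The state is permuted before every output block, so absorption must +* be complete before the first call.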
+* +* Arguments: - unsigned char *output: pointer to output blocks +* - unsigned long long nblocks: number of blocks to be squeezed (written to output) +* - uint64_t *s: pointer to in/output Keccak state +**************************************************/ +void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) +{ + keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE); +} + +/************************************************* +* Name: shake256 +* +* Description: SHAKE256 XOF with non-incremental API +* +* Arguments: - unsigned char *output: pointer to output +* - unsigned long long outlen: requested output length in bytes +* - const unsigned char *input: pointer to input +* - unsigned long long inlen: length of input in bytes +**************************************************/ +void shake256(unsigned char *output, unsigned long long outlen, + const unsigned char *input, unsigned long long inlen) +{ + uint64_t s[25]; + unsigned char t[SHAKE256_RATE]; + unsigned long long nblocks = outlen/SHAKE256_RATE; + size_t i; + + /* Absorb input */ + keccak_absorb(s, SHAKE256_RATE, input, inlen, 0x1F); + + /* Squeeze output */ + keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); + + output+=nblocks*SHAKE256_RATE; + outlen-=nblocks*SHAKE256_RATE; + + if(outlen) + { + keccak_squeezeblocks(t, 1, s, SHAKE256_RATE); + for(i=0;i<outlen;i++) + output[i] = t[i]; + } +} + +/************************************************* +* Name: sha3_256 +* +* Description: SHA3-256 with non-incremental API +* +* Arguments: - unsigned char *output: pointer to output (32 bytes) +* - const unsigned char *input: pointer to input +* - unsigned long long inlen: length of input in bytes +**************************************************/ +void sha3_256(unsigned char *output, const unsigned char *input, unsigned long long inlen) +{ + uint64_t s[25]; + unsigned char t[SHA3_256_RATE]; + size_t i; + + /* Absorb input */ + keccak_absorb(s, SHA3_256_RATE, input, inlen, 0x06); + + /* Squeeze output */ + keccak_squeezeblocks(t, 1, s, SHA3_256_RATE); + + for(i=0;i<32;i++) + output[i] = t[i]; +} + +/************************************************* +* Name: sha3_512 +* +* Description: SHA3-512 with non-incremental API +* +* Arguments: - unsigned char *output: pointer to output (64 bytes) +* - const unsigned char *input: pointer to input +* - unsigned long long inlen: length of input in bytes +**************************************************/ +void sha3_512(unsigned char *output, const unsigned char *input, unsigned long long inlen) +{ + uint64_t s[25]; + unsigned char t[SHA3_512_RATE]; + size_t i; + + /* Absorb input */ + keccak_absorb(s, SHA3_512_RATE, input, inlen, 0x06); + + /* Squeeze output */ + keccak_squeezeblocks(t, 1, s, SHA3_512_RATE); + + for(i=0;i<64;i++) + output[i] = t[i]; +} diff --git a/code/jasmin/mlkem_avx2/fips202.h b/code/jasmin/mlkem_avx2/fips202.h new file mode 100644 --- /dev/null +++ b/code/jasmin/mlkem_avx2/fips202.h @@ -0,0 +1,28 @@ +#ifndef FIPS202_H +#define FIPS202_H + +#include <stdint.h> + +#define SHAKE128_RATE 168 +#define SHAKE256_RATE 136 +#define SHA3_256_RATE 136 +#define SHA3_512_RATE 72 + +void shake128_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen); +void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); + +void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen); + +void sha3_256(unsigned char *output, const unsigned char *input, unsigned long long inlen); +void sha3_512(unsigned char *output, const unsigned char *input, unsigned long long inlen); + + + +void shake256_128_33_jazz(unsigned char *output, const unsigned char *input); +void sha3_512_32_jazz(unsigned char *output, const unsigned char *input); + +void shake128_absorb34_jazz(uint64_t *s, const unsigned char *input); +void shake128_squeezeblock_jazz(unsigned char *output, uint64_t *s); + + +#endif diff --git a/code/jasmin/mlkem_avx2/fips202.jinc b/code/jasmin/mlkem_avx2/fips202.jinc new file mode 100644 index 00000000..0141c670 --- /dev/null +++ b/code/jasmin/mlkem_avx2/fips202.jinc @@ -0,0 +1,647 @@ +require "params.jinc" +require "keccakf1600.jinc" +require "fips202_common.jinc" + +inline +fn __st0(reg ptr u64[25] state) -> reg ptr u64[25] +{ + inline int i; + + for i = 0 to 25 { + state[i] = 0; + } + + return state; +} + + +inline +fn __add_full_block( + stack u64[25] state, + reg u64 in, + reg u64 inlen, + reg u64 r8 +) -> stack u64[25], reg u64, reg u64 +{ + reg u64 i t r64; + + r64 = r8; + r64 >>= 3; + i = 0; + while (i < r64) + { + t = [in + 8 * i]; + state[(int) i] ^= t; + i = i + 1; + } + + in += r8; + inlen -= r8; + + return state, in, inlen; +} + + +inline +fn __add_final_block( + stack u64[25] state, + reg u64 in, + reg u64 inlen, + reg u8 trail_byte, + reg u64 r8 +) -> stack u64[25] +{ + reg u64 i, t, inlen8; + reg u8 c; + + inlen8 = inlen; + inlen8 >>= 3; + i = 0; + while ( i < inlen8) + { + t = [in + 8*i]; + state[(int) i] ^= t; + i = i + 1; + } + + i <<= 3; + while (i < inlen) + { + c = (u8)[in + i]; + state[u8 (int) i] ^= c; + i = i + 1; + } + + state[u8 (int) i] ^= trail_byte; + + i = r8; + i -= 1; + state[u8
(int) i] ^= 0x80; + + return state; +} + + +inline +fn __xtr_full_block( + stack u64[25] state, + reg u64 out, + reg u64 outlen, + reg u64 rate +) -> reg u64, reg u64 +{ + reg u64 i t rate64; + + rate64 = rate; + rate64 >>= 3; + i = 0; + while (i < rate64) + { + t = state[(int) i]; + [out + 8 * i] = t; + i = i + 1; + } + + out += rate; + outlen -= rate; + + return out, outlen; +} + + +inline +fn __xtr_bytes( + stack u64[25] state, + reg u64 out, + reg u64 outlen +) +{ + reg u64 i t outlen8; + reg u8 c; + + outlen8 = outlen; + outlen8 >>= 3; + i = 0; + while (i < outlen8 ) + { + t = state[(int) i]; + [out + 8 * i] = t; + i = i + 1; + } + i <<= 3; + + while (i < outlen) + { + c = state[u8 (int) i]; + (u8)[out + i] = c; + i = i + 1; + } +} + + +inline +fn __keccak1600_scalar( + stack u64 s_out s_outlen, + reg u64 in inlen, + stack u64 s_trail_byte, + reg u64 rate +) +{ + stack u64[25] state; + stack u64 s_in, s_inlen, s_rate; + reg u64 out, outlen, t; + reg u8 trail_byte; + + state = __st0(state); + + while ( inlen >= rate ) + { + state, in, inlen = __add_full_block(state, in, inlen, rate); + + s_in = in; + s_inlen = inlen; + s_rate = rate; + + state = _keccakf1600_scalar(state); + + inlen = s_inlen; + in = s_in; + rate = s_rate; + } + + t = s_trail_byte; + trail_byte = (8u) t; + state = __add_final_block(state, in, inlen, trail_byte, rate); + + outlen = s_outlen; + + while ( outlen > rate ) + { + s_outlen = outlen; + s_rate = rate; + + state = _keccakf1600_scalar(state); + + out = s_out; + outlen = s_outlen; + rate = s_rate; + + out, outlen = __xtr_full_block(state, out, outlen, rate); + s_outlen = outlen; + s_out = out; + } + + state = _keccakf1600_scalar(state); + out = s_out; + outlen = s_outlen; + + __xtr_bytes(state, out, outlen); +} + + +#[returnaddress="stack"] +fn _shake256(reg u64 out outlen in inlen) +{ + stack u64 ds; + stack u64 rate; + + ds = 0x1f; + rate = SHAKE256_RATE; + + __keccak1600_scalar(out, outlen, in, inlen, ds, rate); +} + + +#[returnaddress="stack"] +fn _sha3_512(reg u64 out in inlen) +{ + reg u64 ds; + reg u64 rate; + reg u64 outlen; + + ds = 0x06; + rate = SHA3_512_RATE; + outlen = 64; + + __keccak1600_scalar(out, outlen, in, inlen, ds, rate); +} + + +#[returnaddress="stack"] +fn _sha3_256(reg u64 out in inlen) +{ + reg u64 ds; + reg u64 rate; + reg u64 outlen; + + ds = 0x06; + rate = SHA3_256_RATE; + outlen = 32; + + __keccak1600_scalar(out, outlen, in, inlen, ds, rate); +} + + +#[returnaddress="stack"] +fn _isha3_256(reg ptr u8[32] out, reg u64 in inlen) -> reg ptr u8[32] +{ + stack u64[25] state; + stack ptr u8[32] s_out; + stack u64 s_in s_ilen s_r8; + reg u64 ilen r8 t64; + reg u8 t8; + inline int i; + + s_out = out; + + state = __st0(state); + + r8 = SHA3_256_RATE; + ilen = inlen; + + while(ilen >= r8) + { + state, in, ilen = __add_full_block(state, in, ilen, r8); + + s_in = in; + s_ilen = ilen; + s_r8 = r8; + + state = _keccakf1600_scalar(state); + + in = s_in; + ilen = s_ilen; + r8 = s_r8; + } + + t8 = 0x06; + state = __add_final_block(state, in, ilen, t8, r8); + + state = _keccakf1600_scalar(state); + + out = s_out; + + for i=0 to 4 + { + t64 = state[i]; + out[u64 i] = t64; + } + + return out; +} + +inline +fn __isha3_512(reg ptr u8[64] out, reg u64 in, inline int inlen) -> stack u8[64] +{ + stack u64[25] state; + stack ptr u8[64] s_out; + stack u64 s_in s_ilen s_r8; + reg u64 ilen r8 t64; + reg u8 t8; + inline int i; + + s_out = out; + + state = __st0(state); + + r8 = SHA3_512_RATE; + ilen = inlen; + + while(ilen >= r8) + { + state, in, ilen = 
__add_full_block(state, in, ilen, r8); + + s_in = in; + s_ilen = ilen; + s_r8 = r8; + + state = _keccakf1600_scalar(state); + + in = s_in; + ilen = s_ilen; + r8 = s_r8; + } + + t8 = 0x06; + state = __add_final_block(state, in, ilen, t8, r8); + + state = _keccakf1600_scalar(state); + + out = s_out; + + for i=0 to 8 + { + t64 = state[i]; + out[u64 i] = t64; + } + + return out; +} + +fn _shake256_1120_32(reg u64 out, reg u64 in0 in1) { + stack u64[25] state; + stack u64 s_out; + stack u64 s_in s_ilen s_r8; + reg u64 ilen r8 t64 in; + reg u8 t8; + inline int i; + + s_out = out; + state = __st0(state); + + for i = 0 to KYBER_SYMBYTES/8 { + t64 = (u64)[in0 + i*8]; + state[u64 i] ^= t64; + } + + for i = KYBER_SYMBYTES/8 to SHAKE256_RATE/8 { + t64 = (u64)[in1 + (i-KYBER_SYMBYTES/8)*8]; + state[u64 i] ^= t64; + } + + s_in = in1; + + state = _keccakf1600_scalar(state); + + r8 = SHAKE256_RATE; + ilen = KYBER_INDCPA_CIPHERTEXTBYTES - (SHAKE256_RATE - KYBER_SYMBYTES); + in = s_in; + in += SHAKE256_RATE - KYBER_SYMBYTES; + + while(ilen >= r8) + { + state, in, ilen = __add_full_block(state, in, ilen, r8); + + s_in = in; + s_ilen = ilen; + s_r8 = r8; + + state = _keccakf1600_scalar(state); + + in = s_in; + ilen = s_ilen; + r8 = s_r8; + } + + t8 = 0x1f; + state = __add_final_block(state, in, ilen, t8, r8); + + state = _keccakf1600_scalar(state); + + out = s_out; + + for i=0 to KYBER_SYMBYTES/8 + { + t64 = state[i]; + (u64)[out + 8*i] = t64; + } + +} + +#[returnaddress="stack"] +fn _shake256_64(reg u64 out outlen, reg const ptr u8[64] in) +{ + reg u64 t64 j; + reg u8 c; + stack u64[25] state; + stack u64 s_out s_outlen; + inline int i; + + s_out = out; + s_outlen = outlen; + + state = __st0(state); + + for i = 0 to 8 { + t64 = in[u64 i]; + state[u64 i] ^= t64; + } + + state[u8 64] ^= 0x1f; + state[u8 SHAKE256_RATE-1] ^= 0x80; + + state = _keccakf1600_scalar(state); + + outlen = s_outlen; + out = s_out; + + while(outlen > SHAKE256_RATE) + { + for i = 0 to SHAKE256_RATE/8 + { + t64 = state[u64 i]; + (u64)[out + 8*i] = t64; + } + + out += SHAKE256_RATE; + outlen -= SHAKE256_RATE; + + s_out = out; + s_outlen = outlen; + + state = _keccakf1600_scalar(state); + + outlen = s_outlen; + out = s_out; + } + + s_outlen = outlen; + outlen >>= 3; + j = 0; + while(j < outlen) + { + t64 = state[(int) j]; + (u64)[out + 8 * j] = t64; + j = j + 1; + } + + j <<= 3; + outlen = s_outlen; + + while (j < outlen) + { + c = state[u8 (int) j]; + (u8)[out + j] = c; + j = j + 1; + } +} + +#[returnaddress="stack"] +fn _shake256_128_33(reg ptr u8[128] out, reg const ptr u8[33] in) -> stack u8[128] +{ + stack u64[25] state; + reg u64 t64; + reg u8 c; + inline int i; + + stack ptr u8[128] sout; + + sout = out; + + state = __st0(state); + + for i = 0 to 4 { + t64 = in[u64 i]; + state[u64 i] ^= t64; + } + + c = in[32]; + state[u8 32] ^= c; + state[u8 33] ^= 0x1f; + state[u8 SHAKE256_RATE-1] ^= 0x80; + + state = _keccakf1600_scalar(state); + + out = sout; + + for i = 0 to 16 { + t64 = state[u64 i]; + out[u64 i] = t64; + } + + return out; +} + +#[returnaddress="stack"] +fn _isha3_256_32(reg ptr u8[32] out, reg ptr u8[KYBER_SYMBYTES] in) -> reg ptr u8[32] +{ + stack u64[25] state; + stack ptr u8[32] s_out; + reg u64 t64; + inline int i; + + s_out = out; + + state = __st0(state); + + for i=0 to KYBER_SYMBYTES/8 + { + t64 = in[u64 i]; + state[u64 i] = t64; + } + + state[u8 KYBER_SYMBYTES] ^= 0x06; + state[u8 SHA3_256_RATE - 1] = 0x80; + + state = _keccakf1600_scalar(state); + + out = s_out; + + for i=0 to 4 + { + t64 = state[i]; + out[u64 i] = 
t64; + } + + return out; +} + +#[returnaddress="stack"] +fn _sha3_512_64(reg ptr u8[64] out, reg const ptr u8[64] in) -> stack u8[64] +{ + stack u64[25] state; + stack ptr u8[64] out_s; + reg u64 t64; + inline int i; + + state = __st0(state); + + for i = 0 to 8 + { + t64 = in[u64 i]; + state[i] ^= t64; + } + + state[u8 64] ^= 0x06; + state[u8 SHA3_512_RATE - 1] ^= 0x80; + + out_s = out; + + state = _keccakf1600_scalar(state); + + out = out_s; + + for i = 0 to 8 + { + t64 = state[i]; + out[u64 i] = t64; + } + + return out; +} + +#[returnaddress="stack"] +fn _sha3_512_32(reg ptr u8[64] out, reg const ptr u8[32] in) -> stack u8[64] +{ + stack u64[25] state; + stack ptr u8[64] out_s; + reg u64 t64; + inline int i; + + state = __st0(state); + + for i = 0 to 4 + { + t64 = in[u64 i]; + state[i] ^= t64; + } + + state[u8 32] ^= 0x06; + state[u8 SHA3_512_RATE-1] ^= 0x80; + + out_s = out; + + state = _keccakf1600_scalar(state); + + out = out_s; + + for i = 0 to 8 { + t64 = state[i]; + out[u64 i] = t64; + } + + return out; +} + +fn _shake128_absorb34(reg ptr u64[25] state, reg const ptr u8[34] in) -> reg ptr u64[25] +{ + reg u64 t64; + reg u16 t16; + inline int i; + + state = __st0(state); + + for i = 0 to 4 + { + t64 = in[u64 i]; + state[u64 i] ^= t64; + } + + t16 = in.[u16 32]; + state[u16 16] ^= t16; + + state[u8 34] ^= 0x1f; + + state[u8 SHAKE128_RATE-1] ^= 0x80; + + return state; +} + +#[returnaddress="stack"] +fn _shake128_squeezeblock(reg ptr u64[25] state, reg ptr u8[SHAKE128_RATE] out) -> reg ptr u64[25], reg ptr u8[SHAKE128_RATE] +{ + stack ptr u8[SHAKE128_RATE] out_s; + reg u64 t; + inline int i; + + out_s = out; + state = _keccakf1600_scalar(state); + out = out_s; + + for i = 0 to SHAKE128_RATE/8 + { + t = state[i]; + out[u64 i] = t; + } + return state, out; +} diff --git a/code/jasmin/mlkem_avx2/fips202_4x.jinc b/code/jasmin/mlkem_avx2/fips202_4x.jinc new file mode 100644 index 00000000..a1409887 --- /dev/null +++ b/code/jasmin/mlkem_avx2/fips202_4x.jinc @@ -0,0 +1,1434 @@ +require "fips202_common.jinc" + +u256 rho56 = 0x181F1E1D1C1B1A191017161514131211080F0E0D0C0B0A090007060504030201; +u256 rho8 = 0x1E1D1C1B1A19181F16151413121110170E0D0C0B0A09080F0605040302010007; + +inline fn __rol_4u64_rho56(reg u256 a) -> reg u256 +{ + reg u256 r; + + r = #VPSHUFB_256(a, rho56); + + return r; +} + + +inline fn __rol_4u64_rho8(reg u256 a) -> reg u256 +{ + reg u256 r; + + r = #VPSHUFB_256(a, rho8); + + return r; +} + + +inline fn __rol_4u64(reg u256 a, inline int o) -> reg u256 +{ + reg u256 r; + reg u256 t256; + + r = #VPSLL_4u64(a, o); + t256 = #VPSRL_4u64(a, 64 - o); + + r |= t256; + + return r; +} + + +param int ba=0; +param int be=1; +param int bi=2; +param int bo=3; +param int bu=4; +param int ga=5; +param int ge=6; +param int gi=7; +param int go=8; +param int gu=9; +param int ka=10; +param int ke=11; +param int ki=12; +param int ko=13; +param int ku=14; +param int ma=15; +param int me=16; +param int mi=17; +param int mo=18; +param int mu=19; +param int sa=20; +param int se=21; +param int si=22; +param int so=23; +param int su=24; + +u256[24] KeccakF1600RoundConstants = { + 0x0000000000000001000000000000000100000000000000010000000000000001, + 0x0000000000008082000000000000808200000000000080820000000000008082, + 0x800000000000808a800000000000808a800000000000808a800000000000808a, + 0x8000000080008000800000008000800080000000800080008000000080008000, + 0x000000000000808b000000000000808b000000000000808b000000000000808b, + 0x0000000080000001000000008000000100000000800000010000000080000001, + 
0x8000000080008081800000008000808180000000800080818000000080008081, + 0x8000000000008009800000000000800980000000000080098000000000008009, + 0x000000000000008a000000000000008a000000000000008a000000000000008a, + 0x0000000000000088000000000000008800000000000000880000000000000088, + 0x0000000080008009000000008000800900000000800080090000000080008009, + 0x000000008000000a000000008000000a000000008000000a000000008000000a, + 0x000000008000808b000000008000808b000000008000808b000000008000808b, + 0x800000000000008b800000000000008b800000000000008b800000000000008b, + 0x8000000000008089800000000000808980000000000080898000000000008089, + 0x8000000000008003800000000000800380000000000080038000000000008003, + 0x8000000000008002800000000000800280000000000080028000000000008002, + 0x8000000000000080800000000000008080000000000000808000000000000080, + 0x000000000000800a000000000000800a000000000000800a000000000000800a, + 0x800000008000000a800000008000000a800000008000000a800000008000000a, + 0x8000000080008081800000008000808180000000800080818000000080008081, + 0x8000000000008080800000000000808080000000000080808000000000008080, + 0x0000000080000001000000008000000100000000800000010000000080000001, + 0x8000000080008008800000008000800880000000800080088000000080008008 + }; + +inline fn __prepare_theta(reg ptr u256[25] A_4x) -> reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Ca, Ce, Ci, Co, Cu; + + // Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); + Ca = A_4x[sa]; + Ca ^= A_4x[ma]; + Ca ^= A_4x[ka]; + Ca ^= A_4x[ga]; + Ca ^= A_4x[ba]; + + // Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); + Ce = A_4x[se]; + Ce ^= A_4x[me]; + Ce ^= A_4x[ke]; + Ce ^= A_4x[ge]; + Ce ^= A_4x[be]; + + // Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); + Ci = A_4x[si]; + Ci ^= A_4x[mi]; + Ci ^= A_4x[ki]; + Ci ^= A_4x[gi]; + Ci ^= A_4x[bi]; + + // Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); + Co = A_4x[so]; + Co ^= A_4x[mo]; + Co ^= A_4x[ko]; + Co ^= A_4x[go]; + Co ^= A_4x[bo]; + + // Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); + Cu = A_4x[su]; + Cu ^= A_4x[mu]; + Cu ^= A_4x[ku]; + Cu ^= A_4x[gu]; + Cu ^= A_4x[bu]; + + return Ca, Ce, Ci, Co, Cu; +} + +inline fn __first(reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu) -> reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Da, De, Di, Do, Du; + reg u256 Ca1, Ce1, Ci1, Co1, Cu1; + + Ce1 = __rol_4u64(Ce, 1); + Da = Cu ^ Ce1; + + Ci1 = __rol_4u64(Ci, 1); + De = Ca ^ Ci1; + + Co1 = __rol_4u64(Co, 1); + Di = Ce ^ Co1; + + Cu1 = __rol_4u64(Cu, 1); + Do = Ci ^ Cu1; + + Ca1 = __rol_4u64(Ca, 1); + Du = Co ^ Ca1; + + return Da, De, Di, Do, Du; +} + + +inline fn __second_even( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, inline int index, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Bba, Bbe, Bbi, Bbo, Bbu; + reg u256 t256; + + t256 = A_4x[ba]; + t256 ^= Da; + A_4x[ba] = t256; + Bba = t256; + + t256 = A_4x[ge]; + t256 ^= De; + A_4x[ge] = t256; + Bbe = __rol_4u64(t256, 44); + + t256 = A_4x[ki]; + t256 ^= Di; + A_4x[ki] = t256; + Bbi = __rol_4u64(t256, 43); + + // E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); + t256 = #VPANDN_256(Bbe, Bbi); + t256 ^= Bba; + t256 ^= KeccakF1600RoundConstants[index]; + E_4x[ba] = t256; + + Ca = t256; + + t256 = 
A_4x[mo]; + t256 ^= Do; + A_4x[mo] = t256; + Bbo = __rol_4u64(t256, 21); + + // E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); + t256 = #VPANDN_256(Bbi, Bbo); + t256 ^= Bbe; + E_4x[be] = t256; + + Ce = t256; + + t256 = A_4x[su]; + t256 ^= Du; + A_4x[su] = t256; + Bbu = __rol_4u64(t256, 14); + + // E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); + t256 = #VPANDN_256(Bbo, Bbu); + t256 ^= Bbi; + E_4x[bi] = t256; + + Ci = t256; + + // E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); + t256 = #VPANDN_256(Bbu, Bba); + t256 ^= Bbo; + E_4x[bo] = t256; + + Co = t256; + + // E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); + t256 = #VPANDN_256(Bba, Bbe); + t256 ^= Bbu; + E_4x[bu] = t256; + + Cu = t256; + + return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; +} + +inline fn __third_even( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Bga, Bge, Bgi, Bgo, Bgu; + reg u256 t256; + + t256 = A_4x[bo]; + t256 ^= Do; + A_4x[bo] = t256; + Bga = __rol_4u64(t256, 28); + + t256 = A_4x[gu]; + t256 ^= Du; + A_4x[gu] = t256; + Bge = __rol_4u64(t256, 20); + + t256 = A_4x[ka]; + t256 ^= Da; + A_4x[ka] = t256; + Bgi = __rol_4u64(t256, 3); + + // E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)) + t256 = #VPANDN_256(Bge, Bgi); + t256 ^= Bga; + E_4x[ga] = t256; + + Ca ^= t256; + + t256 = A_4x[me]; + t256 ^= De; + A_4x[me] = t256; + Bgo = __rol_4u64(t256, 45); + + // E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)) + t256 = #VPANDN_256(Bgi, Bgo); + t256 ^= Bge; + E_4x[ge] = t256; + + Ce ^= t256; + + t256 = A_4x[si]; + t256 ^= Di; + A_4x[si] = t256; + Bgu = __rol_4u64(t256, 61); + + // E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)) + t256 = #VPANDN_256(Bgo, Bgu); + t256 ^= Bgi; + E_4x[gi] = t256; + + Ci ^= t256; + + // E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); + t256 = #VPANDN_256(Bgu, Bga); + t256 ^= Bgo; + E_4x[go] = t256; + + Co ^= t256; + + // E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); + t256 = #VPANDN_256(Bga, Bge); + t256 ^= Bgu; + E_4x[gu] = t256; + + Cu ^= t256; + + return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; +} + +inline fn __fourth_even( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Bka, Bke, Bki, Bko, Bku; + reg u256 t256; + + t256 = A_4x[be]; + t256 ^= De; + A_4x[be] = t256; + Bka = __rol_4u64(t256, 1); + + t256 = A_4x[gi]; + t256 ^= Di; + A_4x[gi] = t256; + Bke = __rol_4u64(t256, 6); + + t256 = A_4x[ko]; + t256 ^= Do; + A_4x[ko] = t256; + Bki = __rol_4u64(t256, 25); + + // E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); + t256 = #VPANDN_256(Bke, Bki); + t256 ^= Bka; + E_4x[ka] = t256; + + Ca ^= t256; + + t256 = A_4x[mu]; + t256 ^= Du; + A_4x[mu] = t256; + Bko = __rol_4u64_rho8(t256); + + // E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); + t256 = #VPANDN_256(Bki, Bko); + t256 ^= Bke; + E_4x[ke] = t256; + + Ce ^= t256; + + t256 = A_4x[sa]; + t256 ^= Da; + A_4x[sa] = t256; + Bku = __rol_4u64(t256, 18); + + // E##ki = XOR256(Bki, ANDnu256(Bko, Bku)) + t256 = #VPANDN_256(Bko, Bku); + t256 ^= Bki; + E_4x[ki] = t256; + + Ci ^= t256; + + // E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); + t256 = #VPANDN_256(Bku, Bka); + t256 ^= Bko; + E_4x[ko] = t256; + + Co ^= t256; + + // E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); + t256 = #VPANDN_256(Bka, Bke); + 
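+ // chi: VPANDN computes (~Bka) & Bke, so the XOR below yields + // Bku ^ ((~Bka) & Bke), the same ANDnu256 pattern as in the scalar code.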
t256 ^= Bku; + E_4x[ku] = t256; + + Cu ^= t256; + + return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; +} + +inline fn __fifth_even( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Bma, Bme, Bmi, Bmo, Bmu; + reg u256 t256; + + t256 = A_4x[bu]; + t256 ^= Du; + A_4x[bu] = t256; + Bma = __rol_4u64(t256, 27); + + t256 = A_4x[ga]; + t256 ^= Da; + A_4x[ga] = t256; + Bme = __rol_4u64(t256, 36); + + t256 = A_4x[ke]; + t256 ^= De; + A_4x[ke] = t256; + Bmi = __rol_4u64(t256, 10); + + // E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); + t256 = #VPANDN_256(Bme, Bmi); + t256 ^= Bma; + E_4x[ma] = t256; + + Ca ^= t256; + + t256 = A_4x[mi]; + t256 ^= Di; + A_4x[mi] = t256; + Bmo = __rol_4u64(t256, 15); + + // E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); + t256 = #VPANDN_256(Bmi, Bmo); + t256 ^= Bme; + E_4x[me] = t256; + + Ce ^= t256; + + t256 = A_4x[so]; + t256 ^= Do; + A_4x[so] = t256; + Bmu = __rol_4u64_rho56(t256); + + // E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); + t256 = #VPANDN_256(Bmo, Bmu); + t256 ^= Bmi; + E_4x[mi] = t256; + + Ci ^= t256; + + // E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); + t256 = #VPANDN_256(Bmu, Bma); + t256 ^= Bmo; + E_4x[mo] = t256; + + Co ^= t256; + + // E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); + t256 = #VPANDN_256(Bma, Bme); + t256 ^= Bmu; + E_4x[mu] = t256; + + Cu ^= t256; + + return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; +} + +inline fn __sixth_even( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Bsa, Bse, Bsi, Bso, Bsu; + reg u256 t256; + + t256 = A_4x[bi]; + t256 ^= Di; + A_4x[bi] = t256; + Bsa = __rol_4u64(t256, 62); + + t256 = A_4x[go]; + t256 ^= Do; + A_4x[go] = t256; + Bse = __rol_4u64(t256, 55); + + t256 = A_4x[ku]; + t256 ^= Du; + A_4x[ku] = t256; + Bsi = __rol_4u64(t256, 39); + + // E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); + t256 = #VPANDN_256(Bse, Bsi); + t256 ^= Bsa; + E_4x[sa] = t256; + + Ca ^= t256; + + t256 = A_4x[ma]; + t256 ^= Da; + A_4x[ma] = t256; + Bso = __rol_4u64(t256, 41); + + // E##se = XOR256(Bse, ANDnu256(Bsi, Bso)) + t256 = #VPANDN_256(Bsi, Bso); + t256 ^= Bse; + E_4x[se] = t256; + + Ce ^= t256; + + t256 = A_4x[se]; + t256 ^= De; + A_4x[se] = t256; + Bsu = __rol_4u64(t256, 2); + + // E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); + t256 = #VPANDN_256(Bso, Bsu); + t256 ^= Bsi; + E_4x[si] = t256; + + Ci ^= t256; + + // E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); + t256 = #VPANDN_256(Bsu, Bsa); + t256 ^= Bso; + E_4x[so] = t256; + + Co ^= t256; + + // E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); + t256 = #VPANDN_256(Bsa, Bse); + t256 ^= Bsu; + E_4x[su] = t256; + + Cu ^= t256; + + return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; +} + +inline fn __second_odd( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, inline int index, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Bba, Bbe, Bbi, Bbo, Bbu; + reg u256 t256; + + t256 = A_4x[ba]; + t256 ^= Da; + A_4x[ba] = t256; + Bba = t256; + + t256 = A_4x[ge]; + t256 ^= De; + A_4x[ge] = t256; + Bbe = __rol_4u64(t256, 44); + + 
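+ // rho: each lane is rotated by its fixed offset (44 for this lane); + // offsets 8 and 56 go through __rol_4u64_rho8/__rol_4u64_rho56, which + // use a VPSHUFB byte shuffle instead of the shift-and-or in __rol_4u64.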
t256 = A_4x[ki]; + t256 ^= Di; + A_4x[ki] = t256; + Bbi = __rol_4u64(t256, 43); + + // E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); + t256 = #VPANDN_256(Bbe, Bbi); + t256 ^= Bba; + t256 ^= KeccakF1600RoundConstants[index]; + E_4x[ba] = t256; + + Ca = t256; + + t256 = A_4x[mo]; + t256 ^= Do; + A_4x[mo] = t256; + Bbo = __rol_4u64(t256, 21); + + // E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); + t256 = #VPANDN_256(Bbi, Bbo); + t256 ^= Bbe; + E_4x[be] = t256; + + Ce = t256; + + t256 = A_4x[su]; + t256 ^= Du; + A_4x[su] = t256; + Bbu = __rol_4u64(t256, 14); + + // E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); + t256 = #VPANDN_256(Bbo, Bbu); + t256 ^= Bbi; + E_4x[bi] = t256; + + Ci = t256; + + // E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); + t256 = #VPANDN_256(Bbu, Bba); + t256 ^= Bbo; + E_4x[bo] = t256; + + Co = t256; + + // E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); + t256 = #VPANDN_256(Bba, Bbe); + t256 ^= Bbu; + E_4x[bu] = t256; + + Cu = t256; + + return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; +} + +inline fn __third_odd( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Bga, Bge, Bgi, Bgo, Bgu; + reg u256 t256; + + t256 = A_4x[bo]; + t256 ^= Do; + A_4x[bo] = t256; + Bga = __rol_4u64(t256, 28); + + t256 = A_4x[gu]; + t256 ^= Du; + A_4x[gu] = t256; + Bge = __rol_4u64(t256, 20); + + t256 = A_4x[ka]; + t256 ^= Da; + A_4x[ka] = t256; + Bgi = __rol_4u64(t256, 3); + + // E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)) + t256 = #VPANDN_256(Bge, Bgi); + t256 ^= Bga; + E_4x[ga] = t256; + + Ca ^= t256; + + t256 = A_4x[me]; + t256 ^= De; + A_4x[me] = t256; + Bgo = __rol_4u64(t256, 45); + + // E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)) + t256 = #VPANDN_256(Bgi, Bgo); + t256 ^= Bge; + E_4x[ge] = t256; + + Ce ^= t256; + + t256 = A_4x[si]; + t256 ^= Di; + A_4x[si] = t256; + Bgu = __rol_4u64(t256, 61); + + // E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)) + t256 = #VPANDN_256(Bgo, Bgu); + t256 ^= Bgi; + E_4x[gi] = t256; + + Ci ^= t256; + + // E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); + t256 = #VPANDN_256(Bgu, Bga); + t256 ^= Bgo; + E_4x[go] = t256; + + Co ^= t256; + + // E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); + t256 = #VPANDN_256(Bga, Bge); + t256 ^= Bgu; + E_4x[gu] = t256; + + Cu ^= t256; + + return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; +} + +inline fn __fourth_odd( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Bka, Bke, Bki, Bko, Bku; + reg u256 t256; + + t256 = A_4x[be]; + t256 ^= De; + A_4x[be] = t256; + Bka = __rol_4u64(t256, 1); + + t256 = A_4x[gi]; + t256 ^= Di; + A_4x[gi] = t256; + Bke = __rol_4u64(t256, 6); + + t256 = A_4x[ko]; + t256 ^= Do; + A_4x[ko] = t256; + Bki = __rol_4u64(t256, 25); + + // E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); + t256 = #VPANDN_256(Bke, Bki); + t256 ^= Bka; + E_4x[ka] = t256; + + Ca ^= t256; + + t256 = A_4x[mu]; + t256 ^= Du; + A_4x[mu] = t256; + Bko = __rol_4u64_rho8(t256); + + // E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); + t256 = #VPANDN_256(Bki, Bko); + t256 ^= Bke; + E_4x[ke] = t256; + + Ce ^= t256; + + t256 = A_4x[sa]; + t256 ^= Da; + A_4x[sa] = t256; + Bku = __rol_4u64(t256, 18); + + // E##ki = 
XOR256(Bki, ANDnu256(Bko, Bku)) + t256 = #VPANDN_256(Bko, Bku); + t256 ^= Bki; + E_4x[ki] = t256; + + Ci ^= t256; + + // E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); + t256 = #VPANDN_256(Bku, Bka); + t256 ^= Bko; + E_4x[ko] = t256; + + Co ^= t256; + + // E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); + t256 = #VPANDN_256(Bka, Bke); + t256 ^= Bku; + E_4x[ku] = t256; + + Cu ^= t256; + + return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; +} + +inline fn __fifth_odd( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Bma, Bme, Bmi, Bmo, Bmu; + reg u256 t256; + + t256 = A_4x[bu]; + t256 ^= Du; + A_4x[bu] = t256; + Bma = __rol_4u64(t256, 27); + + t256 = A_4x[ga]; + t256 ^= Da; + A_4x[ga] = t256; + Bme = __rol_4u64(t256, 36); + + t256 = A_4x[ke]; + t256 ^= De; + A_4x[ke] = t256; + Bmi = __rol_4u64(t256, 10); + + // E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); + t256 = #VPANDN_256(Bme, Bmi); + t256 ^= Bma; + E_4x[ma] = t256; + + Ca ^= t256; + + t256 = A_4x[mi]; + t256 ^= Di; + A_4x[mi] = t256; + Bmo = __rol_4u64(t256, 15); + + // E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); + t256 = #VPANDN_256(Bmi, Bmo); + t256 ^= Bme; + E_4x[me] = t256; + + Ce ^= t256; + + t256 = A_4x[so]; + t256 ^= Do; + A_4x[so] = t256; + Bmu = __rol_4u64_rho56(t256); + + // E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); + t256 = #VPANDN_256(Bmo, Bmu); + t256 ^= Bmi; + E_4x[mi] = t256; + + Ci ^= t256; + + // E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); + t256 = #VPANDN_256(Bmu, Bma); + t256 ^= Bmo; + E_4x[mo] = t256; + + Co ^= t256; + + // E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); + t256 = #VPANDN_256(Bma, Bme); + t256 ^= Bmu; + E_4x[mu] = t256; + + Cu ^= t256; + + return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; +} + +inline fn __sixth_odd( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Bsa, Bse, Bsi, Bso, Bsu; + reg u256 t256; + + t256 = A_4x[bi]; + t256 ^= Di; + A_4x[bi] = t256; + Bsa = __rol_4u64(t256, 62); + + t256 = A_4x[go]; + t256 ^= Do; + A_4x[go] = t256; + Bse = __rol_4u64(t256, 55); + + t256 = A_4x[ku]; + t256 ^= Du; + A_4x[ku] = t256; + Bsi = __rol_4u64(t256, 39); + + // E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); + t256 = #VPANDN_256(Bse, Bsi); + t256 ^= Bsa; + E_4x[sa] = t256; + + Ca ^= t256; + + t256 = A_4x[ma]; + t256 ^= Da; + A_4x[ma] = t256; + Bso = __rol_4u64(t256, 41); + + // E##se = XOR256(Bse, ANDnu256(Bsi, Bso)) + t256 = #VPANDN_256(Bsi, Bso); + t256 ^= Bse; + E_4x[se] = t256; + + Ce ^= t256; + + t256 = A_4x[se]; + t256 ^= De; + A_4x[se] = t256; + Bsu = __rol_4u64(t256, 2); + + // E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); + t256 = #VPANDN_256(Bso, Bsu); + t256 ^= Bsi; + E_4x[si] = t256; + + Ci ^= t256; + + // E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); + t256 = #VPANDN_256(Bsu, Bsa); + t256 ^= Bso; + E_4x[so] = t256; + + Co ^= t256; + + // E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); + t256 = #VPANDN_256(Bsa, Bse); + t256 ^= Bsu; + E_4x[su] = t256; + + Cu ^= t256; + + return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; +} + +inline fn __second_last( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, inline int index, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25] +{ + 
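+ // Final-round variant of __second_even/__second_odd: round 23 is not + // followed by a theta step, so the column parities Ca..Cu are neither + // accumulated nor returned.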
reg u256 Bba, Bbe, Bbi, Bbo, Bbu; + reg u256 t256; + + t256 = A_4x[ba]; + t256 ^= Da; + A_4x[ba] = t256; + Bba = t256; + + t256 = A_4x[ge]; + t256 ^= De; + A_4x[ge] = t256; + Bbe = __rol_4u64(t256, 44); + + t256 = A_4x[ki]; + t256 ^= Di; + A_4x[ki] = t256; + Bbi = __rol_4u64(t256, 43); + + // E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); + t256 = #VPANDN_256(Bbe, Bbi); + t256 ^= Bba; + t256 ^= KeccakF1600RoundConstants[index]; + E_4x[ba] = t256; + + t256 = A_4x[mo]; + t256 ^= Do; + A_4x[mo] = t256; + Bbo = __rol_4u64(t256, 21); + + // E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); + t256 = #VPANDN_256(Bbi, Bbo); + t256 ^= Bbe; + E_4x[be] = t256; + + t256 = A_4x[su]; + t256 ^= Du; + A_4x[su] = t256; + Bbu = __rol_4u64(t256, 14); + + // E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); + t256 = #VPANDN_256(Bbo, Bbu); + t256 ^= Bbi; + E_4x[bi] = t256; + + // E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); + t256 = #VPANDN_256(Bbu, Bba); + t256 ^= Bbo; + E_4x[bo] = t256; + + // E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); + t256 = #VPANDN_256(Bba, Bbe); + t256 ^= Bbu; + E_4x[bu] = t256; + + return A_4x, E_4x; +} + +inline fn __third_last( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25] +{ + reg u256 Bga, Bge, Bgi, Bgo, Bgu; + reg u256 t256; + + t256 = A_4x[bo]; + t256 ^= Do; + A_4x[bo] = t256; + Bga = __rol_4u64(t256, 28); + + t256 = A_4x[gu]; + t256 ^= Du; + A_4x[gu] = t256; + Bge = __rol_4u64(t256, 20); + + t256 = A_4x[ka]; + t256 ^= Da; + A_4x[ka] = t256; + Bgi = __rol_4u64(t256, 3); + + // E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)) + t256 = #VPANDN_256(Bge, Bgi); + t256 ^= Bga; + E_4x[ga] = t256; + + t256 = A_4x[me]; + t256 ^= De; + A_4x[me] = t256; + Bgo = __rol_4u64(t256, 45); + + // E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)) + t256 = #VPANDN_256(Bgi, Bgo); + t256 ^= Bge; + E_4x[ge] = t256; + + t256 = A_4x[si]; + t256 ^= Di; + A_4x[si] = t256; + Bgu = __rol_4u64(t256, 61); + + // E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)) + t256 = #VPANDN_256(Bgo, Bgu); + t256 ^= Bgi; + E_4x[gi] = t256; + + // E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); + t256 = #VPANDN_256(Bgu, Bga); + t256 ^= Bgo; + E_4x[go] = t256; + + // E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); + t256 = #VPANDN_256(Bga, Bge); + t256 ^= Bgu; + E_4x[gu] = t256; + + return A_4x, E_4x; +} + +inline fn __fourth_last( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25] +{ + reg u256 Bka, Bke, Bki, Bko, Bku; + reg u256 t256; + + t256 = A_4x[be]; + t256 ^= De; + A_4x[be] = t256; + Bka = __rol_4u64(t256, 1); + + t256 = A_4x[gi]; + t256 ^= Di; + A_4x[gi] = t256; + Bke = __rol_4u64(t256, 6); + + t256 = A_4x[ko]; + t256 ^= Do; + A_4x[ko] = t256; + Bki = __rol_4u64(t256, 25); + + // E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); + t256 = #VPANDN_256(Bke, Bki); + t256 ^= Bka; + E_4x[ka] = t256; + + t256 = A_4x[mu]; + t256 ^= Du; + A_4x[mu] = t256; + Bko = __rol_4u64_rho8(t256); + + // E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); + t256 = #VPANDN_256(Bki, Bko); + t256 ^= Bke; + E_4x[ke] = t256; + + t256 = A_4x[sa]; + t256 ^= Da; + A_4x[sa] = t256; + Bku = __rol_4u64(t256, 18); + + // E##ki = XOR256(Bki, ANDnu256(Bko, Bku)) + t256 = #VPANDN_256(Bko, Bku); + t256 ^= Bki; + E_4x[ki] = t256; + + // E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); + t256 = #VPANDN_256(Bku, Bka); + t256 ^= Bko; + E_4x[ko] = t256; + + // E##ku = XOR256(Bku, ANDnu256(Bka, 
Bke)); + t256 = #VPANDN_256(Bka, Bke); + t256 ^= Bku; + E_4x[ku] = t256; + + return A_4x, E_4x; +} + +inline fn __fifth_last( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25] +{ + reg u256 Bma, Bme, Bmi, Bmo, Bmu; + reg u256 t256; + + t256 = A_4x[bu]; + t256 ^= Du; + A_4x[bu] = t256; + Bma = __rol_4u64(t256, 27); + + t256 = A_4x[ga]; + t256 ^= Da; + A_4x[ga] = t256; + Bme = __rol_4u64(t256, 36); + + t256 = A_4x[ke]; + t256 ^= De; + A_4x[ke] = t256; + Bmi = __rol_4u64(t256, 10); + + // E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); + t256 = #VPANDN_256(Bme, Bmi); + t256 ^= Bma; + E_4x[ma] = t256; + + t256 = A_4x[mi]; + t256 ^= Di; + A_4x[mi] = t256; + Bmo = __rol_4u64(t256, 15); + + // E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); + t256 = #VPANDN_256(Bmi, Bmo); + t256 ^= Bme; + E_4x[me] = t256; + + t256 = A_4x[so]; + t256 ^= Do; + A_4x[so] = t256; + Bmu = __rol_4u64_rho56(t256); + + // E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); + t256 = #VPANDN_256(Bmo, Bmu); + t256 ^= Bmi; + E_4x[mi] = t256; + + // E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); + t256 = #VPANDN_256(Bmu, Bma); + t256 ^= Bmo; + E_4x[mo] = t256; + + // E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); + t256 = #VPANDN_256(Bma, Bme); + t256 ^= Bmu; + E_4x[mu] = t256; + + return A_4x, E_4x; +} + +inline fn __sixth_last( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, +reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) +-> reg ptr u256[25], reg ptr u256[25] +{ + reg u256 Bsa, Bse, Bsi, Bso, Bsu; + reg u256 t256; + + t256 = A_4x[bi]; + t256 ^= Di; + A_4x[bi] = t256; + Bsa = __rol_4u64(t256, 62); + + t256 = A_4x[go]; + t256 ^= Do; + A_4x[go] = t256; + Bse = __rol_4u64(t256, 55); + + t256 = A_4x[ku]; + t256 ^= Du; + A_4x[ku] = t256; + Bsi = __rol_4u64(t256, 39); + + // E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); + t256 = #VPANDN_256(Bse, Bsi); + t256 ^= Bsa; + E_4x[sa] = t256; + + t256 = A_4x[ma]; + t256 ^= Da; + A_4x[ma] = t256; + Bso = __rol_4u64(t256, 41); + + // E##se = XOR256(Bse, ANDnu256(Bsi, Bso)) + t256 = #VPANDN_256(Bsi, Bso); + t256 ^= Bse; + E_4x[se] = t256; + + t256 = A_4x[se]; + t256 ^= De; + A_4x[se] = t256; + Bsu = __rol_4u64(t256, 2); + + // E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); + t256 = #VPANDN_256(Bso, Bsu); + t256 ^= Bsi; + E_4x[si] = t256; + + // E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); + t256 = #VPANDN_256(Bsu, Bsa); + t256 ^= Bso; + E_4x[so] = t256; + + // E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); + t256 = #VPANDN_256(Bsa, Bse); + t256 ^= Bsu; + E_4x[su] = t256; + + return A_4x, E_4x; +} + +inline fn __theta_rho_pi_chi_iota_prepare_theta_even( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, inline int index, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu) +-> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Da, De, Di, Do, Du; + + Da, De, Di, Do, Du = __first(Ca, Ce, Ci, Co, Cu); + + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __second_even(A_4x, E_4x, index, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); + + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __third_even(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); + + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __fourth_even(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); + + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __fifth_even(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); + + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __sixth_even(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); + + return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; +} + +inline fn 
__theta_rho_pi_chi_iota_prepare_theta_odd( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, inline int index, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu) +-> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 Da, De, Di, Do, Du; + + Da, De, Di, Do, Du = __first(Ca, Ce, Ci, Co, Cu); + + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __second_odd(A_4x, E_4x, index, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); + + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __third_odd(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); + + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __fourth_odd(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); + + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __fifth_odd(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); + + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __sixth_odd(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); + + return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; +} + +inline fn __theta_rho_pi_chi_iota( +reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, inline int index, +reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu) +-> reg ptr u256[25], reg ptr u256[25] +{ + reg u256 Da, De, Di, Do, Du; + + Da, De, Di, Do, Du = __first(Ca, Ce, Ci, Co, Cu); + + A_4x, E_4x = __second_last(A_4x, E_4x, index, Da, De, Di, Do, Du); + + A_4x, E_4x = __third_last(A_4x, E_4x, Da, De, Di, Do, Du); + + A_4x, E_4x = __fourth_last(A_4x, E_4x, Da, De, Di, Do, Du); + + A_4x, E_4x = __fifth_last(A_4x, E_4x, Da, De, Di, Do, Du); + + A_4x, E_4x = __sixth_last(A_4x, E_4x, Da, De, Di, Do, Du); + + return A_4x, E_4x; +} + +fn _KeccakF1600_StatePermute4x(reg ptr u256[25] A_4x) -> reg ptr u256[25] +{ + reg u256 Ca, Ce, Ci, Co, Cu; + + stack u256[25] E_4x; + + /** Rounds24 **/ + Ca, Ce, Ci, Co, Cu = __prepare_theta(A_4x); + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 0, Ca, Ce, Ci, Co, Cu); + E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 1, Ca, Ce, Ci, Co, Cu); + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 2, Ca, Ce, Ci, Co, Cu); + E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 3, Ca, Ce, Ci, Co, Cu); + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 4, Ca, Ce, Ci, Co, Cu); + E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 5, Ca, Ce, Ci, Co, Cu); + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 6, Ca, Ce, Ci, Co, Cu); + E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 7, Ca, Ce, Ci, Co, Cu); + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 8, Ca, Ce, Ci, Co, Cu); + E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 9, Ca, Ce, Ci, Co, Cu); + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 10, Ca, Ce, Ci, Co, Cu); + E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 11, Ca, Ce, Ci, Co, Cu); + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 12, Ca, Ce, Ci, Co, Cu); + E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 13, Ca, Ce, Ci, Co, Cu); + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 14, Ca, Ce, Ci, Co, Cu); + E_4x, A_4x, Ca, Ce, Ci, Co, Cu = 
__theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 15, Ca, Ce, Ci, Co, Cu); + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 16, Ca, Ce, Ci, Co, Cu); + E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 17, Ca, Ce, Ci, Co, Cu); + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 18, Ca, Ce, Ci, Co, Cu); + E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 19, Ca, Ce, Ci, Co, Cu); + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 20, Ca, Ce, Ci, Co, Cu); + E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 21, Ca, Ce, Ci, Co, Cu); + A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 22, Ca, Ce, Ci, Co, Cu); + E_4x, A_4x = __theta_rho_pi_chi_iota(E_4x, A_4x, 23, Ca, Ce, Ci, Co, Cu); + + + return A_4x; +} + + +fn _shake128_absorb4x_34(reg ptr u256[25] s, reg ptr u8[34] m0 m1 m2 m3) -> reg ptr u256[25] +{ + inline int i; + reg u256 t0 t1; + reg u16 t16; + reg u64 t64; + + for i = 0 to 25 + { + t0 = #set0_256(); + s[i] = t0; + } + + for i = 0 to 4 + { + t64 = m0[u64 i]; + s[u64 4 * i] ^= t64; + t64 = m1[u64 i]; + s[u64 4 * i + 1] ^= t64; + t64 = m2[u64 i]; + s[u64 4 * i + 2] ^= t64; + t64 = m3[u64 i]; + s[u64 4 * i + 3] ^= t64; + } + + t16 = m0.[u16 32]; + s[u16 64] ^= t16; + s[u8 130] ^= 0x1F; + + t16 = m1.[u16 32]; + s[u16 68] ^= t16; + s[u8 138] ^= 0x1F; + + t16 = m2.[u16 32]; + s[u16 72] ^= t16; + s[u8 146] ^= 0x1F; + + t16 = m3.[u16 32]; + s[u16 76] ^= t16; + s[u8 154] ^= 0x1F; + + t0 = shake_sep[u256 0]; + t1 = s[SHAKE128_RATE / 8 - 1]; + t0 = t0 ^ t1; + s[SHAKE128_RATE / 8 - 1] = t0; + + return s; +} + + +inline +fn __shake128_squeezeblock4x(reg ptr u256[25] state, reg ptr u8[SHAKE128_RATE] h0 h1 h2 h3) -> reg ptr u256[25], reg ptr u8[SHAKE128_RATE], reg ptr u8[SHAKE128_RATE], reg ptr u8[SHAKE128_RATE], reg ptr u8[SHAKE128_RATE] +{ + reg u256 t256; + reg u128 t128; + inline int i; + + state = _KeccakF1600_StatePermute4x(state); + + for i = 0 to (SHAKE128_RATE / 8) { + t256 = state[i]; + t128 = (128u)t256; + h0[u64 i] = #VMOVLPD(t128); + h1[u64 i] = #VMOVHPD(t128); + t128 = #VEXTRACTI128(t256, 1); + h2[u64 i] = #VMOVLPD(t128); + h3[u64 i] = #VMOVHPD(t128); + } + + return state, h0, h1, h2, h3; +} + + +fn _shake256_absorb4x_33(reg ptr u256[25] s, reg ptr u8[33] m0 m1 m2 m3) -> reg ptr u256[25] +{ + inline int i; + reg u256 t0 t1; + reg u64 t64; + reg u8 t8; + + for i = 0 to 25 + { + t0 = #set0_256(); + s[i] = t0; + } + + for i = 0 to 4 + { + t64 = m0[u64 i]; + s[u64 4 * i] ^= t64; + t64 = m1[u64 i]; + s[u64 4 * i + 1] ^= t64; + t64 = m2[u64 i]; + s[u64 4 * i + 2] ^= t64; + t64 = m3[u64 i]; + s[u64 4 * i + 3] ^= t64; + } + + t8 = m0[32]; + s[u8 128] ^= t8; + s[u8 129] ^= 0x1F; + + t8 = m1[32]; + s[u8 136] ^= t8; + s[u8 137] ^= 0x1F; + + t8 = m2[32]; + s[u8 144] ^= t8; + s[u8 145] ^= 0x1F; + + t8 = m3[32]; + s[u8 152] ^= t8; + s[u8 153] ^= 0x1F; + + t0 = shake_sep[u256 0]; + t1 = s[SHAKE256_RATE / 8 - 1]; + t0 = t0 ^ t1; + s[SHAKE256_RATE / 8 - 1] = t0; + + return s; +} + + +inline +fn __shake256_squeezeblock4x(reg ptr u256[25] state, reg ptr u8[SHAKE256_RATE] h0 h1 h2 h3) -> reg ptr u256[25], reg ptr u8[SHAKE256_RATE], reg ptr u8[SHAKE256_RATE], reg ptr u8[SHAKE256_RATE], reg ptr u8[SHAKE256_RATE] +{ + reg u256 t256; + reg u128 t128; + inline int i; + + state = _KeccakF1600_StatePermute4x(state); + + for i = 0 to (SHAKE256_RATE 
/ 8) { + t256 = state[i]; + t128 = (128u)t256; + h0[u64 i] = #VMOVLPD(t128); + h1[u64 i] = #VMOVHPD(t128); + t128 = #VEXTRACTI128(t256, 1); + h2[u64 i] = #VMOVLPD(t128); + h3[u64 i] = #VMOVHPD(t128); + } + + return state, h0, h1, h2, h3; +} diff --git a/code/jasmin/mlkem_avx2/fips202_common.jinc b/code/jasmin/mlkem_avx2/fips202_common.jinc new file mode 100644 index 00000000..0ed82a08 --- /dev/null +++ b/code/jasmin/mlkem_avx2/fips202_common.jinc @@ -0,0 +1,6 @@ +param int SHAKE128_RATE = 168; +param int SHAKE256_RATE = 136; +param int SHA3_256_RATE = 136; +param int SHA3_512_RATE = 72; + +u64[4] shake_sep = {9223372036854775808, 9223372036854775808, 9223372036854775808, 9223372036854775808}; diff --git a/code/jasmin/mlkem_avx2/fq.S b/code/jasmin/mlkem_avx2/fq.S new file mode 100644 index 00000000..d4c5c902 --- /dev/null +++ b/code/jasmin/mlkem_avx2/fq.S @@ -0,0 +1,129 @@ +#include "consts.h" +.include "fq.inc" + +.text +reduce128_avx: +#load +vmovdqa (%rdi),%ymm2 +vmovdqa 32(%rdi),%ymm3 +vmovdqa 64(%rdi),%ymm4 +vmovdqa 96(%rdi),%ymm5 +vmovdqa 128(%rdi),%ymm6 +vmovdqa 160(%rdi),%ymm7 +vmovdqa 192(%rdi),%ymm8 +vmovdqa 224(%rdi),%ymm9 + +red16 2,10 +red16 3,11 +red16 4,12 +red16 5,13 +red16 6,14 +red16 7,15 +red16 8,10 +red16 9,11 + +#store +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm4,64(%rdi) +vmovdqa %ymm5,96(%rdi) +vmovdqa %ymm6,128(%rdi) +vmovdqa %ymm7,160(%rdi) +vmovdqa %ymm8,192(%rdi) +vmovdqa %ymm9,224(%rdi) + +ret + +.global cdecl(reduce_avx) +cdecl(reduce_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XV*2(%rsi),%ymm1 +call reduce128_avx +add $256,%rdi +call reduce128_avx +ret + +csubq128_avx: +#load +vmovdqa (%rdi),%ymm1 +vmovdqa 32(%rdi),%ymm2 +vmovdqa 64(%rdi),%ymm3 +vmovdqa 96(%rdi),%ymm4 +vmovdqa 128(%rdi),%ymm5 +vmovdqa 160(%rdi),%ymm6 +vmovdqa 192(%rdi),%ymm7 +vmovdqa 224(%rdi),%ymm8 + +csubq 1,9 +csubq 2,10 +csubq 3,11 +csubq 4,12 +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,9 + +#store +vmovdqa %ymm1,(%rdi) +vmovdqa %ymm2,32(%rdi) +vmovdqa %ymm3,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm5,128(%rdi) +vmovdqa %ymm6,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm8,224(%rdi) + +ret + +.global cdecl(csubq_avx) +cdecl(csubq_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +call csubq128_avx +add $256,%rdi +call csubq128_avx +ret + +tomont128_avx: +#load +vmovdqa (%rdi),%ymm3 +vmovdqa 32(%rdi),%ymm4 +vmovdqa 64(%rdi),%ymm5 +vmovdqa 96(%rdi),%ymm6 +vmovdqa 128(%rdi),%ymm7 +vmovdqa 160(%rdi),%ymm8 +vmovdqa 192(%rdi),%ymm9 +vmovdqa 224(%rdi),%ymm10 + +fqmulprecomp 1,2,3,11 +fqmulprecomp 1,2,4,12 +fqmulprecomp 1,2,5,13 +fqmulprecomp 1,2,6,14 +fqmulprecomp 1,2,7,15 +fqmulprecomp 1,2,8,11 +fqmulprecomp 1,2,9,12 +fqmulprecomp 1,2,10,13 + +#store +vmovdqa %ymm3,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm5,64(%rdi) +vmovdqa %ymm6,96(%rdi) +vmovdqa %ymm7,128(%rdi) +vmovdqa %ymm8,160(%rdi) +vmovdqa %ymm9,192(%rdi) +vmovdqa %ymm10,224(%rdi) + +ret + +.global cdecl(tomont_avx) +cdecl(tomont_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 +vmovdqa _16XMONTSQHI*2(%rsi),%ymm2 +call tomont128_avx +add $256,%rdi +call tomont128_avx +ret diff --git a/code/jasmin/mlkem_avx2/fq.inc b/code/jasmin/mlkem_avx2/fq.inc new file mode 100644 index 00000000..4cb28a8e --- /dev/null +++ b/code/jasmin/mlkem_avx2/fq.inc @@ -0,0 +1,26 @@ +.macro red16 r,x=12 +vpmulhw %ymm1,%ymm\r,%ymm\x +vpsraw $10,%ymm\x,%ymm\x +vpmullw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro csubq r,x=12 +vpsubw %ymm0,%ymm\r,%ymm\r +vpsraw $15,%ymm\r,%ymm\x +vpand 
%ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro caddq r,x=12 +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro fqmulprecomp al,ah,b,x=12 +vpmullw %ymm\al,%ymm\b,%ymm\x +vpmulhw %ymm\ah,%ymm\b,%ymm\b +vpmulhw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\b,%ymm\b +.endm diff --git a/code/jasmin/mlkem_avx2/gen_matrix.jazz b/code/jasmin/mlkem_avx2/gen_matrix.jazz new file mode 100644 index 00000000..3789bb29 --- /dev/null +++ b/code/jasmin/mlkem_avx2/gen_matrix.jazz @@ -0,0 +1,59 @@ +require "gen_matrix.jinc" +/* +require "gen_matrix_old.jinc" + +export fn gen_matrix_old_jazz(reg u64 ap, reg u64 seedp) +{ + stack u16[KYBER_K*KYBER_VECN] aa; + stack u8[KYBER_SYMBYTES] seed; + reg u8 c; + reg u16 t; + inline int i; + stack u64 sap; + + sap = ap; + + for i = 0 to KYBER_SYMBYTES + { + c = (u8)[seedp + i]; + seed[i] = c; + } + + aa = __gen_matrix_old(seed, 1); + + ap = sap; + + for i = 0 to KYBER_K*KYBER_VECN + { + t = aa[i]; + (u16)[ap + 2*i] = t; + } +} +*/ +export fn gen_matrix_jazz(reg u64 ap, reg u64 seedp) +{ + stack u16[KYBER_K*KYBER_VECN] aa; + stack u8[KYBER_SYMBYTES] seed; + reg u8 c; + reg u16 t; + inline int i; + stack u64 sap; + + sap = ap; + + for i = 0 to KYBER_SYMBYTES + { + c = (u8)[seedp + i]; + seed[i] = c; + } + + aa = __gen_matrix(seed, 1); + + ap = sap; + + for i = 0 to KYBER_K*KYBER_VECN + { + t = aa[i]; + (u16)[ap + 2*i] = t; + } +} diff --git a/code/jasmin/mlkem_avx2/gen_matrix.jinc b/code/jasmin/mlkem_avx2/gen_matrix.jinc new file mode 100644 index 00000000..6e07b4b7 --- /dev/null +++ b/code/jasmin/mlkem_avx2/gen_matrix.jinc @@ -0,0 +1,137 @@ +require "params.jinc" +require "shuffle.jinc" +require "fips202.jinc" +require "params.jinc" + +inline +fn __rej_uniform(stack u16[KYBER_N] rp, reg u64 offset, stack u8[SHAKE128_RATE] buf) -> reg u64, stack u16[KYBER_N] +{ + reg u16 val1 val2; + reg u16 t; + reg u64 pos ctr; + reg u64 cnd0 cnd1 exit; + + + ctr = offset; + pos = 0; + exit = 0; + + while(exit == 0) + { + val1 = (16u)buf[(int)pos]; + pos += 1; + t = (16u)buf[(int)pos]; + val2 = t; + val2 >>= 4; + t &= 0x0F; + t <<= 8; + val1 |= t; + pos += 1; + + t = (16u)buf[(int)pos]; + t <<= 4; + val2 |= t; + pos += 1; + + if(val1 < KYBER_Q) + { + rp[(int)ctr] = val1; + ctr += 1; + } + + if(val2 < KYBER_Q) + { + if(ctr < KYBER_N) + { + rp[(int)ctr] = val2; + ctr += 1; + } + } + + // Check if we should exit the loop + cnd0 = KYBER_N; + cnd0 -= ctr; + cnd0 -= 1; + cnd1 = SHAKE128_RATE; + cnd1 -= pos; + cnd1 -= 3; //TODO: (potentially) wasting 2 'good' bytes + exit = cnd0 | cnd1; + exit >>= 63; + } + + return ctr, rp; +} + +inline +fn __gen_matrix(stack u8[KYBER_SYMBYTES] seed, reg u64 transposed) -> stack u16[KYBER_K*KYBER_VECN] +{ + stack u8[34] extseed; + stack u8[SHAKE128_RATE] buf; + stack u64[25] state; + stack u16[KYBER_N] poly; + stack u16[KYBER_K*KYBER_VECN] r; + + reg u8 c; + reg u16 t; + reg u64 ctr k l; + stack u64 sctr; + stack u64 stransposed; + inline int j i; + + stransposed = transposed; + + for j = 0 to KYBER_SYMBYTES + { + c = seed[j]; + extseed[j] = c; + } + + for i=0 to KYBER_K + { + for j = 0 to KYBER_K + { + transposed = stransposed; + if(transposed == 0) + { + extseed[KYBER_SYMBYTES] = j; + extseed[KYBER_SYMBYTES+1] = i; + } + else + { + extseed[KYBER_SYMBYTES] = i; + extseed[KYBER_SYMBYTES+1] = j; + } + + state = _shake128_absorb34(state, extseed); + + ctr = 0; + while (ctr < KYBER_N) + { + sctr = ctr; + state, buf = _shake128_squeezeblock(state, buf); + ctr = sctr; + ctr, poly = __rej_uniform(poly, 
ctr, buf); + } + + k = 0; + l = i * KYBER_VECN + j * KYBER_N; + while (k < KYBER_N) + { + t = poly[(int) k]; + r[(int) l] = t; + k += 1; + l += 1; + } + } + } + + for i = 0 to KYBER_K + { + for j = 0 to KYBER_K + { + r[i*KYBER_VECN+j*KYBER_N:KYBER_N] = _nttunpack(r[i*KYBER_VECN+j*KYBER_N:KYBER_N]); + } + } + + return r; +} diff --git a/code/jasmin/mlkem_avx2/gen_matrix.jinc.try0 b/code/jasmin/mlkem_avx2/gen_matrix.jinc.try0 new file mode 100644 index 00000000..9c3f758d --- /dev/null +++ b/code/jasmin/mlkem_avx2/gen_matrix.jinc.try0 @@ -0,0 +1,940 @@ +/** +benchmarks with this file + + our / supercop +key 83712/ 79134 skylake +enc 96680/ 74866 +dec 83562/ 65006 + +key 74472/ 71588 haswell +enc 89524/ 72472 +dec 77096/ 61512 + +key 93730/ 91723 alderlake +enc 109251/ 85250 +dec 97006/ 73901 + +**/ + +require "params.jinc" +require "consts.jinc" +require "shuffle.jinc" +require "fips202.jinc" +require "fips202_4x.jinc" + +param int GENMATRIX_NBLOCKS = ((12*KYBER_N/8*4096/KYBER_Q + SHAKE128_RATE)/SHAKE128_RATE); +param int REJ_UNIFORM_AVX_BUFLEN = GENMATRIX_NBLOCKS * SHAKE128_RATE; + +param int USE_AVX2_REJECTION = 0; +param int USE_SQUEEZE_N = 0; + +u8[2048] ru_idx = {-1, -1, -1, -1, -1, -1, -1, -1, + 0, -1, -1, -1, -1, -1, -1, -1, + 2, -1, -1, -1, -1, -1, -1, -1, + 0, 2, -1, -1, -1, -1, -1, -1, + 4, -1, -1, -1, -1, -1, -1, -1, + 0, 4, -1, -1, -1, -1, -1, -1, + 2, 4, -1, -1, -1, -1, -1, -1, + 0, 2, 4, -1, -1, -1, -1, -1, + 6, -1, -1, -1, -1, -1, -1, -1, + 0, 6, -1, -1, -1, -1, -1, -1, + 2, 6, -1, -1, -1, -1, -1, -1, + 0, 2, 6, -1, -1, -1, -1, -1, + 4, 6, -1, -1, -1, -1, -1, -1, + 0, 4, 6, -1, -1, -1, -1, -1, + 2, 4, 6, -1, -1, -1, -1, -1, + 0, 2, 4, 6, -1, -1, -1, -1, + 8, -1, -1, -1, -1, -1, -1, -1, + 0, 8, -1, -1, -1, -1, -1, -1, + 2, 8, -1, -1, -1, -1, -1, -1, + 0, 2, 8, -1, -1, -1, -1, -1, + 4, 8, -1, -1, -1, -1, -1, -1, + 0, 4, 8, -1, -1, -1, -1, -1, + 2, 4, 8, -1, -1, -1, -1, -1, + 0, 2, 4, 8, -1, -1, -1, -1, + 6, 8, -1, -1, -1, -1, -1, -1, + 0, 6, 8, -1, -1, -1, -1, -1, + 2, 6, 8, -1, -1, -1, -1, -1, + 0, 2, 6, 8, -1, -1, -1, -1, + 4, 6, 8, -1, -1, -1, -1, -1, + 0, 4, 6, 8, -1, -1, -1, -1, + 2, 4, 6, 8, -1, -1, -1, -1, + 0, 2, 4, 6, 8, -1, -1, -1, + 10, -1, -1, -1, -1, -1, -1, -1, + 0, 10, -1, -1, -1, -1, -1, -1, + 2, 10, -1, -1, -1, -1, -1, -1, + 0, 2, 10, -1, -1, -1, -1, -1, + 4, 10, -1, -1, -1, -1, -1, -1, + 0, 4, 10, -1, -1, -1, -1, -1, + 2, 4, 10, -1, -1, -1, -1, -1, + 0, 2, 4, 10, -1, -1, -1, -1, + 6, 10, -1, -1, -1, -1, -1, -1, + 0, 6, 10, -1, -1, -1, -1, -1, + 2, 6, 10, -1, -1, -1, -1, -1, + 0, 2, 6, 10, -1, -1, -1, -1, + 4, 6, 10, -1, -1, -1, -1, -1, + 0, 4, 6, 10, -1, -1, -1, -1, + 2, 4, 6, 10, -1, -1, -1, -1, + 0, 2, 4, 6, 10, -1, -1, -1, + 8, 10, -1, -1, -1, -1, -1, -1, + 0, 8, 10, -1, -1, -1, -1, -1, + 2, 8, 10, -1, -1, -1, -1, -1, + 0, 2, 8, 10, -1, -1, -1, -1, + 4, 8, 10, -1, -1, -1, -1, -1, + 0, 4, 8, 10, -1, -1, -1, -1, + 2, 4, 8, 10, -1, -1, -1, -1, + 0, 2, 4, 8, 10, -1, -1, -1, + 6, 8, 10, -1, -1, -1, -1, -1, + 0, 6, 8, 10, -1, -1, -1, -1, + 2, 6, 8, 10, -1, -1, -1, -1, + 0, 2, 6, 8, 10, -1, -1, -1, + 4, 6, 8, 10, -1, -1, -1, -1, + 0, 4, 6, 8, 10, -1, -1, -1, + 2, 4, 6, 8, 10, -1, -1, -1, + 0, 2, 4, 6, 8, 10, -1, -1, + 12, -1, -1, -1, -1, -1, -1, -1, + 0, 12, -1, -1, -1, -1, -1, -1, + 2, 12, -1, -1, -1, -1, -1, -1, + 0, 2, 12, -1, -1, -1, -1, -1, + 4, 12, -1, -1, -1, -1, -1, -1, + 0, 4, 12, -1, -1, -1, -1, -1, + 2, 4, 12, -1, -1, -1, -1, -1, + 0, 2, 4, 12, -1, -1, -1, -1, + 6, 12, -1, -1, -1, -1, -1, -1, + 0, 6, 12, -1, -1, -1, -1, -1, + 2, 6, 12, -1, -1, -1, -1, -1, 
+ 0, 2, 6, 12, -1, -1, -1, -1, + 4, 6, 12, -1, -1, -1, -1, -1, + 0, 4, 6, 12, -1, -1, -1, -1, + 2, 4, 6, 12, -1, -1, -1, -1, + 0, 2, 4, 6, 12, -1, -1, -1, + 8, 12, -1, -1, -1, -1, -1, -1, + 0, 8, 12, -1, -1, -1, -1, -1, + 2, 8, 12, -1, -1, -1, -1, -1, + 0, 2, 8, 12, -1, -1, -1, -1, + 4, 8, 12, -1, -1, -1, -1, -1, + 0, 4, 8, 12, -1, -1, -1, -1, + 2, 4, 8, 12, -1, -1, -1, -1, + 0, 2, 4, 8, 12, -1, -1, -1, + 6, 8, 12, -1, -1, -1, -1, -1, + 0, 6, 8, 12, -1, -1, -1, -1, + 2, 6, 8, 12, -1, -1, -1, -1, + 0, 2, 6, 8, 12, -1, -1, -1, + 4, 6, 8, 12, -1, -1, -1, -1, + 0, 4, 6, 8, 12, -1, -1, -1, + 2, 4, 6, 8, 12, -1, -1, -1, + 0, 2, 4, 6, 8, 12, -1, -1, + 10, 12, -1, -1, -1, -1, -1, -1, + 0, 10, 12, -1, -1, -1, -1, -1, + 2, 10, 12, -1, -1, -1, -1, -1, + 0, 2, 10, 12, -1, -1, -1, -1, + 4, 10, 12, -1, -1, -1, -1, -1, + 0, 4, 10, 12, -1, -1, -1, -1, + 2, 4, 10, 12, -1, -1, -1, -1, + 0, 2, 4, 10, 12, -1, -1, -1, + 6, 10, 12, -1, -1, -1, -1, -1, + 0, 6, 10, 12, -1, -1, -1, -1, + 2, 6, 10, 12, -1, -1, -1, -1, + 0, 2, 6, 10, 12, -1, -1, -1, + 4, 6, 10, 12, -1, -1, -1, -1, + 0, 4, 6, 10, 12, -1, -1, -1, + 2, 4, 6, 10, 12, -1, -1, -1, + 0, 2, 4, 6, 10, 12, -1, -1, + 8, 10, 12, -1, -1, -1, -1, -1, + 0, 8, 10, 12, -1, -1, -1, -1, + 2, 8, 10, 12, -1, -1, -1, -1, + 0, 2, 8, 10, 12, -1, -1, -1, + 4, 8, 10, 12, -1, -1, -1, -1, + 0, 4, 8, 10, 12, -1, -1, -1, + 2, 4, 8, 10, 12, -1, -1, -1, + 0, 2, 4, 8, 10, 12, -1, -1, + 6, 8, 10, 12, -1, -1, -1, -1, + 0, 6, 8, 10, 12, -1, -1, -1, + 2, 6, 8, 10, 12, -1, -1, -1, + 0, 2, 6, 8, 10, 12, -1, -1, + 4, 6, 8, 10, 12, -1, -1, -1, + 0, 4, 6, 8, 10, 12, -1, -1, + 2, 4, 6, 8, 10, 12, -1, -1, + 0, 2, 4, 6, 8, 10, 12, -1, + 14, -1, -1, -1, -1, -1, -1, -1, + 0, 14, -1, -1, -1, -1, -1, -1, + 2, 14, -1, -1, -1, -1, -1, -1, + 0, 2, 14, -1, -1, -1, -1, -1, + 4, 14, -1, -1, -1, -1, -1, -1, + 0, 4, 14, -1, -1, -1, -1, -1, + 2, 4, 14, -1, -1, -1, -1, -1, + 0, 2, 4, 14, -1, -1, -1, -1, + 6, 14, -1, -1, -1, -1, -1, -1, + 0, 6, 14, -1, -1, -1, -1, -1, + 2, 6, 14, -1, -1, -1, -1, -1, + 0, 2, 6, 14, -1, -1, -1, -1, + 4, 6, 14, -1, -1, -1, -1, -1, + 0, 4, 6, 14, -1, -1, -1, -1, + 2, 4, 6, 14, -1, -1, -1, -1, + 0, 2, 4, 6, 14, -1, -1, -1, + 8, 14, -1, -1, -1, -1, -1, -1, + 0, 8, 14, -1, -1, -1, -1, -1, + 2, 8, 14, -1, -1, -1, -1, -1, + 0, 2, 8, 14, -1, -1, -1, -1, + 4, 8, 14, -1, -1, -1, -1, -1, + 0, 4, 8, 14, -1, -1, -1, -1, + 2, 4, 8, 14, -1, -1, -1, -1, + 0, 2, 4, 8, 14, -1, -1, -1, + 6, 8, 14, -1, -1, -1, -1, -1, + 0, 6, 8, 14, -1, -1, -1, -1, + 2, 6, 8, 14, -1, -1, -1, -1, + 0, 2, 6, 8, 14, -1, -1, -1, + 4, 6, 8, 14, -1, -1, -1, -1, + 0, 4, 6, 8, 14, -1, -1, -1, + 2, 4, 6, 8, 14, -1, -1, -1, + 0, 2, 4, 6, 8, 14, -1, -1, + 10, 14, -1, -1, -1, -1, -1, -1, + 0, 10, 14, -1, -1, -1, -1, -1, + 2, 10, 14, -1, -1, -1, -1, -1, + 0, 2, 10, 14, -1, -1, -1, -1, + 4, 10, 14, -1, -1, -1, -1, -1, + 0, 4, 10, 14, -1, -1, -1, -1, + 2, 4, 10, 14, -1, -1, -1, -1, + 0, 2, 4, 10, 14, -1, -1, -1, + 6, 10, 14, -1, -1, -1, -1, -1, + 0, 6, 10, 14, -1, -1, -1, -1, + 2, 6, 10, 14, -1, -1, -1, -1, + 0, 2, 6, 10, 14, -1, -1, -1, + 4, 6, 10, 14, -1, -1, -1, -1, + 0, 4, 6, 10, 14, -1, -1, -1, + 2, 4, 6, 10, 14, -1, -1, -1, + 0, 2, 4, 6, 10, 14, -1, -1, + 8, 10, 14, -1, -1, -1, -1, -1, + 0, 8, 10, 14, -1, -1, -1, -1, + 2, 8, 10, 14, -1, -1, -1, -1, + 0, 2, 8, 10, 14, -1, -1, -1, + 4, 8, 10, 14, -1, -1, -1, -1, + 0, 4, 8, 10, 14, -1, -1, -1, + 2, 4, 8, 10, 14, -1, -1, -1, + 0, 2, 4, 8, 10, 14, -1, -1, + 6, 8, 10, 14, -1, -1, -1, -1, + 0, 6, 8, 10, 14, -1, -1, -1, + 2, 6, 8, 10, 14, -1, -1, -1, + 0, 2, 6, 8, 10, 14, -1, -1, 
+ 4, 6, 8, 10, 14, -1, -1, -1, + 0, 4, 6, 8, 10, 14, -1, -1, + 2, 4, 6, 8, 10, 14, -1, -1, + 0, 2, 4, 6, 8, 10, 14, -1, + 12, 14, -1, -1, -1, -1, -1, -1, + 0, 12, 14, -1, -1, -1, -1, -1, + 2, 12, 14, -1, -1, -1, -1, -1, + 0, 2, 12, 14, -1, -1, -1, -1, + 4, 12, 14, -1, -1, -1, -1, -1, + 0, 4, 12, 14, -1, -1, -1, -1, + 2, 4, 12, 14, -1, -1, -1, -1, + 0, 2, 4, 12, 14, -1, -1, -1, + 6, 12, 14, -1, -1, -1, -1, -1, + 0, 6, 12, 14, -1, -1, -1, -1, + 2, 6, 12, 14, -1, -1, -1, -1, + 0, 2, 6, 12, 14, -1, -1, -1, + 4, 6, 12, 14, -1, -1, -1, -1, + 0, 4, 6, 12, 14, -1, -1, -1, + 2, 4, 6, 12, 14, -1, -1, -1, + 0, 2, 4, 6, 12, 14, -1, -1, + 8, 12, 14, -1, -1, -1, -1, -1, + 0, 8, 12, 14, -1, -1, -1, -1, + 2, 8, 12, 14, -1, -1, -1, -1, + 0, 2, 8, 12, 14, -1, -1, -1, + 4, 8, 12, 14, -1, -1, -1, -1, + 0, 4, 8, 12, 14, -1, -1, -1, + 2, 4, 8, 12, 14, -1, -1, -1, + 0, 2, 4, 8, 12, 14, -1, -1, + 6, 8, 12, 14, -1, -1, -1, -1, + 0, 6, 8, 12, 14, -1, -1, -1, + 2, 6, 8, 12, 14, -1, -1, -1, + 0, 2, 6, 8, 12, 14, -1, -1, + 4, 6, 8, 12, 14, -1, -1, -1, + 0, 4, 6, 8, 12, 14, -1, -1, + 2, 4, 6, 8, 12, 14, -1, -1, + 0, 2, 4, 6, 8, 12, 14, -1, + 10, 12, 14, -1, -1, -1, -1, -1, + 0, 10, 12, 14, -1, -1, -1, -1, + 2, 10, 12, 14, -1, -1, -1, -1, + 0, 2, 10, 12, 14, -1, -1, -1, + 4, 10, 12, 14, -1, -1, -1, -1, + 0, 4, 10, 12, 14, -1, -1, -1, + 2, 4, 10, 12, 14, -1, -1, -1, + 0, 2, 4, 10, 12, 14, -1, -1, + 6, 10, 12, 14, -1, -1, -1, -1, + 0, 6, 10, 12, 14, -1, -1, -1, + 2, 6, 10, 12, 14, -1, -1, -1, + 0, 2, 6, 10, 12, 14, -1, -1, + 4, 6, 10, 12, 14, -1, -1, -1, + 0, 4, 6, 10, 12, 14, -1, -1, + 2, 4, 6, 10, 12, 14, -1, -1, + 0, 2, 4, 6, 10, 12, 14, -1, + 8, 10, 12, 14, -1, -1, -1, -1, + 0, 8, 10, 12, 14, -1, -1, -1, + 2, 8, 10, 12, 14, -1, -1, -1, + 0, 2, 8, 10, 12, 14, -1, -1, + 4, 8, 10, 12, 14, -1, -1, -1, + 0, 4, 8, 10, 12, 14, -1, -1, + 2, 4, 8, 10, 12, 14, -1, -1, + 0, 2, 4, 8, 10, 12, 14, -1, + 6, 8, 10, 12, 14, -1, -1, -1, + 0, 6, 8, 10, 12, 14, -1, -1, + 2, 6, 8, 10, 12, 14, -1, -1, + 0, 2, 6, 8, 10, 12, 14, -1, + 4, 6, 8, 10, 12, 14, -1, -1, + 0, 4, 6, 8, 10, 12, 14, -1, + 2, 4, 6, 8, 10, 12, 14, -1, + 0, 2, 4, 6, 8, 10, 12, 14}; + +inline +fn __shake128_squeezenblocks(stack u64[25] state, stack u8[REJ_UNIFORM_AVX_BUFLEN] out) + -> stack u64[25], stack u8[REJ_UNIFORM_AVX_BUFLEN] +{ + inline int i; + + for i = 0 to GENMATRIX_NBLOCKS + { + state, out[i*SHAKE128_RATE:SHAKE128_RATE] = _shake128_squeezeblock(state, out[i*SHAKE128_RATE:SHAKE128_RATE]); + } + return state, out; +} + +inline +fn __shake128_squeezenblocks4x(reg ptr u256[25] state, reg ptr u8[REJ_UNIFORM_AVX_BUFLEN] h0 h1 h2 h3) + -> reg ptr u256[25], reg ptr u8[REJ_UNIFORM_AVX_BUFLEN], reg ptr u8[REJ_UNIFORM_AVX_BUFLEN], reg ptr u8[REJ_UNIFORM_AVX_BUFLEN], reg ptr u8[REJ_UNIFORM_AVX_BUFLEN] +{ + inline int i; + + for i = 0 to GENMATRIX_NBLOCKS + { + state, h0[i*SHAKE128_RATE:SHAKE128_RATE], h1[i*SHAKE128_RATE:SHAKE128_RATE], h2[i*SHAKE128_RATE:SHAKE128_RATE], h3[i*SHAKE128_RATE:SHAKE128_RATE] = __shake128_squeezeblock4x(state, h0[i*SHAKE128_RATE:SHAKE128_RATE], h1[i*SHAKE128_RATE:SHAKE128_RATE], h2[i*SHAKE128_RATE:SHAKE128_RATE], h3[i*SHAKE128_RATE:SHAKE128_RATE]); + } + + return state, h0, h1, h2, h3; +} + +inline +fn __rej_uniform(reg ptr u16[KYBER_N] rp, reg u64 offset, reg ptr u8[SHAKE128_RATE] buf, inline int buflen) -> reg u64, stack u16[KYBER_N] +{ + reg u16 val0 val1; + reg u16 t; + reg u64 pos ctr; + reg u8 fl1 fl2; + reg bool cf zf b; + + ctr = offset; + pos = 0; + + _, cf, _, _, zf = #CMP_64(ctr, KYBER_N - 1); + fl1 = #SETcc(cf || zf); //SETBE + + _, 
cf, _, _, zf = #CMP_64(pos, buflen - 3); + fl2 = #SETcc(cf || zf); //SETBE + + _, _, _, _, b = #TEST_8(fl1, fl2); + + while(!b) + { + val0 = (16u)buf[(int)pos]; + pos += 1; + + t = (16u)buf[(int)pos]; + val1 = t; + val1 >>= 4; + + t &= 0x0F; + t <<= 8; + val0 |= t; + pos += 1; + + t = (16u)buf[(int)pos]; + t <<= 4; + val1 |= t; + pos += 1; + + if(val0 < KYBER_Q) + { + rp[(int)ctr] = val0; + ctr += 1; + } + + if(ctr < KYBER_N) + { + if(val1 < KYBER_Q) + { + rp[(int)ctr] = val1; + ctr += 1; + } + } + + _, cf, _, _, zf = #CMP_64(ctr, KYBER_N - 1); + fl1 = #SETcc(cf || zf); //SETBE + + _, cf, _, _, zf = #CMP_64(pos, buflen - 3); + fl2 = #SETcc(cf || zf); //SETBE + + _, _, _, _, b = #TEST_8(fl1, fl2); + } + + return ctr, rp; +} + +fn _rej_uniformn(reg ptr u16[KYBER_N] rp, reg ptr u8[REJ_UNIFORM_AVX_BUFLEN] buf) -> reg u64, reg ptr u16[KYBER_N] +{ + reg u16 val0 val1; + reg u16 t; + reg u64 pos ctr; + reg u8 fl1 fl2; + reg bool b; + + ctr = 0; + pos = 0; + + ?{ "==" = b } = #CMP_64(pos, 1); + + while(!b) + { + val0 = (16u)buf[(int)pos]; + pos += 1; + + t = (16u)buf[(int)pos]; + val1 = t; + val1 >>= 4; + + t &= 0x0F; + t <<= 8; + val0 |= t; + pos += 1; + + t = (16u)buf[(int)pos]; + t <<= 4; + val1 |= t; + pos += 1; + + if(val0 < KYBER_Q) + { + rp[(int)ctr] = val0; + ctr += 1; + } + + if(ctr < KYBER_N) + { + if(val1 < KYBER_Q) + { + rp[(int)ctr] = val1; + ctr += 1; + } + } + + ?{ "<=u" = b } = #CMP_64(ctr, KYBER_N - 1); + fl1 = #SETcc(b); + + ?{ "<=u" = b } = #CMP_64(pos, REJ_UNIFORM_AVX_BUFLEN - 3); + fl2 = #SETcc(b); + + _, _, _, _, b = #TEST_8(fl1, fl2); + } + + return ctr, rp; +} + +u8 ru_ones_s = 1; +u16 ru_mask_s = 0x0FFF; +u8[32] ru_idx8_s = {0, 1, 1, 2, 3, 4, 4, 5, + 6, 7, 7, 8, 9, 10, 10, 11, + 4, 5, 5, 6, 7, 8, 8, 9, + 10, 11, 11, 12, 13, 14, 14, 15}; + +fn _rej_uniform_avx(reg ptr u16[KYBER_N] rp, reg ptr u8[REJ_UNIFORM_AVX_BUFLEN] buf) -> reg u64, reg ptr u16[KYBER_N] +{ + reg u256 f0 f1 g0 g1 g2 g3; + reg u256 bound ones mask idx8; + reg u128 f t l h; + reg u64 pos ctr t64 t64_1 t64_2 t64_3; + reg u64 good; + reg u16 val0 val1 t16; + reg ptr u8[2048] idxp; + reg u8 fl1 fl2; + reg bool cf zf b; + + idxp = ru_idx; + + bound = jqx16[u256 0]; + ctr = 0; + pos = 0; + ones = #VPBROADCAST_32u8(ru_ones_s); + mask = #VPBROADCAST_16u16(ru_mask_s); + idx8 = ru_idx8_s[u256 0]; + + _, cf, _, _, zf = #CMP_64(ctr, KYBER_N - 32); + fl1 = #SETcc(cf || zf); + + _, cf, _, _, zf = #CMP_64(pos, REJ_UNIFORM_AVX_BUFLEN - 48); + fl2 = #SETcc(cf || zf); + + _, _, _, _, b = #TEST_8(fl1, fl2); + + while(!b) + { + f0 = #VPERMQ(buf.[u256 (int)pos], 0x94); + f1 = #VPERMQ(buf.[u256 24 + (int)pos], 0x94); + f0 = #VPSHUFB_256(f0, idx8); + f1 = #VPSHUFB_256(f1, idx8); + g0 = #VPSRL_16u16(f0, 4); + g1 = #VPSRL_16u16(f1, 4); + f0 = #VPBLEND_16u16(f0, g0, 0xAA); + f1 = #VPBLEND_16u16(f1, g1, 0xAA); + f0 = #VPAND_256(f0, mask); + f1 = #VPAND_256(f1, mask); + + g0 = #VPCMPGT_16u16(bound, f0); + g1 = #VPCMPGT_16u16(bound, f1); + + g0 = #VPACKSS_16u16(g0, g1); + good = #VPMOVMSKB_u256u64(g0); + + t64 = good; + t64 &= 0xFF; + g0 = (256u) #VMOV(idxp[u64 (int)t64]); + + t64_1 = good; + t64_1 >>= 16; + t64_1 &= 0xFF; + l = #VMOV(idxp[u64 (int)t64_1]); + + t64_2 = good; + t64_2 >>= 8; + t64_2 &= 0xFF; + g1 = (256u) #VMOV(idxp[u64 (int)t64_2]); + + t64_3 = good; + t64_3 >>= 24; + t64_3 &= 0xFF; + h = #VMOV(idxp[u64 (int)t64_3]); + + g0 = #VINSERTI128(g0, l, 1); + + _, _, _, _, _, t64 = #POPCNT_64(t64); + _, _, _, _, _, t64_1 = #POPCNT_64(t64_1); + t64 += ctr; + + g1 = #VINSERTI128(g1, h, 1); + + t64_1 += t64; + _, _, _, _, _, 
t64_2 = #POPCNT_64(t64_2);
+    t64_2 += t64_1;
+    _, _, _, _, _, t64_3 = #POPCNT_64(t64_3);
+    t64_3 += t64_2;
+
+    g2 = #VPADD_32u8(g0, ones);
+    g0 = #VPUNPCKL_32u8(g0, g2);
+    g3 = #VPADD_32u8(g1, ones);
+    g1 = #VPUNPCKL_32u8(g1, g3);
+
+    f0 = #VPSHUFB_256(f0, g0);
+    f1 = #VPSHUFB_256(f1, g1);
+
+    rp.[u128 2*(int)ctr] = (128u)f0;
+    rp.[u128 2*(int)t64] = #VEXTRACTI128(f0, 1);
+    rp.[u128 2*(int)t64_1] = (128u)f1;
+    rp.[u128 2*(int)t64_2] = #VEXTRACTI128(f1, 1);
+
+    ctr = t64_3;
+
+    _, cf, _, _, zf = #CMP_64(ctr, KYBER_N - 32);
+    fl1 = #SETcc(cf || zf);
+
+    pos += 48;
+    _, cf, _, _, zf = #CMP_64(pos, REJ_UNIFORM_AVX_BUFLEN - 48);
+    fl2 = #SETcc(cf || zf);
+
+    _, _, _, _, b = #TEST_8(fl1, fl2);
+  }
+
+  _, cf, _, _, zf = #CMP_64(ctr, KYBER_N - 8);
+  fl1 = #SETcc(cf || zf);
+
+  _, cf, _, _, zf = #CMP_64(pos, REJ_UNIFORM_AVX_BUFLEN - 12);
+  fl2 = #SETcc(cf || zf);
+
+  _, _, _, _, b = #TEST_8(fl1, fl2);
+
+  t64 = 0x5555;
+  while(!b)
+  {
+    f = buf.[u128 (int)pos];
+    f = #VPSHUFB_128(f, idx8);
+    t = #VPSRL_8u16(f, 4);
+    f = #VPBLEND_8u16(f, t, 0xAA);
+    f = #VPAND_128(f, mask);
+
+    t = #VPCMPGT_8u16(bound, f);
+    good = #VPMOVMSKB_u128u64(t);
+
+    good = #PEXT_64(good, t64);
+    l = #VMOV(idxp[u64 (int)good]);
+    _, _, _, _, _, good = #POPCNT_64(good);
+
+    h = #VPADD_16u8(l, ones);
+    l = #VPUNPCKL_16u8(l, h);
+    f = #VPSHUFB_128(f, l);
+
+    rp.[u128 2*(int)ctr] = f;
+    ctr += good;
+
+    pos += 12;
+    _, cf, _, _, zf = #CMP_64(ctr, KYBER_N - 8);
+    fl1 = #SETcc(cf || zf);
+
+    _, cf, _, _, zf = #CMP_64(pos, REJ_UNIFORM_AVX_BUFLEN - 12);
+    fl2 = #SETcc(cf || zf);
+
+    _, _, _, _, b = #TEST_8(fl1, fl2);
+  }
+
+  _, cf, _, _, zf = #CMP_64(ctr, KYBER_N - 1);
+  fl1 = #SETcc(cf || zf);
+
+  _, cf, _, _, zf = #CMP_64(pos, REJ_UNIFORM_AVX_BUFLEN - 3);
+  fl2 = #SETcc(cf || zf);
+
+  _, _, _, _, b = #TEST_8(fl1, fl2);
+
+  while(!b)
+  {
+    val0 = (16u)buf[(int)pos];
+    pos += 1;
+    t16 = (16u)buf[(int)pos];
+    pos += 1;
+    val1 = t16;
+
+    t16 <<= 8;
+    val0 |= t16;
+    val0 &= 0xFFF;
+
+    val1 >>= 4;
+    t16 = (16u)buf[(int)pos];
+    pos += 1;
+    t16 <<= 4;
+    val1 |= t16;
+
+    if(val0 < KYBER_Q)
+    {
+      rp[(int)ctr] = val0;
+      ctr += 1;
+    }
+    if(val1 < KYBER_Q)
+    {
+      if(ctr < KYBER_N)
+      {
+        rp[(int)ctr] = val1;
+        ctr += 1;
+      }
+    }
+
+    _, cf, _, _, zf = #CMP_64(ctr, KYBER_N - 1);
+    fl1 = #SETcc(cf || zf); //SETBE
+
+    _, cf, _, _, zf = #CMP_64(pos, REJ_UNIFORM_AVX_BUFLEN - 3);
+    fl2 = #SETcc(cf || zf); //SETBE
+
+    _, _, _, _, b = #TEST_8(fl1, fl2);
+  }
+
+  return ctr, rp;
+}
+
+inline fn __r2s(reg u256 f) -> stack u256 {
+  stack u256 fs;
+  fs = f;
+  return fs;
+}
+
+inline fn __s2r(stack u256 fs) -> reg u256 {
+  reg u256 f;
+  f = fs;
+  return f;
+}
+
+inline
+fn __gen_matrix(stack u8[KYBER_SYMBYTES] seed, inline int transposed) -> stack u16[KYBER_K*KYBER_VECN]
+{
+  stack u8[REJ_UNIFORM_AVX_BUFLEN] buf0;
+  stack u8[REJ_UNIFORM_AVX_BUFLEN] buf1;
+  stack u8[REJ_UNIFORM_AVX_BUFLEN] buf2;
+  stack u8[REJ_UNIFORM_AVX_BUFLEN] buf3;
+  stack u256[25] state;
+  stack u16[KYBER_K*KYBER_VECN] rr;
+  stack u256 fs;
+  reg u256 f;
+  reg u64 ctr0 ctr1 ctr2 ctr3 tmp;
+  stack u64 ctr0_s;
+  reg u8 flg0 flg1 bflg;
+  reg bool cf zf;
+
+  inline int i, j;
+
+  f = seed[u256 0];
+  buf0[u256 0] = f;
+  buf1[u256 0] = f;
+  buf2[u256 0] = f;
+  buf3[u256 0] = f;
+  fs = __r2s(f);
+
+  if(transposed == 1)
+  {
+    buf0[KYBER_SYMBYTES] = 0;
+    buf0[KYBER_SYMBYTES+1] = 0;
+    buf1[KYBER_SYMBYTES] = 0;
+    buf1[KYBER_SYMBYTES+1] = 1;
+    buf2[KYBER_SYMBYTES] = 0;
+    buf2[KYBER_SYMBYTES+1] = 2;
+    buf3[KYBER_SYMBYTES] = 1;
+    buf3[KYBER_SYMBYTES+1] = 0;
+  }
+  else
+  {
+    buf0[KYBER_SYMBYTES] = 0;
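+    // Bytes 32-33 extend the seed with the XOF domain-separation indices:
+    // without transposition, entry (i,j) of A absorbs seed || j || i, so the
+    // four parallel SHAKE-128 lanes below yield entries (0,0), (0,1), (0,2)
+    // and (1,0) of the matrix.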
buf0[KYBER_SYMBYTES+1] = 0; + buf1[KYBER_SYMBYTES] = 1; + buf1[KYBER_SYMBYTES+1] = 0; + buf2[KYBER_SYMBYTES] = 2; + buf2[KYBER_SYMBYTES+1] = 0; + buf3[KYBER_SYMBYTES] = 0; + buf3[KYBER_SYMBYTES+1] = 1; + } + + state = _shake128_absorb4x_34(state, buf0[0:34], buf1[0:34], buf2[0:34], buf3[0:34]); + + if ( USE_SQUEEZE_N == 1 ) { + + state, buf0, buf1, buf2, buf3 = __shake128_squeezenblocks4x(state, buf0, buf1, buf2, buf3); + + if (USE_AVX2_REJECTION == 1) { + tmp, rr[0*KYBER_VECN+0*KYBER_N:KYBER_N] = _rej_uniform_avx(rr[0*KYBER_VECN+0*KYBER_N:KYBER_N], buf0); + ctr0 = tmp; + tmp, rr[0*KYBER_VECN+1*KYBER_N:KYBER_N] = _rej_uniform_avx(rr[0*KYBER_VECN+1*KYBER_N:KYBER_N], buf1); + ctr1 = tmp; + tmp, rr[0*KYBER_VECN+2*KYBER_N:KYBER_N] = _rej_uniform_avx(rr[0*KYBER_VECN+2*KYBER_N:KYBER_N], buf2); + ctr2 = tmp; + ctr3, rr[1*KYBER_VECN+0*KYBER_N:KYBER_N] = _rej_uniform_avx(rr[1*KYBER_VECN+0*KYBER_N:KYBER_N], buf3); + } else { + tmp, rr[0*KYBER_VECN+0*KYBER_N:KYBER_N] = _rej_uniformn(rr[0*KYBER_VECN+0*KYBER_N:KYBER_N], buf0); + ctr0 = tmp; + tmp, rr[0*KYBER_VECN+1*KYBER_N:KYBER_N] = _rej_uniformn(rr[0*KYBER_VECN+1*KYBER_N:KYBER_N], buf1); + ctr1 = tmp; + tmp, rr[0*KYBER_VECN+2*KYBER_N:KYBER_N] = _rej_uniformn(rr[0*KYBER_VECN+2*KYBER_N:KYBER_N], buf2); + ctr2 = tmp; + ctr3, rr[1*KYBER_VECN+0*KYBER_N:KYBER_N] = _rej_uniformn(rr[1*KYBER_VECN+0*KYBER_N:KYBER_N], buf3); + } + + _, cf, _, _, zf = #CMP_64(ctr0, KYBER_N - 1); + flg0 = #SETcc(cf || zf); //SETBE + + _, cf, _, _, zf = #CMP_64(ctr1, KYBER_N - 1); + flg1 = #SETcc(cf || zf); + + _, _, _, _, _, bflg = #OR_8(flg0, flg1); + + _, cf, _, _, zf = #CMP_64(ctr2, KYBER_N - 1); + flg0 = #SETcc(cf || zf); + + _, cf, _, _, zf = #CMP_64(ctr3, KYBER_N - 1); + flg1 = #SETcc(cf || zf); + + _, _, _, _, _, flg0 = #OR_8(flg0, flg1); + _, _, _, _, _, bflg = #OR_8(flg0, bflg); + + } + else + { + ctr0 = 0; + ctr1 = 0; + ctr2 = 0; + ctr3 = 0; + flg0 = 1; + bflg = 1; + } + + while(bflg != 0) { + state, buf0[0:SHAKE128_RATE], buf1[0:SHAKE128_RATE], buf2[0:SHAKE128_RATE], buf3[0:SHAKE128_RATE] = __shake128_squeezeblock4x(state, buf0[0:SHAKE128_RATE], buf1[0:SHAKE128_RATE], buf2[0:SHAKE128_RATE], buf3[0:SHAKE128_RATE]); + + ctr0, rr[0*KYBER_VECN+0*KYBER_N:KYBER_N] = __rej_uniform(rr[0*KYBER_VECN+0*KYBER_N:KYBER_N], ctr0, buf0[0:SHAKE128_RATE], SHAKE128_RATE); + ctr1, rr[0*KYBER_VECN+1*KYBER_N:KYBER_N] = __rej_uniform(rr[0*KYBER_VECN+1*KYBER_N:KYBER_N], ctr1, buf1[0:SHAKE128_RATE], SHAKE128_RATE); + ctr2, rr[0*KYBER_VECN+2*KYBER_N:KYBER_N] = __rej_uniform(rr[0*KYBER_VECN+2*KYBER_N:KYBER_N], ctr2, buf2[0:SHAKE128_RATE], SHAKE128_RATE); + ctr3, rr[1*KYBER_VECN+0*KYBER_N:KYBER_N] = __rej_uniform(rr[1*KYBER_VECN+0*KYBER_N:KYBER_N], ctr3, buf3[0:SHAKE128_RATE], SHAKE128_RATE); + + _, cf, _, _, zf = #CMP_64(ctr0, KYBER_N - 1); + flg0 = #SETcc(cf || zf); + + _, cf, _, _, zf = #CMP_64(ctr1, KYBER_N - 1); + flg1 = #SETcc(cf || zf); + + _, _, _, _, _, bflg = #OR_8(flg0, flg1); + + _, cf, _, _, zf = #CMP_64(ctr2, KYBER_N - 1); + flg0 = #SETcc(cf || zf); + + _, cf, _, _, zf = #CMP_64(ctr3, KYBER_N - 1); + flg1 = #SETcc(cf || zf); + + _, _, _, _, _, flg0 = #OR_8(flg0, flg1); + _, _, _, _, _, bflg = #OR_8(flg0, bflg); + } + + f = __s2r(fs); + buf0[u256 0] = f; + buf1[u256 0] = f; + buf2[u256 0] = f; + buf3[u256 0] = f; + fs = __r2s(f); + + if(transposed == 1) + { + buf0[KYBER_SYMBYTES] = 1; + buf0[KYBER_SYMBYTES+1] = 1; + buf1[KYBER_SYMBYTES] = 1; + buf1[KYBER_SYMBYTES+1] = 2; + buf2[KYBER_SYMBYTES] = 2; + buf2[KYBER_SYMBYTES+1] = 0; + buf3[KYBER_SYMBYTES] = 2; + buf3[KYBER_SYMBYTES+1] = 
1; + } + else + { + buf0[KYBER_SYMBYTES] = 1; + buf0[KYBER_SYMBYTES+1] = 1; + buf1[KYBER_SYMBYTES] = 2; + buf1[KYBER_SYMBYTES+1] = 1; + buf2[KYBER_SYMBYTES] = 0; + buf2[KYBER_SYMBYTES+1] = 2; + buf3[KYBER_SYMBYTES] = 1; + buf3[KYBER_SYMBYTES+1] = 2; + } + + state = _shake128_absorb4x_34(state, buf0[0:34], buf1[0:34], buf2[0:34], buf3[0:34]); + + if ( USE_SQUEEZE_N == 1 ) { + + state, buf0, buf1, buf2, buf3 = __shake128_squeezenblocks4x(state, buf0, buf1, buf2, buf3); + + if (USE_AVX2_REJECTION == 1) { + tmp, rr[1*KYBER_VECN+1*KYBER_N:KYBER_N] = _rej_uniform_avx(rr[1*KYBER_VECN+1*KYBER_N:KYBER_N], buf0); + ctr0 = tmp; + tmp, rr[1*KYBER_VECN+2*KYBER_N:KYBER_N] = _rej_uniform_avx(rr[1*KYBER_VECN+2*KYBER_N:KYBER_N], buf1); + ctr1 = tmp; + tmp, rr[2*KYBER_VECN+0*KYBER_N:KYBER_N] = _rej_uniform_avx(rr[2*KYBER_VECN+0*KYBER_N:KYBER_N], buf2); + ctr2 = tmp; + ctr3, rr[2*KYBER_VECN+1*KYBER_N:KYBER_N] = _rej_uniform_avx(rr[2*KYBER_VECN+1*KYBER_N:KYBER_N], buf3); + } else { + tmp, rr[1*KYBER_VECN+1*KYBER_N:KYBER_N] = _rej_uniformn(rr[1*KYBER_VECN+1*KYBER_N:KYBER_N], buf0); + ctr0 = tmp; + tmp, rr[1*KYBER_VECN+2*KYBER_N:KYBER_N] = _rej_uniformn(rr[1*KYBER_VECN+2*KYBER_N:KYBER_N], buf1); + ctr1 = tmp; + tmp, rr[2*KYBER_VECN+0*KYBER_N:KYBER_N] = _rej_uniformn(rr[2*KYBER_VECN+0*KYBER_N:KYBER_N], buf2); + ctr2 = tmp; + ctr3, rr[2*KYBER_VECN+1*KYBER_N:KYBER_N] = _rej_uniformn(rr[2*KYBER_VECN+1*KYBER_N:KYBER_N], buf3); + } + + _, cf, _, _, zf = #CMP_64(ctr0, KYBER_N - 1); + flg0 = #SETcc(cf || zf); + + _, cf, _, _, zf = #CMP_64(ctr1, KYBER_N - 1); + flg1 = #SETcc(cf || zf); + + _, _, _, _, _, bflg = #OR_8(flg0, flg1); + + _, cf, _, _, zf = #CMP_64(ctr2, KYBER_N - 1); + flg0 = #SETcc(cf || zf); + + _, cf, _, _, zf = #CMP_64(ctr3, KYBER_N - 1); + flg1 = #SETcc(cf || zf); + + _, _, _, _, _, flg0 = #OR_8(flg0, flg1); + _, _, _, _, _, bflg = #OR_8(flg0, bflg); + + } else { + ctr0 = 0; + ctr1 = 0; + ctr2 = 0; + ctr3 = 0; + flg0 = 1; + bflg = 1; + } + + + + while(bflg != 0) { + state, buf0[0:SHAKE128_RATE], buf1[0:SHAKE128_RATE], buf2[0:SHAKE128_RATE], buf3[0:SHAKE128_RATE] = __shake128_squeezeblock4x(state, buf0[0:SHAKE128_RATE], buf1[0:SHAKE128_RATE], buf2[0:SHAKE128_RATE], buf3[0:SHAKE128_RATE]); + + ctr0, rr[1*KYBER_VECN+1*KYBER_N:KYBER_N] = __rej_uniform(rr[1*KYBER_VECN+1*KYBER_N:KYBER_N], ctr0, buf0[0:SHAKE128_RATE], SHAKE128_RATE); + ctr1, rr[1*KYBER_VECN+2*KYBER_N:KYBER_N] = __rej_uniform(rr[1*KYBER_VECN+2*KYBER_N:KYBER_N], ctr1, buf1[0:SHAKE128_RATE], SHAKE128_RATE); + ctr2, rr[2*KYBER_VECN+0*KYBER_N:KYBER_N] = __rej_uniform(rr[2*KYBER_VECN+0*KYBER_N:KYBER_N], ctr2, buf2[0:SHAKE128_RATE], SHAKE128_RATE); + ctr3, rr[2*KYBER_VECN+1*KYBER_N:KYBER_N] = __rej_uniform(rr[2*KYBER_VECN+1*KYBER_N:KYBER_N], ctr3, buf3[0:SHAKE128_RATE], SHAKE128_RATE); + + _, cf, _, _, zf = #CMP_64(ctr0, KYBER_N - 1); + flg0 = #SETcc(cf || zf); + + _, cf, _, _, zf = #CMP_64(ctr1, KYBER_N - 1); + flg1 = #SETcc(cf || zf); + + _, _, _, _, _, bflg = #OR_8(flg0, flg1); + + _, cf, _, _, zf = #CMP_64(ctr2, KYBER_N - 1); + flg0 = #SETcc(cf || zf); + + _, cf, _, _, zf = #CMP_64(ctr3, KYBER_N - 1); + flg1 = #SETcc(cf || zf); + + _, _, _, _, _, flg0 = #OR_8(flg0, flg1); + _, _, _, _, _, bflg = #OR_8(flg0, bflg); + } + + f = __s2r(fs); + buf0[u256 0] = f; + buf0[KYBER_SYMBYTES] = 2; + buf0[KYBER_SYMBYTES+1] = 2; + + state[u64 0:25] = _shake128_absorb34(state[u64 0:25], buf0[0:34]); + + if ( USE_SQUEEZE_N == 1 ) { + + state[u64 0:25], buf0 = __shake128_squeezenblocks(state[u64 0:25], buf0); + + if (USE_AVX2_REJECTION == 1) { + ctr0, 
rr[2*KYBER_VECN+2*KYBER_N:KYBER_N] = _rej_uniform_avx(rr[2*KYBER_VECN+2*KYBER_N:KYBER_N], buf0);
+    } else {
+      ctr0, rr[2*KYBER_VECN+2*KYBER_N:KYBER_N] = _rej_uniformn(rr[2*KYBER_VECN+2*KYBER_N:KYBER_N], buf0);
+    }
+
+    _, cf, _, _, zf = #CMP_64(ctr0, KYBER_N - 1);
+    bflg = #SETcc(cf || zf);
+
+  } else {
+    ctr0 = 0;
+    bflg = 1;
+  }
+
+  while(bflg != 0) {
+    ctr0_s = ctr0;
+    state[u64 0:25], buf0[0:SHAKE128_RATE] = _shake128_squeezeblock(state[u64 0:25], buf0[0:SHAKE128_RATE]);
+    ctr0 = ctr0_s;
+
+    ctr0, rr[2*KYBER_VECN+2*KYBER_N:KYBER_N] = __rej_uniform(rr[2*KYBER_VECN+2*KYBER_N:KYBER_N], ctr0, buf0[0:SHAKE128_RATE], SHAKE128_RATE);
+
+    _, cf, _, _, zf = #CMP_64(ctr0, KYBER_N - 1);
+    bflg = #SETcc(cf || zf);
+  }
+
+  for i = 0 to KYBER_K
+  {
+    for j = 0 to KYBER_K
+    {
+      rr[i*KYBER_VECN+j*KYBER_N:KYBER_N] = _nttunpack(rr[i*KYBER_VECN+j*KYBER_N:KYBER_N]);
+    }
+  }
+
+  return rr;
+}
diff --git a/code/jasmin/mlkem_avx2/gen_matrix_old.jinc b/code/jasmin/mlkem_avx2/gen_matrix_old.jinc
new file mode 100644
index 00000000..184ac209
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/gen_matrix_old.jinc
@@ -0,0 +1,126 @@
+require "params.jinc"
+require "consts.jinc"
+require "shuffle.jinc"
+require "fips202.jinc"
+
+param int GENMATRIX_NBLOCKS = 3;
+param int REJ_UNIFORM_BUFLEN = GENMATRIX_NBLOCKS * SHAKE128_RATE;
+
+inline
+fn __rej_uniform_old(stack u16[KYBER_N] rp, reg u64 offset, stack u8[REJ_UNIFORM_BUFLEN] buf, inline int buflen) -> reg u64, stack u16[KYBER_N]
+{
+  reg u16 val0 val1;
+  reg u16 t;
+  reg u64 pos ctr;
+  reg u8 fl1 fl2;
+  reg bool cf zf b;
+
+  ctr = offset;
+  pos = 0;
+
+  _, cf, _, _, zf = #CMP_64(ctr, KYBER_N - 1);
+  fl1 = #SETcc(cf || zf); //SETBE
+
+  _, cf, _, _, zf = #CMP_64(pos, buflen - 3);
+  fl2 = #SETcc(cf || zf); //SETBE
+
+  _, _, _, _, b = #TEST_8(fl1, fl2);
+
+  while(!b)
+  {
+    val0 = (16u)buf[(int)pos];
+    pos += 1;
+
+    t = (16u)buf[(int)pos];
+    val1 = t;
+    val1 >>= 4;
+
+    t &= 0x0F;
+    t <<= 8;
+    val0 |= t;
+    pos += 1;
+
+    t = (16u)buf[(int)pos];
+    t <<= 4;
+    val1 |= t;
+    pos += 1;
+
+    if(val0 < KYBER_Q)
+    {
+      rp[(int)ctr] = val0;
+      ctr += 1;
+    }
+
+    if(ctr < KYBER_N)
+    {
+      if(val1 < KYBER_Q)
+      {
+        rp[(int)ctr] = val1;
+        ctr += 1;
+      }
+    }
+
+    _, cf, _, _, zf = #CMP_64(ctr, KYBER_N - 1);
+    fl1 = #SETcc(cf || zf); //SETBE
+
+    _, cf, _, _, zf = #CMP_64(pos, buflen - 3);
+    fl2 = #SETcc(cf || zf); //SETBE
+
+    _, _, _, _, b = #TEST_8(fl1, fl2);
+  }
+
+  return ctr, rp;
+}
+
+inline
+fn __gen_matrix_old(stack u8[KYBER_SYMBYTES] seed, inline int transposed) -> stack u16[KYBER_K*KYBER_VECN]
+{
+  stack u8[34] extseed;
+  stack u8[REJ_UNIFORM_BUFLEN] buf;
+  stack u64[25] state;
+  stack u16[KYBER_K*KYBER_VECN] rr;
+
+  reg u64 t64;
+  stack u64 t64_s;
+  inline int i, j, k;
+
+  for j = 0 to 4
+  {
+    t64 = seed[u64 j];
+    extseed[u64 j] = t64;
+  }
+
+  for i = 0 to KYBER_K
+  {
+    for j = 0 to KYBER_K
+    {
+      if(transposed == 0)
+      {
+        extseed[KYBER_SYMBYTES] = j;
+        extseed[KYBER_SYMBYTES+1] = i;
+      }
+      else
+      {
+        extseed[KYBER_SYMBYTES] = i;
+        extseed[KYBER_SYMBYTES+1] = j;
+      }
+
+      state = _shake128_absorb34(state, extseed);
+
+      state, buf = __shake128_squeezenblocks(state, buf);
+      t64 = 0;
+      t64, rr[i*KYBER_VECN+j*KYBER_N:KYBER_N] = __rej_uniform_old(rr[i*KYBER_VECN+j*KYBER_N:KYBER_N], t64, buf, REJ_UNIFORM_BUFLEN);
+
+      while (t64 < KYBER_N)
+      {
+        t64_s = t64;
+        state, buf[0:SHAKE128_RATE] = _shake128_squeezeblock(state, buf[0:SHAKE128_RATE]);
+        t64 = t64_s;
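+        // Squeeze one more SHAKE-128 block and continue rejection sampling
+        // until all KYBER_N coefficients are accepted; t64 is spilled to
+        // t64_s around the call, which does not preserve register values.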
+        t64, rr[i*KYBER_VECN+j*KYBER_N:KYBER_N] = __rej_uniform_old(rr[i*KYBER_VECN+j*KYBER_N:KYBER_N], t64, buf, SHAKE128_RATE);
+      }
+      rr[i*KYBER_VECN+j*KYBER_N:KYBER_N] = _nttunpack(rr[i*KYBER_VECN+j*KYBER_N:KYBER_N]);
+    }
+  }
+
+  return rr;
+}
diff --git a/code/jasmin/mlkem_avx2/indcpa.c b/code/jasmin/mlkem_avx2/indcpa.c
new file mode 100644
index 00000000..df90f7d8
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/indcpa.c
@@ -0,0 +1,320 @@
+#include <stdint.h>
+#include "indcpa.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "ntt.h"
+#include "symmetric.h"
+
+/*************************************************
+* Name:        pack_pk
+*
+* Description: Serialize the public key as concatenation of the
+*              serialized vector of polynomials pk
+*              and the public seed used to generate the matrix A.
+*
+* Arguments:   unsigned char *r:          pointer to the output serialized public key
+*              polyvec *pk:               pointer to the input public-key vector of polynomials
+*              const unsigned char *seed: pointer to the input public seed
+**************************************************/
+static void pack_pk(unsigned char *r, polyvec *pk, const unsigned char *seed)
+{
+  int i;
+  polyvec_tobytes(r, pk);
+  for(i=0;i<KYBER_SYMBYTES;i++)
+    r[i+KYBER_POLYVECBYTES] = seed[i];
+}
+
+/*************************************************
+* Name:        rej_uniform
+*
+* Description: Run rejection sampling on uniform random bytes to generate
+*              uniform random integers mod q
+*
+* Arguments:   - int16_t *r:               pointer to output buffer
+*              - unsigned int len:         requested number of 16-bit integers (uniform mod q)
+*              - const unsigned char *buf: pointer to input buffer (assumed to be uniform random bytes)
+*              - unsigned int buflen:      length of input buffer in bytes
+*
+* Returns number of sampled 16-bit integers (at most len)
+**************************************************/
+static unsigned int rej_uniform(int16_t *r, unsigned int len, const unsigned char *buf, unsigned int buflen)
+{
+  unsigned int ctr, pos;
+  uint16_t val1, val2;
+
+  ctr = pos = 0;
+  while(ctr < len && pos + 3 <= buflen)
+  {
+    val1 = (uint16_t)(buf[pos] | (((uint16_t)buf[pos+1] & 0x0F) << 8));
+    pos += 1;
+
+    val2 = (uint16_t)(((uint16_t)buf[pos] >> 4) | ((uint16_t)buf[pos+1] << 4));
+    pos += 2;
+
+    if(val1 < KYBER_Q)
+    {
+      r[ctr++] = (int16_t)val1;
+    }
+
+    if(val2 < KYBER_Q && ctr < len) {
+      r[ctr++] = (int16_t)val2;
+    }
+  }
+
+  return ctr;
+}
+
+#define gen_a(A,B)  gen_matrix(A,B,0)
+#define gen_at(A,B) gen_matrix(A,B,1)
+
+/*************************************************
+* Name:        gen_matrix
+*
+* Description: Deterministically generate matrix A (or the transpose of A)
+*              from a seed. Entries of the matrix are polynomials that look
+*              uniformly random. Performs rejection sampling on output of
+*              a XOF
+*
+* Arguments:   - polyvec *a:                pointer to output matrix A
+*              - const unsigned char *seed: pointer to input seed
+*              - int transposed:            boolean deciding whether A or A^T is generated
+**************************************************/
+static void gen_matrix(polyvec *a, const unsigned char *seed, int transposed) // Not static for benchmarking
+{
+  unsigned int ctr, i, j;
+  const unsigned int maxnblocks=(530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES; /* 530 is expected number of required bytes */
+  unsigned char buf[XOF_BLOCKBYTES*maxnblocks+1];
+  xof_state state;
+
+  for(i=0;i<KYBER_K;i++)
+  {
+    for(j=0;j<KYBER_K;j++)
+    {
+      if(transposed)
+        xof_absorb(&state, seed, i, j);
+      else
+        xof_absorb(&state, seed, j, i);
+
+      xof_squeezeblocks(buf, maxnblocks, &state);
+      ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, maxnblocks*XOF_BLOCKBYTES);
+
+      while(ctr < KYBER_N)
+      {
+        xof_squeezeblocks(buf, 1, &state);
+        ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, XOF_BLOCKBYTES);
+      }
+    }
+  }
+}
diff --git a/code/jasmin/mlkem_avx2/indcpa.h b/code/jasmin/mlkem_avx2/indcpa.h
new file mode 100644
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/indcpa.h
@@ -0,0 +1,36 @@
+#ifndef INDCPA_H
+#define INDCPA_H
+
+#include <stdint.h>
+
+void indcpa_keypair(unsigned char *pk,
+                    unsigned char *sk,
+                    const unsigned char *randomness);
+
+void indcpa_enc(unsigned char *c,
+                const unsigned char *m,
+                const unsigned char *pk,
+                const unsigned char *coins);
+
+void indcpa_dec(unsigned char *m,
+                const unsigned char *c,
+                const unsigned char *sk);
+
+void indcpa_keypair_jazz(unsigned char *pk,
+                         unsigned char *sk,
+                         const unsigned char *randomness);
+
+void indcpa_enc_jazz(unsigned char *c,
+                     const unsigned char *m,
+                     const unsigned char *pk,
+                     const unsigned char *coins);
+
+void indcpa_dec_jazz(unsigned char *m,
+                     const unsigned char *c,
+                     const unsigned char *sk);
+
+#endif
diff --git a/code/jasmin/mlkem_avx2/indcpa.jinc b/code/jasmin/mlkem_avx2/indcpa.jinc
new file mode 100644
index 00000000..d804e06a
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/indcpa.jinc
@@ -0,0 +1,245 @@
+require "params.jinc"
+require "poly.jinc"
+require "polyvec.jinc"
+require "gen_matrix.jinc"
+
+inline
+fn __indcpa_keypair(reg u64 pkp, reg u64 skp, reg ptr u8[KYBER_SYMBYTES] randomnessp)
+{
+  stack u64 spkp sskp;
+  stack u16[KYBER_K*KYBER_VECN] aa;
+  stack u16[KYBER_VECN] e pkpv skpv;
+  stack u8[64] buf;
+  stack u8[KYBER_SYMBYTES] publicseed noiseseed;
+  stack u8[32] inbuf;
+  reg u64 t64;
+  reg u8 nonce;
+  inline int i;
+
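+  // The pkp/skp output pointers are spilled to the stack (spkp/sskp) because
+  // the hashing, matrix and noise-generation calls below do not preserve
+  // register arguments; they are reloaded before serializing the results.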
spkp = pkp; + sskp = skp; + + for i=0 to KYBER_SYMBYTES/8 + { + t64 = randomnessp[u64 i]; + inbuf[u64 i] = t64; + } + + buf = _sha3_512_32(buf, inbuf); + + for i=0 to KYBER_SYMBYTES/8 + { + t64 = buf[u64 i]; + publicseed[u64 i] = t64; + t64 = buf[u64 i + KYBER_SYMBYTES/8]; + noiseseed[u64 i] = t64; + } + + aa = __gen_matrix(publicseed, 0); + + nonce = 0; + skpv[0:KYBER_N], skpv[KYBER_N:KYBER_N], skpv[2*KYBER_N:KYBER_N], e[0:KYBER_N] = _poly_getnoise_eta1_4x(skpv[0:KYBER_N], skpv[KYBER_N:KYBER_N], skpv[2*KYBER_N:KYBER_N], e[0:KYBER_N], noiseseed, nonce); + + nonce = 4; + e[KYBER_N:KYBER_N], e[2*KYBER_N:KYBER_N], pkpv[0:KYBER_N], pkpv[KYBER_N:KYBER_N] = _poly_getnoise_eta1_4x(e[KYBER_N:KYBER_N], e[2*KYBER_N:KYBER_N], pkpv[0:KYBER_N], pkpv[KYBER_N:KYBER_N], noiseseed, nonce); + + skpv = __polyvec_ntt(skpv); + e = __polyvec_ntt(e); + + + for i=0 to KYBER_K + { + pkpv[i*KYBER_N:KYBER_N] = __polyvec_pointwise_acc(pkpv[i*KYBER_N:KYBER_N], aa[i*KYBER_VECN:KYBER_VECN], skpv); + pkpv[i*KYBER_N:KYBER_N] = _poly_frommont(pkpv[i*KYBER_N:KYBER_N]); + } + + pkpv = __polyvec_add2(pkpv, e); + pkpv = __polyvec_reduce(pkpv); + + pkp = spkp; + skp = sskp; + + __polyvec_tobytes(skp, skpv); + __polyvec_tobytes(pkp, pkpv); + + pkp += KYBER_POLYVECBYTES; + + for i=0 to KYBER_SYMBYTES/8 + { + t64 = publicseed[u64 i]; + (u64)[pkp] = t64; + pkp += 8; + } +} + +inline +fn __indcpa_enc_0(stack u64 sctp, reg ptr u8[KYBER_INDCPA_MSGBYTES] msgp, reg u64 pkp, reg ptr u8[KYBER_SYMBYTES] noiseseed) +{ + stack u16[KYBER_VECN] pkpv sp ep bp; + stack u16[KYBER_K*KYBER_VECN] aat; + stack u16[KYBER_N] k epp v; + stack u8[KYBER_SYMBYTES] publicseed; + stack ptr u8[KYBER_SYMBYTES] s_noiseseed; + reg ptr u8[KYBER_SYMBYTES] lnoiseseed; + reg u64 i t64 ctp; + reg u8 nonce; + inline int w; + + pkpv = __polyvec_frombytes(pkp); + + i = 0; + pkp += KYBER_POLYVECBYTES; + while (i < KYBER_SYMBYTES/8) + { + t64 = (u64)[pkp]; + publicseed.[u64 8 * (int)i] = t64; + pkp += 8; + i += 1; + } + + k = _poly_frommsg_1(k, msgp); + + s_noiseseed = noiseseed; + aat = __gen_matrix(publicseed, 1); + lnoiseseed = s_noiseseed; + + nonce = 0; + sp[0:KYBER_N], sp[KYBER_N:KYBER_N], sp[2*KYBER_N:KYBER_N], ep[0:KYBER_N] = _poly_getnoise_eta1_4x(sp[0:KYBER_N], sp[KYBER_N:KYBER_N], sp[2*KYBER_N:KYBER_N], ep[0:KYBER_N], lnoiseseed, nonce); + + nonce = 4; + ep[KYBER_N:KYBER_N], ep[2*KYBER_N:KYBER_N], epp, bp[0:KYBER_N] = _poly_getnoise_eta1_4x(ep[KYBER_N:KYBER_N], ep[2*KYBER_N:KYBER_N], epp, bp[0:KYBER_N], lnoiseseed, nonce); + + sp = __polyvec_ntt(sp); + + for w=0 to KYBER_K + { + bp[w*KYBER_N:KYBER_N] = __polyvec_pointwise_acc(bp[w*KYBER_N:KYBER_N], aat[w*KYBER_VECN:KYBER_VECN], sp); + } + + v = __polyvec_pointwise_acc(v, pkpv, sp); + + bp = __polyvec_invntt(bp); + v = _poly_invntt(v); + + bp = __polyvec_add2(bp, ep); + v = _poly_add2(v, epp); + v = _poly_add2(v, k); + bp = __polyvec_reduce(bp); + v = __poly_reduce(v); + + ctp = sctp; + __polyvec_compress(ctp, bp); + ctp += KYBER_POLYVECCOMPRESSEDBYTES; + v = _poly_compress(ctp, v); +} + +inline +fn __indcpa_enc_1(reg ptr u8[KYBER_INDCPA_CIPHERTEXTBYTES] ctp, reg ptr u8[KYBER_INDCPA_MSGBYTES] msgp, reg u64 pkp, reg ptr u8[KYBER_SYMBYTES] noiseseed) -> reg ptr u8[KYBER_INDCPA_CIPHERTEXTBYTES] +{ + stack u16[KYBER_VECN] pkpv sp ep bp; + stack u16[KYBER_K*KYBER_VECN] aat; + stack u16[KYBER_N] k epp v; + stack u8[KYBER_SYMBYTES] publicseed; + stack ptr u8[KYBER_SYMBYTES] s_noiseseed; + reg ptr u8[KYBER_SYMBYTES] lnoiseseed; + stack ptr u8[KYBER_INDCPA_CIPHERTEXTBYTES] sctp; + reg u64 i t64; + reg u8 nonce; + inline 
int w; + + sctp = ctp; + + pkpv = __polyvec_frombytes(pkp); + + i = 0; + pkp += KYBER_POLYVECBYTES; + while (i < KYBER_SYMBYTES/8) + { + t64 = (u64)[pkp]; + publicseed.[u64 8*(int)i] = t64; + pkp += 8; + i += 1; + } + + k = _poly_frommsg_1(k, msgp); + + s_noiseseed = noiseseed; + aat = __gen_matrix(publicseed, 1); + lnoiseseed = s_noiseseed; + + nonce = 0; + sp[0:KYBER_N], sp[KYBER_N:KYBER_N], sp[2*KYBER_N:KYBER_N], ep[0:KYBER_N] = _poly_getnoise_eta1_4x(sp[0:KYBER_N], sp[KYBER_N:KYBER_N], sp[2*KYBER_N:KYBER_N], ep[0:KYBER_N], lnoiseseed, nonce); + + nonce = 4; + ep[KYBER_N:KYBER_N], ep[2*KYBER_N:KYBER_N], epp, bp[0:KYBER_N] = _poly_getnoise_eta1_4x(ep[KYBER_N:KYBER_N], ep[2*KYBER_N:KYBER_N], epp, bp[0:KYBER_N], lnoiseseed, nonce); + + sp = __polyvec_ntt(sp); + + for w=0 to KYBER_K + { + bp[w*KYBER_N:KYBER_N] = __polyvec_pointwise_acc(bp[w*KYBER_N:KYBER_N], aat[w*KYBER_VECN:KYBER_VECN], sp); + } + + v = __polyvec_pointwise_acc(v, pkpv, sp); + + bp = __polyvec_invntt(bp); + v = _poly_invntt(v); + + bp = __polyvec_add2(bp, ep); + v = _poly_add2(v, epp); + v = _poly_add2(v, k); + bp = __polyvec_reduce(bp); + v = __poly_reduce(v); + + ctp = sctp; + ctp[0:KYBER_POLYVECCOMPRESSEDBYTES] = __polyvec_compress_1(ctp[0:KYBER_POLYVECCOMPRESSEDBYTES], bp); + ctp[KYBER_POLYVECCOMPRESSEDBYTES:KYBER_POLYCOMPRESSEDBYTES], v = _poly_compress_1(ctp[KYBER_POLYVECCOMPRESSEDBYTES:KYBER_POLYCOMPRESSEDBYTES], v); + + return ctp; +} + +inline +fn __indcpa_dec_0(reg u64 msgp, reg u64 ctp, reg u64 skp) +{ + stack u16[KYBER_N] t v mp; + stack u16[KYBER_VECN] bp skpv; + + bp = __polyvec_decompress(ctp); + ctp += KYBER_POLYVECCOMPRESSEDBYTES; + v = _poly_decompress(v, ctp); + + skpv = __polyvec_frombytes(skp); + + bp = __polyvec_ntt(bp); + t = __polyvec_pointwise_acc(t, skpv, bp); + t = _poly_invntt(t); + + mp = _poly_sub(mp, v, t); + mp = __poly_reduce(mp); + + mp = _poly_tomsg(msgp, mp); +} + +inline +fn __indcpa_dec_1(reg ptr u8[KYBER_INDCPA_MSGBYTES] msgp, reg u64 ctp, reg u64 skp) -> reg ptr u8[KYBER_INDCPA_MSGBYTES] +{ + stack u16[KYBER_N] t v mp; + stack u16[KYBER_VECN] bp skpv; + + bp = __polyvec_decompress(ctp); + ctp += KYBER_POLYVECCOMPRESSEDBYTES; + v = _poly_decompress(v, ctp); + + skpv = __polyvec_frombytes(skp); + + bp = __polyvec_ntt(bp); + t = __polyvec_pointwise_acc(t, skpv, bp); + t = _poly_invntt(t); + + mp = _poly_sub(mp, v, t); + mp = __poly_reduce(mp); + + msgp, mp = _poly_tomsg_1(msgp, mp); + + return msgp; +} diff --git a/code/jasmin/mlkem_avx2/jbench.sh b/code/jasmin/mlkem_avx2/jbench.sh new file mode 100755 index 00000000..c7b01104 --- /dev/null +++ b/code/jasmin/mlkem_avx2/jbench.sh @@ -0,0 +1,20 @@ +#!/bin/bash +#exec compile.bench + +ulimit -s 50000 + +rm compile.bench + +for arg in -until_typing -until_cstexp -until_inline -until_rmfunc -until_unroll -until_splitting -until_valloc -until_vallocd -until_vshare -until_vshared -until_arrexp -until_rmarrinit -until_rmglobals -until_arrexp -until_makeref -until_lowering -until_stkalloc -until_ralloc -until_rallocd -until_linear -until_asm; do + + echo "=====================================================" >> compile.bench + echo "===== Benchmark with flag $arg" >> compile.bench + echo "=====================================================" >> compile.bench + + make clean + export JADDFLAGS=$arg + #(time make jindcpa.s) 2>compile.bench + (time make jpolyvec.s 2>&1) 2>>compile.bench +done + + diff --git a/code/jasmin/mlkem_avx2/jfips202.jazz b/code/jasmin/mlkem_avx2/jfips202.jazz new file mode 100644 index 00000000..7735695c --- /dev/null +++ 
b/code/jasmin/mlkem_avx2/jfips202.jazz @@ -0,0 +1,102 @@ +require "fips202.jinc" + +export fn shake256_128_33_jazz(reg u64 outp inp) +{ + stack u8[33] in; + stack u8[128] out; + stack u64 soutp; + reg u8 c; + inline int i; + + for i = 0 to 33 { + c = (u8)[inp + i]; + in[i] = c; + } + + soutp = outp; + out = _shake256_128_33(out, in); + outp = soutp; + + for i = 0 to 128 { + c = out[i]; + (u8)[outp + i] = c; + } +} + +export fn sha3_512_32_jazz(reg u64 outp inp) +{ + stack u8[32] in; + stack u8[64] out; + stack u64 soutp; + reg u8 c; + inline int i; + + for i = 0 to 32 { + c = (u8)[inp + i]; + in[i] = c; + } + + soutp = outp; + out = _sha3_512_32(out, in); + outp = soutp; + for i = 0 to 64 { + c = out[i]; + (u8)[outp + i] = c; + } +} + + +export fn shake128_absorb34_jazz(reg u64 statep, reg u64 inp) +{ + stack u64[25] state; + stack u8[34] in; + reg u8 c; + reg u64 t; + inline int i; + + for i = 0 to 34 { + c = (u8)[inp + i]; + in[i] = c; + } + + state = _shake128_absorb34(state, in); + + for i = 0 to 25 { + t = state[i]; + [statep + 8*i] = t; + } +} + +export fn shake128_squeezeblock_jazz(reg u64 outp, reg u64 statep) +{ + stack u64[25] state; + stack u8[SHAKE128_RATE] out; + reg u8 c; + reg u64 t; + inline int i; + stack u64 soutp; + stack u64 sstatep; + + for i = 0 to 25 { + t = [statep + 8*i]; + state[i] = t; + } + + soutp = outp; + sstatep = statep; + + state, out = _shake128_squeezeblock(state, out); + + outp = soutp; + statep = sstatep; + + for i = 0 to 25 { + t = state[i]; + [statep + 8*i] = t; + } + + for i = 0 to SHAKE128_RATE { + c = out[i]; + (u8)[outp + i] = c; + } +} diff --git a/code/jasmin/mlkem_avx2/jindcpa.jazz b/code/jasmin/mlkem_avx2/jindcpa.jazz new file mode 100644 index 00000000..7c6d20c5 --- /dev/null +++ b/code/jasmin/mlkem_avx2/jindcpa.jazz @@ -0,0 +1,94 @@ +require "indcpa.jinc" +require "params.jinc" +require "poly.jinc" +require "polyvec.jinc" +require "gen_matrix.jinc" + + +export fn indcpa_keypair_jazz(reg u64 pkp, reg u64 skp, reg u64 coins) +{ + stack u8[KYBER_SYMBYTES] randomness; + reg ptr u8[KYBER_SYMBYTES] randomnessp; + inline int i; + + randomnessp = randomness; + for i = 0 to KYBER_SYMBYTES { + randomnessp[i] = (u8)[coins + i]; + } + __indcpa_keypair(pkp, skp, randomnessp); +} + + +export fn indcpa_enc_jazz(reg u64 ctp, reg u64 msgp, reg u64 pkp, reg u64 coinsp) +{ + stack u16[KYBER_VECN] pkpv sp ep bp; + stack u16[KYBER_K*KYBER_VECN] aat; + stack u16[KYBER_N] k epp v; + stack u8[KYBER_SYMBYTES] publicseed; + stack u8[KYBER_SYMBYTES] noiseseed; + reg u64 i; + reg u8 c nonce; + stack u64 sctp; + + sctp = ctp; + + i = 0; + while (i < KYBER_SYMBYTES) + { + c = (u8)[coinsp+i]; + noiseseed[(int)i] = c; + i += 1; + } + + pkpv = __polyvec_frombytes(pkp); + + i = 0; + pkp += KYBER_POLYVECBYTES; + while (i < KYBER_SYMBYTES) + { + c = (u8)[pkp]; + publicseed[(int)i] = c; + pkp += 1; + i += 1; + } + + k = _poly_frommsg(k, msgp); + + aat = __gen_matrix(publicseed, 1); + + nonce = 0; + sp[0:KYBER_N], sp[KYBER_N:KYBER_N], sp[2*KYBER_N:KYBER_N], ep[0:KYBER_N] = _poly_getnoise_eta1_4x(sp[0:KYBER_N], sp[KYBER_N:KYBER_N], sp[2*KYBER_N:KYBER_N], ep[0:KYBER_N], noiseseed, nonce); + + nonce = 4; + ep[KYBER_N:KYBER_N], ep[2*KYBER_N:KYBER_N], epp, bp[0:KYBER_N] = _poly_getnoise_eta1_4x(ep[KYBER_N:KYBER_N], ep[2*KYBER_N:KYBER_N], epp, bp[0:KYBER_N], noiseseed, nonce); + + + sp = __polyvec_ntt(sp); + + bp[0:KYBER_N] = __polyvec_pointwise_acc(bp[0:KYBER_N], aat[0:KYBER_VECN], sp); + bp[KYBER_N:KYBER_N]= __polyvec_pointwise_acc(bp[KYBER_N:KYBER_N], aat[KYBER_VECN:KYBER_VECN], 
sp); + bp[2*KYBER_N:KYBER_N] = __polyvec_pointwise_acc(bp[2*KYBER_N:KYBER_N], aat[2*KYBER_VECN:KYBER_VECN], sp); + + v = __polyvec_pointwise_acc(v, pkpv, sp); + + bp = __polyvec_invntt(bp); + v = _poly_invntt(v); + + bp = __polyvec_add2(bp, ep); + v = _poly_add2(v, epp); + v = _poly_add2(v, k); + bp = __polyvec_reduce(bp); + v = __poly_reduce(v); + + ctp = sctp; + __polyvec_compress(ctp, bp); + ctp += KYBER_POLYVECCOMPRESSEDBYTES; + v = _poly_compress(ctp, v); +} + + + +export fn indcpa_dec_jazz(reg u64 msgp, reg u64 ctp, reg u64 skp) +{ + __indcpa_dec_0(msgp, ctp, skp); +} diff --git a/code/jasmin/mlkem_avx2/jkem.jazz b/code/jasmin/mlkem_avx2/jkem.jazz new file mode 100644 index 00000000..7519e83c --- /dev/null +++ b/code/jasmin/mlkem_avx2/jkem.jazz @@ -0,0 +1,87 @@ +require "kem.jinc" + +export fn jade_kem_kyber_kyber768_amd64_avx2v_keypair_derand(reg u64 public_key secret_key coins) -> reg u64 +{ + reg u64 r; + stack u8[KYBER_SYMBYTES*2] randomness; + reg ptr u8[KYBER_SYMBYTES*2] randomnessp; + inline int i; + + public_key = public_key; + secret_key = secret_key; + + for i = 0 to KYBER_SYMBYTES*2 + { + randomness[i] = (u8)[coins + i]; + } + + randomnessp = randomness; + + __crypto_kem_keypair_jazz(public_key, secret_key, randomnessp); + ?{}, r = #set0(); + return r; +} + +export fn jade_kem_kyber_kyber768_amd64_avx2v_enc_derand(reg u64 ciphertext shared_secret public_key coins) -> reg u64 +{ + reg u64 r; + stack u8[KYBER_SYMBYTES] randomness; + reg ptr u8[KYBER_SYMBYTES] randomnessp; + inline int i; + + ciphertext = ciphertext; + shared_secret = shared_secret; + public_key = public_key; + + for i = 0 to KYBER_SYMBYTES + { + randomness[i] = (u8)[coins + i]; + } + + randomnessp = randomness; + + __crypto_kem_enc_jazz(ciphertext, shared_secret, public_key, randomnessp); + ?{}, r = #set0(); + return r; +} + +export fn jade_kem_kyber_kyber768_amd64_avx2v_keypair(reg u64 public_key secret_key) -> reg u64 +{ + reg u64 r; + stack u8[KYBER_SYMBYTES*2] randomness; + reg ptr u8[KYBER_SYMBYTES*2] randomnessp; + + public_key = public_key; + secret_key = secret_key; + + randomnessp = randomness; + randomnessp = #randombytes(randomnessp); + __crypto_kem_keypair_jazz(public_key, secret_key, randomnessp); + ?{}, r = #set0(); + return r; +} + +export fn jade_kem_kyber_kyber768_amd64_avx2v_enc(reg u64 ciphertext shared_secret public_key) -> reg u64 +{ + reg u64 r; + stack u8[KYBER_SYMBYTES] randomness; + reg ptr u8[KYBER_SYMBYTES] randomnessp; + + ciphertext = ciphertext; + shared_secret = shared_secret; + public_key = public_key; + + randomnessp = randomness; + randomnessp = #randombytes(randomnessp); + __crypto_kem_enc_jazz(ciphertext, shared_secret, public_key, randomnessp); + ?{}, r = #set0(); + return r; +} + +export fn jade_kem_kyber_kyber768_amd64_avx2v_dec(reg u64 shared_secret ciphertext secret_key) -> reg u64 +{ + reg u64 r; + __crypto_kem_dec_jazz(shared_secret, ciphertext, secret_key); + ?{}, r = #set0(); + return r; +} diff --git a/code/jasmin/mlkem_avx2/jpoly.jazz b/code/jasmin/mlkem_avx2/jpoly.jazz new file mode 100644 index 00000000..4a352883 --- /dev/null +++ b/code/jasmin/mlkem_avx2/jpoly.jazz @@ -0,0 +1,316 @@ +require "params.jinc" +require "poly.jinc" + +/* These exported functions are just for unit testing */ + +export fn poly_compress_jazz(reg u64 rp, reg u64 ap) +{ + stack u16[KYBER_N] a; + reg u16 t; + inline int i; + + for i = 0 to KYBER_N { + t = (u16)[ap + 2*i]; + a[i] = t; + } + + a = _poly_compress(rp, a); +} + +export fn poly_decompress_jazz(reg u64 rp, reg u64 ap) +{ + 
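+  // Test wrapper: _poly_decompress reads the compressed encoding at ap
+  // straight from memory, so only the resulting coefficients are copied
+  // back out through rp.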
stack u16[KYBER_N] r; + reg u16 t; + inline int i; + + r = _poly_decompress(r, ap); + + for i = 0 to KYBER_N { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + +export fn poly_tobytes_jazz(reg u64 rp, reg u64 ap) +{ + stack u16[KYBER_N] a; + reg u16 t; + inline int i; + + for i = 0 to KYBER_N { + t = (u16)[ap + 2*i]; + a[i] = t; + } + + a = _nttunpack(a); + a = _poly_tobytes(rp, a); +} + +export fn poly_frombytes_jazz(reg u64 rp, reg u64 ap) +{ + stack u16[KYBER_N] r; + reg u16 t; + inline int i; + + r = _poly_frombytes(r, ap); + + r = _nttpack(r); + + for i = 0 to KYBER_N { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + +export fn poly_tomsg_jazz(reg u64 rp, reg u64 ap) +{ + stack u16[KYBER_N] a; + reg u16 t; + inline int i; + + for i = 0 to KYBER_N { + t = (u16)[ap + 2*i]; + a[i] = t; + } + + a = _poly_tomsg(rp, a); +} + +export fn poly_frommsg_jazz(reg u64 rp, reg u64 ap) +{ + stack u16[KYBER_N] r; + reg u16 t; + inline int i; + + r = _poly_frommsg(r, ap); + + for i = 0 to KYBER_N { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + + +export fn poly_add2_jazz(reg u64 rp, reg u64 bp) +{ + stack u16[KYBER_N] r; + stack u16[KYBER_N] b; + reg u16 t; + inline int i; + + for i = 0 to KYBER_N { + t = (u16)[rp + 2*i]; + r[i] = t; + t = (u16)[bp + 2*i]; + b[i] = t; + } + + r = _poly_add2(r, b); + + for i = 0 to KYBER_N { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + +export fn poly_sub_jazz(reg u64 rp, reg u64 ap, reg u64 bp) +{ + stack u16[KYBER_N] a; + stack u16[KYBER_N] b; + stack u16[KYBER_N] r; + reg u16 t; + inline int i; + + for i = 0 to KYBER_N { + t = (u16)[ap + 2*i]; + a[i] = t; + t = (u16)[bp + 2*i]; + b[i] = t; + } + + r = _poly_sub(r, a, b); + + for i = 0 to KYBER_N { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + +export fn poly_ntt_jazz(reg u64 rp) +{ + stack u16[KYBER_N] r; + reg u16 t; + inline int i; + + for i = 0 to KYBER_N { + t = (u16)[rp + 2*i]; + r[i] = t; + } + + r = _poly_ntt(r); + + r = _nttpack(r); + + for i = 0 to KYBER_N { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + +export fn poly_invntt_jazz(reg u64 rp) +{ + stack u16[KYBER_N] r; + reg u16 t; + inline int i; + + for i = 0 to KYBER_N { + t = (u16)[rp + 2*i]; + r[i] = t; + } + + r = _nttunpack(r); + + r = _poly_invntt(r); + + for i = 0 to KYBER_N { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + +export fn poly_basemul_jazz(reg u64 rp, reg u64 ap, reg u64 bp) +{ + stack u16[KYBER_N] a; + stack u16[KYBER_N] b; + stack u16[KYBER_N] r; + reg u16 t; + inline int i; + + for i = 0 to KYBER_N { + t = (u16)[ap + 2*i]; + a[i] = t; + t = (u16)[bp + 2*i]; + b[i] = t; + t = (u16)[rp + 2*i]; + r[i] = t; + } + + a = _nttunpack(a); + b = _nttunpack(b); + r = _poly_basemul(r, a, b); + r = _nttpack(r); + + for i = 0 to KYBER_N { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + +export fn poly_frommont_jazz(reg u64 rp) +{ + stack u16[KYBER_N] r; + reg u16 t; + inline int i; + + for i = 0 to KYBER_N { + t = (u16)[rp + 2*i]; + r[i] = t; + } + + r = _poly_frommont(r); + + for i = 0 to KYBER_N { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + + +export fn poly_getnoise_eta1_4x_jazz(reg u64 rp, reg u64 seedp, reg u8 nonce) +{ + stack u16[4 * KYBER_N] r; + stack u8[KYBER_SYMBYTES] seed; + stack u64 srp; + reg u16 t; + reg u8 d; + inline int i; + + srp = rp; + + for i = 0 to KYBER_SYMBYTES { + d = (u8)[seedp + i]; + seed[i] = d; + } + + r[0:KYBER_N], r[KYBER_N:KYBER_N], r[2*KYBER_N:KYBER_N], r[3*KYBER_N:KYBER_N] = _poly_getnoise_eta1_4x(r[0:KYBER_N], r[KYBER_N:KYBER_N], r[2*KYBER_N:KYBER_N], r[3*KYBER_N:KYBER_N], seed, nonce); + + rp = srp; + for i = 0 to 4*KYBER_N { + t = 
r[i]; + (u16)[rp + 2*i] = t; + } +} + +export fn poly_getnoise_eta1122_4x_jazz(reg u64 rp, reg u64 seedp, reg u8 nonce) +{ + stack u16[4 * KYBER_N] r; + stack u8[KYBER_SYMBYTES] seed; + stack u64 srp; + reg u16 t; + reg u8 d; + inline int i; + + srp = rp; + + for i = 0 to KYBER_SYMBYTES { + d = (u8)[seedp + i]; + seed[i] = d; + } + + r[0:KYBER_N], r[KYBER_N:KYBER_N], r[2*KYBER_N:KYBER_N], r[3*KYBER_N:KYBER_N] = _poly_getnoise_eta1122_4x(r[0:KYBER_N], r[KYBER_N:KYBER_N], r[2*KYBER_N:KYBER_N], r[3*KYBER_N:KYBER_N], seed, nonce); + + rp = srp; + for i = 0 to 4*KYBER_N { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + + +export fn poly_reduce_jazz(reg u64 rp) +{ + stack u16[KYBER_N] r; + reg u16 t; + inline int i; + + for i = 0 to KYBER_N { + t = (u16)[rp + 2*i]; + r[i] = t; + } + + r = __poly_reduce(r); + + for i = 0 to KYBER_N { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + +export fn poly_csubq_jazz(reg u64 rp) +{ + stack u16[KYBER_N] r; + reg u16 t; + inline int i; + + for i = 0 to KYBER_N { + t = (u16)[rp + 2*i]; + r[i] = t; + } + + r = _poly_csubq(r); + + for i = 0 to KYBER_N { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} diff --git a/code/jasmin/mlkem_avx2/jpolyvec.jazz b/code/jasmin/mlkem_avx2/jpolyvec.jazz new file mode 100644 index 00000000..4407b7e6 --- /dev/null +++ b/code/jasmin/mlkem_avx2/jpolyvec.jazz @@ -0,0 +1,211 @@ +require "params.jinc" +require "polyvec.jinc" + +/* These exported functions are just for unit testing */ + +export fn polyvec_tobytes_jazz(reg u64 rp, reg u64 ap) +{ + stack u16[KYBER_VECN] a; + reg u16 t; + inline int i; + + for i = 0 to KYBER_VECN { + t = (u16)[ap + 2*i]; + a[i] = t; + } + + a[0:KYBER_N] = _nttunpack(a[0:KYBER_N]); + a[KYBER_N:KYBER_N] = _nttunpack(a[KYBER_N:KYBER_N]); + a[2*KYBER_N:KYBER_N] = _nttunpack(a[2*KYBER_N:KYBER_N]); + + __polyvec_tobytes(rp, a); +} + + +export fn polyvec_decompress_jazz(reg u64 rp, reg u64 ap) +{ + stack u16[KYBER_VECN] r; + reg u16 t; + inline int i; + + r = __polyvec_decompress(ap); + + for i = 0 to KYBER_VECN { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + + +export fn polyvec_compress_jazz(reg u64 rp, reg u64 ap) +{ + stack u16[KYBER_VECN] a; + reg u16 t; + inline int i; + + for i = 0 to KYBER_VECN { + t = (u16)[ap + 2*i]; + a[i] = t; + } + + __polyvec_compress(rp, a); +} + + +export fn polyvec_frombytes_jazz(reg u64 rp, reg u64 ap) +{ + stack u16[KYBER_VECN] r; + reg u16 t; + inline int i; + + r = __polyvec_frombytes(ap); + + r[0:KYBER_N] = _nttpack(r[0:KYBER_N]); + r[KYBER_N:KYBER_N] = _nttpack(r[KYBER_N:KYBER_N]); + r[2*KYBER_N:KYBER_N] = _nttpack(r[2*KYBER_N:KYBER_N]); + + for i = 0 to KYBER_VECN { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + +export fn polyvec_add2_jazz(reg u64 rp, reg u64 bp) +{ + stack u16[KYBER_VECN] a; + stack u16[KYBER_VECN] b; + stack u16[KYBER_VECN] r; + reg u16 t; + inline int i; + + for i = 0 to KYBER_VECN { + t = (u16)[rp + 2*i]; + a[i] = t; + t = (u16)[bp + 2*i]; + b[i] = t; + } + + r = __polyvec_add2(a, b); + + for i = 0 to KYBER_VECN { + t = r[i]; + (u16)[rp + 2*i] = t; + } +} + +export fn polyvec_pointwise_acc_jazz(reg u64 rp, reg u64 ap, reg u64 bp) +{ + stack u16[KYBER_VECN] a; + stack u16[KYBER_VECN] b; + stack u16[KYBER_N] r; + reg u16 t; + inline int i; + + for i = 0 to KYBER_VECN { + t = (u16)[ap + 2*i]; + a[i] = t; + t = (u16)[bp + 2*i]; + b[i] = t; + } + + a[0:KYBER_N] = _nttunpack(a[0:KYBER_N]); + a[KYBER_N:KYBER_N] = _nttunpack(a[KYBER_N:KYBER_N]); + a[2*KYBER_N:KYBER_N] = _nttunpack(a[2*KYBER_N:KYBER_N]); + + b[0:KYBER_N] = _nttunpack(b[0:KYBER_N]); + b[KYBER_N:KYBER_N] = 
_nttunpack(b[KYBER_N:KYBER_N]);
+  b[2*KYBER_N:KYBER_N] = _nttunpack(b[2*KYBER_N:KYBER_N]);
+
+  r = __polyvec_pointwise_acc(r, a, b);
+
+  r = _nttpack(r);
+
+  for i = 0 to KYBER_N {
+    t = r[i];
+    (u16)[rp + 2*i] = t;
+  }
+}
+
+export fn polyvec_ntt_jazz(reg u64 rp)
+{
+  stack u16[KYBER_VECN] r;
+  reg u16 t;
+  inline int i;
+
+  for i = 0 to KYBER_VECN {
+    t = (u16)[rp + 2*i];
+    r[i] = t;
+  }
+
+  r = __polyvec_ntt(r);
+
+  r[0:KYBER_N] = _nttpack(r[0:KYBER_N]);
+  r[KYBER_N:KYBER_N] = _nttpack(r[KYBER_N:KYBER_N]);
+  r[2*KYBER_N:KYBER_N] = _nttpack(r[2*KYBER_N:KYBER_N]);
+
+  for i = 0 to KYBER_VECN {
+    t = r[i];
+    (u16)[rp + 2*i] = t;
+  }
+}
+
+export fn polyvec_invntt_jazz(reg u64 rp)
+{
+  stack u16[KYBER_VECN] r;
+  reg u16 t;
+  inline int i;
+
+  for i = 0 to KYBER_VECN {
+    t = (u16)[rp + 2*i];
+    r[i] = t;
+  }
+
+  r[0:KYBER_N] = _nttunpack(r[0:KYBER_N]);
+  r[KYBER_N:KYBER_N] = _nttunpack(r[KYBER_N:KYBER_N]);
+  r[2*KYBER_N:KYBER_N] = _nttunpack(r[2*KYBER_N:KYBER_N]);
+
+  r = __polyvec_invntt(r);
+
+  for i = 0 to KYBER_VECN {
+    t = r[i];
+    (u16)[rp + 2*i] = t;
+  }
+}
+
+export fn polyvec_csubq_jazz(reg u64 rp)
+{
+  stack u16[KYBER_VECN] r;
+  reg u16 t;
+  inline int i;
+
+  for i = 0 to KYBER_VECN {
+    t = (u16)[rp + 2*i];
+    r[i] = t;
+  }
+
+  r = __polyvec_csubq(r);
+
+  for i = 0 to KYBER_VECN {
+    t = r[i];
+    (u16)[rp + 2*i] = t;
+  }
+}
+
+export fn polyvec_reduce_jazz(reg u64 rp)
+{
+  stack u16[KYBER_VECN] r;
+  reg u16 t;
+  inline int i;
+
+  for i = 0 to KYBER_VECN {
+    t = (u16)[rp + 2*i];
+    r[i] = t;
+  }
+
+  r = __polyvec_reduce(r);
+
+  for i = 0 to KYBER_VECN {
+    t = r[i];
+    (u16)[rp + 2*i] = t;
+  }
+}
diff --git a/code/jasmin/mlkem_avx2/jspeed.jazz b/code/jasmin/mlkem_avx2/jspeed.jazz
new file mode 100644
index 00000000..45ff728d
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/jspeed.jazz
@@ -0,0 +1,197 @@
+require "poly.jinc"
+require "polyvec.jinc"
+require "gen_matrix.jinc"
+require "indcpa.jinc"
+require "kem.jinc"
+require "verify.jinc"
+
+/* Exported functions only for benchmarking */
+export fn gen_matrix_jazz(reg u64 ap, reg u64 seedp)
+{
+  stack u16[KYBER_K*KYBER_VECN] aa;
+  stack u8[KYBER_SYMBYTES] seed;
+
+  aa = __gen_matrix(seed, 1);
+}
+
+export fn poly_compress_jazz(reg u64 rp, reg u64 ap)
+{
+  stack u16[KYBER_N] a;
+
+  a = _poly_compress(rp, a);
+}
+
+export fn poly_decompress_jazz(reg u64 rp, reg u64 ap)
+{
+  stack u16[KYBER_N] r;
+
+  r = _poly_decompress(r, ap);
+}
+
+export fn poly_tomsg_jazz(reg u64 rp, reg u64 ap)
+{
+  stack u16[KYBER_N] a;
+
+  a = _poly_tomsg(rp, a);
+}
+
+export fn poly_frommsg_jazz(reg u64 rp, reg u64 ap)
+{
+  stack u16[KYBER_N] r;
+
+  r = _poly_frommsg(r, ap);
+}
+
+export fn poly_ntt_jazz(reg u64 rp)
+{
+  stack u16[KYBER_N] r;
+
+  r = _poly_ntt(r);
+}
+
+export fn poly_invntt_jazz(reg u64 rp)
+{
+  stack u16[KYBER_N] r;
+
+  r = _poly_invntt(r);
+}
+
+export fn poly_getnoise_jazz(reg u64 rp, reg u64 seedp, reg u8 nonce)
+{
+  stack u16[KYBER_N] r;
+  stack u8[KYBER_SYMBYTES] seed;
+
+  //r = _poly_getnoise_eta1_4x(r, seed, nonce);
+}
+
+export fn poly_getnoise_4x_jazz(reg u64 r0 r1 r2 r3, reg u64 seedp, reg u8 nonce)
+{
+  stack u16[KYBER_N] n0 n1 n2 n3;
+  stack u8[KYBER_SYMBYTES] seed;
+
+  n0, n1, n2, n3 = _poly_getnoise_eta1_4x(n0, n1, n2, n3, seed, nonce);
+}
+
+export fn polyvec_decompress_jazz(reg u64 rp, reg u64 ap)
+{
+  stack u16[KYBER_VECN] r;
+
+  r = __polyvec_decompress(ap);
+}
+
+export fn polyvec_compress_jazz(reg u64 rp, reg u64 ap)
+{
+  stack u16[KYBER_VECN] a;
+
+  __polyvec_compress(rp, a);
+}
+
+export fn polyvec_pointwise_acc_jazz(reg
u64 rp, reg u64 ap, reg u64 bp) +{ + stack u16[KYBER_VECN] a; + stack u16[KYBER_VECN] b; + stack u16[KYBER_N] r; + + r = __polyvec_pointwise_acc(r, a, b); +} + + +export fn indcpa_keypair_jazz(reg u64 pkp, reg u64 skp, reg u64 randomnessp) +{ + //__indcpa_keypair(pkp, skp, randomnessp); +} + + +export fn indcpa_enc_jazz(reg u64 ctp, reg u64 msgp, reg u64 pkp, reg u64 coinsp) +{ + stack u16[KYBER_VECN] pkpv sp ep bp; + stack u16[KYBER_K*KYBER_VECN] aat; + stack u16[KYBER_N] k epp v; + stack u8[KYBER_SYMBYTES] publicseed; + stack u8[KYBER_SYMBYTES] noiseseed; + reg u64 i; + reg u8 c nonce; + stack u64 sctp; + + sctp = ctp; + + i = 0; + while (i < KYBER_SYMBYTES) + { + c = (u8)[coinsp+i]; + noiseseed[(int)i] = c; + i += 1; + } + + pkpv = __polyvec_frombytes(pkp); + + i = 0; + pkp += KYBER_POLYVECBYTES; + while (i < KYBER_SYMBYTES) + { + c = (u8)[pkp]; + publicseed[(int)i] = c; + pkp += 1; + i += 1; + } + + k = _poly_frommsg(k, msgp); + + aat = __gen_matrix(publicseed, 1); + + nonce = 0; + sp[0:KYBER_N], sp[KYBER_N:KYBER_N], sp[2*KYBER_N:KYBER_N], ep[0:KYBER_N] = _poly_getnoise_eta1_4x(sp[0:KYBER_N], sp[KYBER_N:KYBER_N], sp[2*KYBER_N:KYBER_N], ep[0:KYBER_N], noiseseed, nonce); + + nonce = 4; + ep[KYBER_N:KYBER_N], ep[2*KYBER_N:KYBER_N], epp, bp[0:KYBER_N] = _poly_getnoise_eta1_4x(ep[KYBER_N:KYBER_N], ep[2*KYBER_N:KYBER_N], epp, bp[0:KYBER_N], noiseseed, nonce); + + sp = __polyvec_ntt(sp); + + bp[0:KYBER_N] = __polyvec_pointwise_acc(bp[0:KYBER_N], aat[0:KYBER_VECN], sp); + bp[KYBER_N:KYBER_N]= __polyvec_pointwise_acc(bp[KYBER_N:KYBER_N], aat[KYBER_VECN:KYBER_VECN], sp); + bp[2*KYBER_N:KYBER_N] = __polyvec_pointwise_acc(bp[2*KYBER_N:KYBER_N], aat[2*KYBER_VECN:KYBER_VECN], sp); + + v = __polyvec_pointwise_acc(v, pkpv, sp); + + bp = __polyvec_invntt(bp); + v = _poly_invntt(v); + + bp = __polyvec_add2(bp, ep); + v = _poly_add2(v, epp); + v = _poly_add2(v, k); + bp = __polyvec_reduce(bp); + v = __poly_reduce(v); + + ctp = sctp; + __polyvec_compress(ctp, bp); + ctp += KYBER_POLYVECCOMPRESSEDBYTES; + v = _poly_compress(ctp, v); +} + + +export fn indcpa_dec_jazz(reg u64 msgp, reg u64 ctp, reg u64 skp) +{ + __indcpa_dec_0(msgp, ctp, skp); +} + +export fn crypto_kem_keypair_jazz(reg u64 pkp, reg u64 skp, reg u64 randomnessp) +{ + //__crypto_kem_keypair_jazz(pkp, skp, randomnessp); +} + + +export fn crypto_kem_enc_jazz(reg u64 ctp, reg u64 shkp, reg u64 pkp, reg u64 randomnessp) +{ + //__crypto_kem_enc_jazz(ctp, shkp, pkp, randomnessp); +} + +export fn crypto_kem_dec_jazz(reg u64 shkp, reg u64 ctp, reg u64 skp) +{ + __crypto_kem_dec_jazz(shkp, ctp, skp); +} diff --git a/code/jasmin/mlkem_avx2/keccakf1600.jinc b/code/jasmin/mlkem_avx2/keccakf1600.jinc new file mode 100644 index 00000000..02996b6a --- /dev/null +++ b/code/jasmin/mlkem_avx2/keccakf1600.jinc @@ -0,0 +1,194 @@ +u64[24] KECCAK_RC = +{ 0x0000000000000001 + ,0x0000000000008082 + ,0x800000000000808a + ,0x8000000080008000 + ,0x000000000000808b + ,0x0000000080000001 + ,0x8000000080008081 + ,0x8000000000008009 + ,0x000000000000008a + ,0x0000000000000088 + ,0x0000000080008009 + ,0x000000008000000a + ,0x000000008000808b + ,0x800000000000008b + ,0x8000000000008089 + ,0x8000000000008003 + ,0x8000000000008002 + ,0x8000000000000080 + ,0x000000000000800a + ,0x800000008000000a + ,0x8000000080008081 + ,0x8000000000008080 + ,0x0000000080000001 + ,0x8000000080008008 +}; + +inline fn __index(inline int x y) -> inline int +{ + inline int r; + r = (x % 5) + 5 * (y % 5); + return r; +} + +inline fn __keccak_rho_offsets(inline int i) -> inline int +{ + inline int r 
x y z t;
+
+  r = 0;
+  x = 1;
+  y = 0;
+
+  for t = 0 to 24 {
+    if (i == x + 5 * y) {
+      r = ((t + 1) * (t + 2) / 2) % 64;
+    }
+    z = (2 * x + 3 * y) % 5;
+    x = y;
+    y = z;
+  }
+
+  return r;
+}
+
+inline fn __rhotates(inline int x y) -> inline int
+{
+  inline int i r;
+  i = __index(x, y);
+  r = __keccak_rho_offsets(i);
+  return r;
+}
+
+inline fn __theta_sum_scalar(reg ptr u64[25] a) -> reg u64[5]
+{
+  inline int i j ti;
+  reg u64[5] c;
+
+  for i=0 to 5
+  {
+    ti = __index(i, 0);
+    c[i] = a[ti];
+  }
+
+  for j=1 to 5
+  { for i=0 to 5
+    {
+      ti = __index(i, j);
+      c[i] ^= a[ti];
+    }
+  }
+
+  return c;
+}
+
+inline fn __theta_rol_scalar(reg u64[5] c) -> reg u64[5]
+{
+  inline int i;
+  reg u64[5] d;
+
+  for i = 0 to 5
+  { d[i] = c[(i+1)%5];
+    _, _, d[i] = #ROL_64(d[i], 1);
+    d[i] ^= c[(i+4)%5];
+  }
+
+  return d;
+}
+
+inline fn __rol_sum_scalar(
+  reg u64[5] d,
+  reg ptr u64[25] a,
+  inline int offset
+) -> reg u64[5]
+{
+  inline int j j1 k ti;
+  reg u64[5] c;
+
+  for j = 0 to 5
+  {
+    j1 = (j+offset) % 5;
+    k = __rhotates(j1, j);
+    ti = __index(j1, j);
+    c[j] = a[ti];
+    c[j] ^= d[j1];
+    _, _, c[j] = #ROL_64(c[j], k);
+  }
+
+  return c;
+}
+
+inline fn __set_row_scalar(
+  reg ptr u64[25] r,
+  inline int row,
+  reg u64[5] c,
+  reg u64 iota
+) -> reg ptr u64[25]
+{
+  inline int j j1 j2 ti;
+  reg u64 t;
+
+  for j= 0 to 5
+  {
+    j1 = (j+1) % 5;
+    j2 = (j+2) % 5;
+    t = !c[j1] & c[j2];
+    if row==0 && j==0 { t ^= iota; }
+    t ^= c[j];
+    ti = __index(j, row);
+    r[ti] = t;
+  }
+
+  return r;
+}
+
+inline fn __round2x_scalar(reg ptr u64[25] a r, reg u64 iota) -> reg ptr u64[25], reg ptr u64[25]
+{
+  reg u64[5] c d;
+
+  c = __theta_sum_scalar(a);
+  d = __theta_rol_scalar(c);
+  c = __rol_sum_scalar(d, a, 0);
+  r = __set_row_scalar(r, 0, c, iota);
+  c = __rol_sum_scalar(d, a, 3);
+  r = __set_row_scalar(r, 1, c, iota);
+  c = __rol_sum_scalar(d, a, 1);
+  r = __set_row_scalar(r, 2, c, iota);
+  c = __rol_sum_scalar(d, a, 4);
+  r = __set_row_scalar(r, 3, c, iota);
+  c = __rol_sum_scalar(d, a, 2);
+  r = __set_row_scalar(r, 4, c, iota);
+
+  return a, r;
+}
+
+#[returnaddress="stack"]
+fn _keccakf1600_scalar(reg ptr u64[25] a) -> reg ptr u64[25]
+{
+  stack u64[25] r;
+  reg ptr u64[24] iotas_p;
+  reg u64 iota;
+  reg u64 round;
+  stack u64 round_s;
+
+  iotas_p = KECCAK_RC;
+
+  round = 0;
+
+  while(round < 24)
+  {
+    iota = iotas_p[(int) round];
+    round_s = round;
+    a, r = __round2x_scalar(a, r, iota);
+    round = round_s;
+    round += 1;
+
+    iota = iotas_p[(int) round];
+    round_s = round;
+    r, a = __round2x_scalar(r, a, iota);
+    round = round_s;
+    round += 1;
+  }
+
+  return a;
+}
diff --git a/code/jasmin/mlkem_avx2/kem.c b/code/jasmin/mlkem_avx2/kem.c
new file mode 100644
index 00000000..667e45a7
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/kem.c
@@ -0,0 +1,145 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "kem.h"
+#include "indcpa.h"
+#include "symmetric.h"
+
+/*************************************************
+* Name:        verify
+*
+* Description: Compare two arrays for equality in constant time.
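+*              All byte pairs are XOR-ed and OR-ed into one accumulator, so
+*              the loop never branches on secret data; the final expression
+*              (-(uint64_t)r) >> 63 maps any nonzero accumulator to 1.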
diff --git a/code/jasmin/mlkem_avx2/kem.c b/code/jasmin/mlkem_avx2/kem.c
new file mode 100644
index 00000000..667e45a7
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/kem.c
@@ -0,0 +1,145 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "kem.h"
+#include "indcpa.h"
+#include "symmetric.h"
+
+/*************************************************
+* Name:        verify
+*
+* Description: Compare two arrays for equality in constant time.
+*
+* Arguments:   const uint8_t *a: pointer to first byte array
+*              const uint8_t *b: pointer to second byte array
+*              size_t len:       length of the byte arrays
+*
+* Returns 0 if the byte arrays are equal, 1 otherwise
+**************************************************/
+uint64_t verify(const uint8_t *a, const uint8_t *b, size_t len)
+{
+  size_t i;
+  uint8_t r;
+
+  r = 0;
+  for(i=0; i < len; i++)
+    r |= a[i] ^ b[i];
+
+  return (-(uint64_t)r) >> 63;
+}
+
+/*************************************************
+* Name:        cmov
+*
+* Description: Copy len bytes from x to r if b is 1;
+*              don't modify x if b is 0. Requires b to be in {0,1};
+*              assumes two's complement representation of negative integers.
+*              Runs in constant time.
+*
+* Arguments:   uint8_t *r:       pointer to output byte array
+*              const uint8_t *x: pointer to input byte array
+*              size_t len:       number of bytes to be copied
+*              uint8_t b:        condition bit; has to be in {0,1}
+**************************************************/
+void cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b)
+{
+  size_t i;
+
+  b = -b;
+  for(i=0; i < len; i++)
+    r[i] ^= b & (r[i] ^ x[i]);
+}
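+
+/* Usage sketch: verify() and cmov() compose into a constant-time
+   conditional overwrite, which is exactly the pattern crypto_kem_dec
+   uses below. With hypothetical buffers good and reject, and a
+   ciphertext pair ct, cmp of length ctlen (illustrative only):
+
+   uint64_t fail = verify(ct, cmp, ctlen);     // 0 if equal, 1 otherwise
+   memcpy(ss, reject, KYBER_SYMBYTES);         // start from rejection key
+   cmov(ss, good, KYBER_SYMBYTES, 1 - fail);   // keep good key only on match
+
+   Branching on fail instead would leak the comparison result through
+   timing. */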
+
+/*************************************************
+* Name:        crypto_kem_keypair
+*
+* Description: Generates public and private key for the CCA-secure
+*              Kyber key encapsulation mechanism
+*
+* Arguments:   - unsigned char *pk: pointer to output public key
+*              - unsigned char *sk: pointer to output private key
+*              - const unsigned char *randomness: pointer to input randomness
+*                (of length 2*KYBER_SYMBYTES bytes)
+**************************************************/
+void crypto_kem_keypair(unsigned char *pk,
+                        unsigned char *sk,
+                        const unsigned char *randomness)
+{
+  indcpa_keypair(pk, sk, randomness);
+
+  memcpy(sk+KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_INDCPA_PUBLICKEYBYTES);
+
+  hash_h(sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
+
+  memcpy(sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES, randomness + KYBER_SYMBYTES, KYBER_SYMBYTES);
+}
+
+/*************************************************
+* Name:        crypto_kem_enc
+*
+* Description: Generates ciphertext and shared
+*              secret for given public key
+*
+* Arguments:   - unsigned char *ct: pointer to output ciphertext (of length KYBER_CIPHERTEXTBYTES bytes)
+*              - unsigned char *ss: pointer to output shared secret (of length KYBER_SSBYTES bytes)
+*              - const unsigned char *pk: pointer to input public key (of length KYBER_PUBLICKEYBYTES bytes)
+*              - const unsigned char *coins: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
+*                to deterministically generate all randomness
+**************************************************/
+void crypto_kem_enc(unsigned char *ct,
+                    unsigned char *ss,
+                    const unsigned char *pk,
+                    const unsigned char *coins)
+{
+  uint8_t buf[2*KYBER_SYMBYTES];
+  /* Will contain key, coins */
+  uint8_t kr[2*KYBER_SYMBYTES];
+
+  memcpy(buf, coins, KYBER_SYMBYTES);
+
+  /* Multitarget countermeasure for coins + contributory KEM */
+  hash_h(buf+KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
+  hash_g(kr, buf, 2*KYBER_SYMBYTES);
+
+  /* coins are in kr+KYBER_SYMBYTES */
+  indcpa_enc(ct, buf, pk, kr+KYBER_SYMBYTES);
+
+  memcpy(ss,kr,KYBER_SYMBYTES);
+}
+
+/*************************************************
+* Name:        crypto_kem_dec
+*
+* Description: Generates shared secret for given
+*              ciphertext and private key
+*
+* Arguments:   - uint8_t *ss: pointer to output shared secret (of length KYBER_SSBYTES)
+*              - const uint8_t *ct: pointer to input ciphertext (of length KYBER_CIPHERTEXTBYTES)
+*              - const uint8_t *sk: pointer to input secret key (of length KYBER_SECRETKEYBYTES)
+**************************************************/
+void crypto_kem_dec(uint8_t *ss,
+                    const uint8_t *ct,
+                    const uint8_t *sk)
+{
+  int fail;
+  uint8_t buf[2*KYBER_SYMBYTES];
+  /* Will contain key, coins */
+  uint8_t kr[2*KYBER_SYMBYTES];
+  uint8_t cmp[KYBER_CIPHERTEXTBYTES+KYBER_SYMBYTES];
+  const uint8_t *pk = sk+KYBER_INDCPA_SECRETKEYBYTES;
+
+  indcpa_dec(buf, ct, sk);
+
+  /* Multitarget countermeasure for coins + contributory KEM */
+  memcpy(buf+KYBER_SYMBYTES, sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, KYBER_SYMBYTES);
+  hash_g(kr, buf, 2*KYBER_SYMBYTES);
+
+  /* coins are in kr+KYBER_SYMBYTES */
+  indcpa_enc(cmp, buf, pk, kr+KYBER_SYMBYTES);
+
+  fail = verify(ct, cmp, KYBER_CIPHERTEXTBYTES);
+
+  /* Compute rejection key */
+  rkprf(ss,sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES,ct);
+
+  /* Copy true key to return buffer if fail is false */
+  cmov(ss,kr,KYBER_SYMBYTES,!fail);
+}
diff --git a/code/jasmin/mlkem_avx2/kem.h b/code/jasmin/mlkem_avx2/kem.h
new file mode 100644
index 00000000..5a4fa9bf
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/kem.h
@@ -0,0 +1,41 @@
+#ifndef CRYPTO_KEM_H
+#define CRYPTO_KEM_H
+
+#include <stdint.h>
+
+void crypto_kem_keypair(unsigned char *pk,
+                        unsigned char *sk,
+                        const unsigned char *randomness);
+
+void crypto_kem_enc(unsigned char *ct,
+                    unsigned char *ss,
+                    const unsigned char *pk,
+                    const unsigned char *coins);
+
+void crypto_kem_dec(unsigned char *ss,
+                    const unsigned char *ct,
+                    const unsigned char *sk);
+
+void jade_kem_kyber_kyber768_amd64_avx2v_keypair_derand(unsigned char *pk,
+                                                        unsigned char *sk,
+                                                        const unsigned char *randomness);
+
+void jade_kem_kyber_kyber768_amd64_avx2v_enc_derand(unsigned char *c,
+                                                    const unsigned char *m,
+                                                    const unsigned char *pk,
+                                                    const unsigned char *coins);
+
+
+void jade_kem_kyber_kyber768_amd64_avx2v_keypair(unsigned char *pk,
+                                                 unsigned char *sk);
+
+void jade_kem_kyber_kyber768_amd64_avx2v_enc(unsigned char *c,
+                                             const unsigned char *m,
+                                             const unsigned char *pk);
+
+void jade_kem_kyber_kyber768_amd64_avx2v_dec(unsigned char *m,
+                                             const unsigned char *c,
+                                             const unsigned char *sk);
+
+
+#endif
diff --git a/code/jasmin/mlkem_avx2/kem.jinc b/code/jasmin/mlkem_avx2/kem.jinc
new file mode 100644
index 00000000..8f020389
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/kem.jinc
@@ -0,0 +1,142 @@
+require "indcpa.jinc"
+require "verify.jinc"
+
+inline
+fn __crypto_kem_keypair_jazz(reg u64 pkp, reg u64 skp, reg ptr u8[KYBER_SYMBYTES*2] randomnessp)
+{
+  stack ptr u8[KYBER_SYMBYTES*2] s_randomnessp;
+  reg ptr u8[KYBER_SYMBYTES] randomnessp1 randomnessp2;
+
+  stack u8[32] h_pk;
+  stack u64 s_skp s_pkp;
+  reg u64 t64;
+  inline int i;
+
+  s_randomnessp = randomnessp;
+  s_pkp = pkp;
+  s_skp = skp;
+
+  randomnessp1 = randomnessp[0:KYBER_SYMBYTES];
+  __indcpa_keypair(pkp, skp, randomnessp1);
+
+  skp = s_skp;
+  skp += KYBER_POLYVECBYTES;
+  pkp = s_pkp;
+
+  for i=0 to KYBER_INDCPA_PUBLICKEYBYTES/8
+  {
+    t64 = (u64)[pkp + 8*i];
+    (u64)[skp] = t64;
+    skp += 8;
+  }
+
+  s_skp = skp;
+  pkp = s_pkp;
+  t64 = KYBER_PUBLICKEYBYTES;
+  h_pk = _isha3_256(h_pk, pkp, t64);
+  skp = s_skp;
+
+  for i=0 to 4
+  {
+    t64 = h_pk[u64 i];
+    (u64)[skp] = t64;
+    skp += 8;
+  }
+
+  randomnessp = s_randomnessp;
+  randomnessp2 = randomnessp[KYBER_SYMBYTES:KYBER_SYMBYTES];
+  for i=0 to KYBER_SYMBYTES/8
+  {
+    t64 = randomnessp2[u64 i];
+    (u64)[skp] = t64;
+    skp += 8;
+  }
+}
+
+inline
+fn __crypto_kem_enc_jazz(reg u64 ctp, reg u64 shkp, reg u64 pkp, reg ptr u8[KYBER_SYMBYTES] randomnessp)
+{
+  inline int i;
+
+  stack u8[KYBER_SYMBYTES * 2] buf kr;
+  stack u64 s_pkp s_ctp s_shkp;
+  reg
u64 t64; + + s_pkp = pkp; + s_ctp = ctp; + s_shkp = shkp; + + for i=0 to KYBER_SYMBYTES/8 + { + t64 = randomnessp[u64 i]; + buf[u64 i] = t64; + } + + pkp = s_pkp; + + t64 = KYBER_PUBLICKEYBYTES; + buf[KYBER_SYMBYTES:KYBER_SYMBYTES] = _isha3_256(buf[KYBER_SYMBYTES:KYBER_SYMBYTES], pkp, t64); + + kr = _sha3_512_64(kr, buf); + + pkp = s_pkp; + + __indcpa_enc_0(s_ctp, buf[0:KYBER_INDCPA_MSGBYTES], pkp, kr[KYBER_SYMBYTES:KYBER_SYMBYTES]); + + shkp = s_shkp; + + for i=0 to KYBER_SYMBYTES/8 + { + t64 = kr[u64 i]; + (u64)[shkp + 8*i] = t64; + } +} + +inline +fn __crypto_kem_dec_jazz(reg u64 shkp, reg u64 ctp, reg u64 skp) +{ + stack u8[KYBER_INDCPA_CIPHERTEXTBYTES] ctpc; + stack u8[2*KYBER_SYMBYTES] kr buf; + stack u64 s_skp s_ctp s_shkp s_cnd; + reg u64 pkp hp zp t64 cnd; + inline int i; + + s_shkp = shkp; + s_ctp = ctp; + + buf[0:KYBER_INDCPA_MSGBYTES] = __indcpa_dec_1(buf[0:KYBER_INDCPA_MSGBYTES], ctp, skp); + + hp = skp + 32; + hp += 24 * KYBER_K * KYBER_N>>3; + + for i=0 to KYBER_SYMBYTES/8 + { + t64 = (u64)[hp + 8*i]; + buf.[u64 KYBER_SYMBYTES + 8*i] = t64; + } + + s_skp = skp; + + kr = _sha3_512_64(kr, buf); + + pkp = s_skp; + pkp += 12 * KYBER_K * KYBER_N>>3; + + ctpc = __indcpa_enc_1(ctpc, buf[0:KYBER_INDCPA_MSGBYTES], pkp, kr[KYBER_SYMBYTES:KYBER_SYMBYTES]); + + ctp = s_ctp; + cnd = __verify(ctp, ctpc); + s_cnd = cnd; /* avoidable ? */ + + ctp = s_ctp; + zp = s_skp; + zp += 64; + zp += 24 * KYBER_K * KYBER_N>>3; + + shkp = s_shkp; + _shake256_1120_32(shkp, zp, ctp); + + shkp = s_shkp; + cnd = s_cnd; + __cmov(shkp, kr[0:KYBER_SYMBYTES], cnd); +} diff --git a/code/jasmin/mlkem_avx2/ntt.S b/code/jasmin/mlkem_avx2/ntt.S new file mode 100644 index 00000000..30f7c210 --- /dev/null +++ b/code/jasmin/mlkem_avx2/ntt.S @@ -0,0 +1,198 @@ +#include "consts.h" +.include "shuffle.inc" +.include "fq.inc" + +# We break the dependency chains with the cost of slightly more additions. 
+# But they can be run in parallel to the multiplications on execution port 5 +# (multiplications only go to ports 0 and 1) +.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 +#mul +vpmullw %ymm\zl0,%ymm\rh0,%ymm12 +vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x +vpmullw %ymm\zl0,%ymm\rh1,%ymm13 +vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 +vpmullw %ymm\zl1,%ymm\rh2,%ymm14 +vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y +vpmullw %ymm\zl1,%ymm\rh3,%ymm15 +vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 + +#reduce +vpmulhw %ymm0,%ymm12,%ymm12 +vpmulhw %ymm0,%ymm13,%ymm13 +vpmulhw %ymm0,%ymm14,%ymm14 +vpmulhw %ymm0,%ymm15,%ymm15 + +vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 +vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 +vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 +vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 +vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 +vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 +vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 + +#update +vpaddw %ymm12,%ymm\rh0,%ymm\rh0 +vpsubw %ymm12,%ymm\rl0,%ymm\rl0 +vpaddw %ymm13,%ymm\rh1,%ymm\rh1 +vpsubw %ymm13,%ymm\rl1,%ymm\rl1 +vpaddw %ymm14,%ymm\rh2,%ymm\rh2 +vpsubw %ymm14,%ymm\rl2,%ymm\rl2 +vpaddw %ymm15,%ymm\rh3,%ymm\rh3 +vpsubw %ymm15,%ymm\rl3,%ymm\rl3 +.endm + +.text +ntt_level0_avx: +level0: +#zetas +vpbroadcastd (%rsi),%ymm15 +vpbroadcastd 4(%rsi),%ymm1 + +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 256(%rdi),%ymm8 +vmovdqa 288(%rdi),%ymm9 +vmovdqa 320(%rdi),%ymm10 +vmovdqa 352(%rdi),%ymm11 + +butterfly2 4,5,6,7,8,9,10,11 + +#store +vmovdqa %ymm4,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm7,96(%rdi) +vmovdqa %ymm8,256(%rdi) +vmovdqa %ymm9,288(%rdi) +vmovdqa %ymm10,320(%rdi) +vmovdqa %ymm11,352(%rdi) + +ret + +ntt_levels1t6_avx: +level1: +#zetas +vpbroadcastd (%rsi),%ymm15 +vpbroadcastd 4(%rsi),%ymm1 + +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +butterfly2 4,5,6,7,8,9,10,11,3 + +level2: +#zetas +vmovdqu 8(%rsi),%ymm15 +vmovdqu 40(%rsi),%ymm1 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +butterfly2 3,8,4,9,5,10,6,11,7 + +level3: +#zetas +vmovdqu 72(%rsi),%ymm15 +vmovdqu 104(%rsi),%ymm1 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +butterfly2 7,5,3,10,8,6,4,11,9 + +level4: +#zetas +vmovdqu 136(%rsi),%ymm15 +vmovdqu 168(%rsi),%ymm1 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +butterfly2 9,8,7,6,5,4,3,11,10 + +level5: +#zetas +vmovdqu 200(%rsi),%ymm15 +vmovdqu 232(%rsi),%ymm1 + +shuffle1 9,5,10,5 +shuffle1 8,4,9,4 +shuffle1 7,3,8,3 +shuffle1 6,11,7,11 + +butterfly2 10,5,9,4,8,3,7,11,6 + +level6: +#zetas +vmovdqu 264(%rsi),%ymm14 +vmovdqu 328(%rsi),%ymm15 +vmovdqu 296(%rsi),%ymm1 +vmovdqu 360(%rsi),%ymm2 + +butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 + +vmovdqa _16XV*2(%rdx),%ymm1 +red16 10,12 +red16 5,13 +red16 9,14 +red16 4,15 +red16 8,2 +red16 3,6 +red16 7,12 +red16 11,13 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm9,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm3,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(nttl0_avx) +cdecl(nttl0_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_EXP*2,%rsi +call ntt_level0_avx +add $128,%rdi +call ntt_level0_avx +ret + + +.global cdecl(nttl1t6_avx) +cdecl(nttl1t6_avx): +#consts +vmovdqa 
_16XQ*2(%rsi),%ymm0
+mov %rsi,%rdx
+add $_ZETAS_EXP*2,%rsi
+
+add $8,%rsi
+call ntt_levels1t6_avx
+add $256,%rdi
+add $392,%rsi
+call ntt_levels1t6_avx
+
+ret
diff --git a/code/jasmin/mlkem_avx2/ntt.c b/code/jasmin/mlkem_avx2/ntt.c
new file mode 100644
index 00000000..b706bbcc
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/ntt.c
@@ -0,0 +1,152 @@
+#include <stdint.h>
+#include "params.h"
+#include "ntt.h"
+#include "reduce.h"
+
+/* Code to generate zetas and zetas_inv used in the number-theoretic transform:
+
+#define KYBER_ROOT_OF_UNITY 17
+
+static const uint16_t tree[128] = {
+  0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
+  4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
+  2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
+  6, 70, 38, 102, 22, 86, 54, 118, 14, 78, 46, 110, 30, 94, 62, 126,
+  1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
+  5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
+  3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
+  7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127};
+
+static int16_t fqmul(int16_t a, int16_t b) {
+  return montgomery_reduce((int32_t)a*b);
+}
+
+void init_ntt() {
+  unsigned int i, j, k;
+  int16_t tmp[128];
+
+  tmp[0] = MONT;
+  for(i = 1; i < 128; ++i)
+    tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q);
+
+  for(i = 0; i < 128; ++i)
+    zetas[i] = tmp[tree[i]];
+
+  k = 0;
+  for(i = 64; i >= 1; i >>= 1)
+    for(j = i; j < 2*i; ++j)
+      zetas_inv[k++] = -tmp[128 - tree[j]];
+
+  zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q;
+}
+
+*/
+int16_t zetas[128] = {
+  2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, 2127, 1855, 1468,
+  573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, 732, 608, 1787, 411, 3124, 1758,
+  1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469,
+  2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054,
+  2226, 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653,
+  3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173, 3254,
+  817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670,
+  2144, 1799, 2051, 794, 1819, 2475, 2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628};
+
+int16_t zetas_inv[128] = {
+  1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, 1278, 1530, 1185,
+  1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, 1285, 2007, 2719, 2726, 2232, 2512,
+  75, 156, 3000, 2911, 2980, 872, 2685, 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246,
+  1676, 1755, 460, 291, 235, 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103,
+  1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853,
+  1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106,
+  1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, 829, 2946, 3065, 1325, 2756,
+  1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, 3127, 3042, 1907, 1836, 1517, 359, 758, 1441};
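+
+/* fqmul() below delegates to montgomery_reduce() from reduce.c. For
+   reference, the standard Kyber form of that routine, with
+   QINV = q^-1 mod 2^16 = 62209, is (illustrative sketch):
+
+   int16_t montgomery_reduce(int32_t a) {
+     int16_t u = (int16_t)a * (int16_t)62209;  // a*q^-1 mod 2^16
+     int32_t t = a - (int32_t)u*KYBER_Q;       // divisible by 2^16
+     return (int16_t)(t >> 16);                // a*2^-16 mod q, in (-q,q)
+   }
+*/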
+
+/*************************************************
+* Name:        fqmul
+*
+* Description: Multiplication followed by Montgomery reduction
+*
+* Arguments:   - int16_t a: first factor
+*              - int16_t b: second factor
+*
+* Returns 16-bit integer congruent to a*b*R^{-1} mod q
+**************************************************/
+static int16_t fqmul(int16_t a, int16_t b) {
+  return montgomery_reduce((int32_t)a*b);
+}
+
+/*************************************************
+* Name:        ntt
+*
+* Description: In-place number-theoretic transform (NTT) in Rq;
+*              input is in standard order, output is in bitreversed order
+*
+* Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
+**************************************************/
+void ntt(int16_t r[256]) {
+  unsigned int len, start, j, k;
+  int16_t t, zeta;
+
+  k = 1;
+  for(len = 128; len >= 2; len >>= 1) {
+    for(start = 0; start < 256; start = j + len) {
+      zeta = zetas[k++];
+      for(j = start; j < start + len; ++j) {
+        t = fqmul(zeta, r[j + len]);
+        r[j + len] = r[j] - t;
+        r[j] = r[j] + t;
+      }
+    }
+  }
+}
+
+/*************************************************
+* Name:        invntt
+*
+* Description: In-place inverse number-theoretic transform in Rq;
+*              input is in bitreversed order, output is in standard order
+*
+* Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
+**************************************************/
+void invntt(int16_t r[256]) {
+  unsigned int start, len, j, k;
+  int16_t t, zeta;
+
+  k = 0;
+  for(len = 2; len <= 128; len <<= 1) {
+    for(start = 0; start < 256; start = j + len) {
+      zeta = zetas_inv[k++];
+      for(j = start; j < start + len; ++j) {
+        t = r[j];
+        r[j] = barrett_reduce(t + r[j + len]);
+        r[j + len] = t - r[j + len];
+        r[j + len] = fqmul(zeta, r[j + len]);
+      }
+    }
+  }
+
+  for(j = 0; j < 256; ++j)
+    r[j] = fqmul(r[j], zetas_inv[127]);
+}
+
+/*************************************************
+* Name:        basemul
+*
+* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta),
+*              used for multiplication of elements in Rq in NTT domain
+*
+* Arguments:   - int16_t r[2]: pointer to the output polynomial
+*              - const int16_t a[2]: pointer to the first factor
+*              - const int16_t b[2]: pointer to the second factor
+*              - int16_t zeta: integer defining the reduction polynomial
+**************************************************/
+void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) {
+  r[0] = fqmul(a[1], b[1]);
+  r[0] = fqmul(r[0], zeta);
+  r[0] += fqmul(a[0], b[0]);
+
+  r[1] = fqmul(a[0], b[1]);
+  r[1] += fqmul(a[1], b[0]);
+}
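+
+/* basemul() above computes (a0 + a1*X)*(b0 + b1*X) mod (X^2 - zeta):
+   r0 = a0*b0 + zeta*a1*b1 and r1 = a0*b1 + a1*b0, with each product
+   routed through fqmul and therefore carrying one Montgomery factor
+   2^-16. The same identity in plain modular arithmetic (illustrative
+   sketch, without the Montgomery factor):
+
+   void basemul_plain(int32_t r[2], const int32_t a[2],
+                      const int32_t b[2], int32_t zeta) {
+     r[0] = (a[0]*b[0] + zeta*((a[1]*b[1]) % 3329)) % 3329;
+     r[1] = (a[0]*b[1] + a[1]*b[0]) % 3329;
+   }
+*/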
diff --git a/code/jasmin/mlkem_avx2/ntt.h b/code/jasmin/mlkem_avx2/ntt.h
new file mode 100644
index 00000000..9d621d62
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/ntt.h
@@ -0,0 +1,45 @@
+#ifndef NTT_H
+#define NTT_H
+
+#include <stdint.h>
+#include "params.h"
+#include "consts.h"
+
+extern int16_t zetas[128];
+extern int16_t zetas_inv[128];
+
+void invntt(int16_t *poly);
+void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta);
+
+#define ntt_avx KYBER_NAMESPACE(ntt_avx)
+//void ntt_avx(int16_t *r, const uint16_t *qdata);
+//void nttl0_avx(int16_t *r, const uint16_t *qdata);
+//void nttl1t6_avx(int16_t *r, const uint16_t *qdata);
+#define invntt_avx KYBER_NAMESPACE(invntt_avx)
+//void invntt_avx(int16_t *r, const uint16_t *qdata);
+
+#define nttpack_avx KYBER_NAMESPACE(nttpack_avx)
+void nttpack_avx(int16_t *r, const uint16_t *qdata);
+#define nttunpack_avx KYBER_NAMESPACE(nttunpack_avx)
+void nttunpack_avx(int16_t *r, const uint16_t *qdata);
+
+#define basemul_avx KYBER_NAMESPACE(basemul_avx)
+void basemul_avx(int16_t *r,
+                 const int16_t *a,
+                 const int16_t *b,
+                 const uint16_t *qdata);
+#define basemul_acc_avx KYBER_NAMESPACE(basemul_acc_avx)
+void basemul_acc_avx(int16_t *r,
+                     const int16_t *a,
+                     const int16_t *b,
+                     const uint16_t *qdata);
+
+#define ntttobytes_avx KYBER_NAMESPACE(ntttobytes_avx)
+void ntttobytes_avx(uint8_t *r, const int16_t *a, const uint16_t *qdata);
+#define nttfrombytes_avx KYBER_NAMESPACE(nttfrombytes_avx)
+void nttfrombytes_avx(int16_t *r, const uint8_t *a, const uint16_t *qdata);
+
+
+void ntt(int16_t *poly);
+
+#endif
diff --git a/code/jasmin/mlkem_avx2/params.h b/code/jasmin/mlkem_avx2/params.h
new file mode 100644
index 00000000..631fc411
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/params.h
@@ -0,0 +1,50 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#ifndef KYBER_K
+#define KYBER_K 3 /* Change this for different security strengths */
+#endif
+
+#define KYBER_NAMESPACE(s) s
+
+/* Don't change parameters below this line */
+
+#define KYBER_N 256
+#define KYBER_Q 3329
+
+#define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
+#define KYBER_SSBYTES  32   /* size in bytes of shared key */
+
+#define KYBER_POLYBYTES 384
+#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)
+
+#if KYBER_K == 2
+#define KYBER_ETA1 3
+#define KYBER_POLYCOMPRESSEDBYTES 128
+#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
+#elif KYBER_K == 3
+#define KYBER_ETA1 2
+#define KYBER_POLYCOMPRESSEDBYTES 128
+#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
+#elif KYBER_K == 4
+#define KYBER_ETA1 2
+#define KYBER_POLYCOMPRESSEDBYTES 160
+#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)
+#endif
+
+#define KYBER_ETA2 2
+
+#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES
+#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
+#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
+#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
+
+#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES)
+#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */
+#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES
+
+#endif
diff --git a/code/jasmin/mlkem_avx2/params.jinc b/code/jasmin/mlkem_avx2/params.jinc
new file mode 100644
index 00000000..caf2ec9f
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/params.jinc
@@ -0,0 +1,26 @@
+param int KYBER_K = 3;
+
+param int KYBER_Q = 3329;
+param int KYBER_N = 256;
+param int KYBER_VECN = KYBER_K * KYBER_N;
+
+param int KYBER_SYMBYTES = 32;
+param int KYBER_SSBYTES = 32;
+
+param int KYBER_ETA1 = 2;
+param int KYBER_ETA2 = 2;
+
+param int KYBER_POLYBYTES = 384;
+param int KYBER_POLYVECBYTES = (KYBER_K * KYBER_POLYBYTES);
+
+param int KYBER_POLYCOMPRESSEDBYTES = 128;
+param int KYBER_POLYVECCOMPRESSEDBYTES = (KYBER_K * 320);
+
+param int KYBER_INDCPA_MSGBYTES = KYBER_SYMBYTES;
+param int KYBER_INDCPA_PUBLICKEYBYTES = KYBER_POLYVECBYTES + KYBER_SYMBYTES;
+param int KYBER_INDCPA_SECRETKEYBYTES = KYBER_POLYVECBYTES;
+param int KYBER_INDCPA_CIPHERTEXTBYTES = KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES;
+
+param int KYBER_PUBLICKEYBYTES = KYBER_INDCPA_PUBLICKEYBYTES;
+param int KYBER_SECRETKEYBYTES = KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES;
+param int KYBER_CIPHERTEXTBYTES = KYBER_INDCPA_CIPHERTEXTBYTES;
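+
+/* For KYBER_K = 3 these parameters instantiate ML-KEM-768:
+     KYBER_PUBLICKEYBYTES  = 3*384 + 32       = 1184
+     KYBER_SECRETKEYBYTES  = 1152 + 1184 + 64 = 2400
+     KYBER_CIPHERTEXTBYTES = 3*320 + 128      = 1088
+   A C sketch checking the same arithmetic against params.h
+   (illustrative only):
+
+   #include "params.h"
+   _Static_assert(KYBER_PUBLICKEYBYTES  == 1184, "pk size");
+   _Static_assert(KYBER_SECRETKEYBYTES  == 2400, "sk size");
+   _Static_assert(KYBER_CIPHERTEXTBYTES == 1088, "ct size");
+*/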
diff --git a/code/jasmin/mlkem_avx2/poly.c b/code/jasmin/mlkem_avx2/poly.c
new file mode 100644
index 00000000..19c64f60
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/poly.c
@@ -0,0 +1,378 @@
+#include <stdint.h>
+#include "params.h"
+#include "poly.h"
+#include "ntt.h"
+#include "reduce.h"
+#include "cbd.h"
+#include "symmetric.h"
+
+/*************************************************
+* Name:        poly_compress
+*
+* Description: Compression and subsequent serialization of a polynomial
+*
+* Arguments:   - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYCOMPRESSEDBYTES bytes)
+*              - const poly *a:    pointer to input polynomial
+**************************************************/
+void poly_compress(unsigned char *r, poly *a)
+{
+  uint8_t t[8];
+  int i,j,k=0;
+
+  poly_csubq(a);
+
+#if (KYBER_POLYCOMPRESSEDBYTES == 96)
+  for(i=0;i<KYBER_N;i+=8)
+  {
+    for(j=0;j<8;j++)
+      t[j] = ((((uint16_t)a->coeffs[i+j] << 3) + KYBER_Q/2) / KYBER_Q) & 7;
+
+    r[k]   = t[0] | (t[1] << 3) | (t[2] << 6);
+    r[k+1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
+    r[k+2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
+    k += 3;
+  }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N;i+=8)
+  {
+    for(j=0;j<8;j++)
+      t[j] = ((((uint16_t)a->coeffs[i+j] << 4) + KYBER_Q/2) / KYBER_Q) & 15;
+
+    r[k]   = t[0] | (t[1] << 4);
+    r[k+1] = t[2] | (t[3] << 4);
+    r[k+2] = t[4] | (t[5] << 4);
+    r[k+3] = t[6] | (t[7] << 4);
+    k += 4;
+  }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N;i+=8)
+  {
+    for(j=0;j<8;j++)
+      t[j] = ((((uint16_t)a->coeffs[i+j] << 5) + KYBER_Q/2) / KYBER_Q) & 31;
+
+    r[k]   = t[0] | (t[1] << 5);
+    r[k+1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
+    r[k+2] = (t[3] >> 1) | (t[4] << 4);
+    r[k+3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
+    r[k+4] = (t[6] >> 2) | (t[7] << 3);
+    k += 5;
+  }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {96, 128, 160}"
+#endif
+}
+
+/*************************************************
+* Name:        poly_decompress
+*
+* Description: De-serialization and subsequent decompression of a polynomial;
+*              approximate inverse of poly_compress
+*
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const unsigned char *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes)
+**************************************************/
+void poly_decompress(poly *r, const unsigned char *a)
+{
+  int i;
+#if (KYBER_POLYCOMPRESSEDBYTES == 96)
+  for(i=0;i<KYBER_N;i+=8)
+  {
+    r->coeffs[i+0] =  (((a[0] & 7) * KYBER_Q) + 4) >> 3;
+    r->coeffs[i+1] = ((((a[0] >> 3) & 7) * KYBER_Q) + 4) >> 3;
+    r->coeffs[i+2] = ((((a[0] >> 6) | ((a[1] << 2) & 4)) * KYBER_Q) + 4) >> 3;
+    r->coeffs[i+3] = ((((a[1] >> 1) & 7) * KYBER_Q) + 4) >> 3;
+    r->coeffs[i+4] = ((((a[1] >> 4) & 7) * KYBER_Q) + 4) >> 3;
+    r->coeffs[i+5] = ((((a[1] >> 7) | ((a[2] << 1) & 6)) * KYBER_Q) + 4) >> 3;
+    r->coeffs[i+6] = ((((a[2] >> 2) & 7) * KYBER_Q) + 4) >> 3;
+    r->coeffs[i+7] = ((((a[2] >> 5)) * KYBER_Q) + 4) >> 3;
+    a += 3;
+  }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N;i+=8)
+  {
+    r->coeffs[i+0] = (((a[0] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+1] = (((a[0] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+2] = (((a[1] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+3] = (((a[1] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+4] = (((a[2] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+5] = (((a[2] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+6] = (((a[3] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+7] = (((a[3] >> 4) * KYBER_Q) + 8) >> 4;
+    a += 4;
+  }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N;i+=8)
+  {
+    r->coeffs[i+0] =  (((a[0] & 31) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+1] = ((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+2] = ((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+3] = ((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+4] = ((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+5] = ((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+6] = ((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+7] =  (((a[4] >> 3) * KYBER_Q) + 16) >> 5;
+    a += 5;
+  }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {96, 128, 160}"
+#endif
+}
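+
+/* The two routines above implement the Kyber rounding pair: compression
+   maps x to t = round((2^d/q)*x) mod 2^d, computed in integers as
+   (((x << d) + KYBER_Q/2) / KYBER_Q) & (2^d - 1), and decompression
+   approximately inverts it as x' = round((q/2^d)*t). A sketch of the
+   d = 4 case used for KYBER_POLYCOMPRESSEDBYTES == 128 (illustrative
+   only):
+
+   static uint8_t compress4(uint16_t x) {   // x in [0, KYBER_Q)
+     return (uint8_t)(((((uint32_t)x << 4) + KYBER_Q/2) / KYBER_Q) & 15);
+   }
+   static uint16_t decompress4(uint8_t t) { // t in [0, 16)
+     return (uint16_t)(((uint32_t)(t & 15) * KYBER_Q + 8) >> 4);
+   }
+*/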
+
+/*************************************************
+* Name:        poly_tobytes
+*
+* Description: Serialization of a polynomial
+*
+* Arguments:   - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes)
+*              - const poly *a:    pointer to input polynomial
+**************************************************/
+void poly_tobytes(unsigned char *r, poly *a)
+{
+  int i;
+  uint16_t t0, t1;
+
+  poly_csubq(a);
+
+  for(i=0;i<KYBER_N/2;i++)
+  {
+    t0 = a->coeffs[2*i];
+    t1 = a->coeffs[2*i+1];
+    r[3*i]   = t0 & 0xff;
+    r[3*i+1] = (t0 >> 8) | ((t1 & 0xf) << 4);
+    r[3*i+2] = t1 >> 4;
+  }
+}
+
+/*************************************************
+* Name:        poly_frombytes
+*
+* Description: De-serialization of a polynomial;
+*              inverse of poly_tobytes
+*
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+void poly_frombytes(poly *r, const unsigned char *a)
+{
+  int i;
+
+  for(i=0;i<KYBER_N/2;i++)
+  {
+    r->coeffs[2*i]   = a[3*i]        | ((uint16_t)a[3*i+1] & 0x0f) << 8;
+    r->coeffs[2*i+1] = a[3*i+1] >> 4 | ((uint16_t)a[3*i+2] & 0xff) << 4;
+  }
+}
+
+/*************************************************
+* Name:        poly_getnoise_eta1
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+*              with output polynomial close to centered binomial distribution
+*              with parameter KYBER_ETA1
+*
+* Arguments:   - poly *r:                   pointer to output polynomial
+*              - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes)
+*              - unsigned char nonce:       one-byte input nonce
+**************************************************/
+void poly_getnoise_eta1(poly *r, const unsigned char *seed, unsigned char nonce)
+{
+  uint8_t buf[KYBER_ETA1*KYBER_N/4];
+
+  prf(buf, sizeof(buf), seed, nonce);
+  poly_cbd_eta1(r, buf);
+}
+
+/*************************************************
+* Name:        poly_getnoise_eta2
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+*              with output polynomial close to centered binomial distribution
+*              with parameter KYBER_ETA2
+*
+* Arguments:   - poly *r:             pointer to output polynomial
+*              - const uint8_t *seed: pointer to input seed
+*                                     (of length KYBER_SYMBYTES bytes)
+*              - uint8_t nonce:       one-byte input nonce
+**************************************************/
+void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
+{
+  uint8_t buf[KYBER_ETA2*KYBER_N/4];
+  prf(buf, sizeof(buf), seed, nonce);
+  poly_cbd_eta2(r, buf);
+}
+
+/*************************************************
+* Name:        poly_ntt
+*
+* Description: Computes negacyclic number-theoretic transform (NTT) of
+*              a polynomial in place;
+*              inputs assumed to be in normal order, output in bitreversed order
+*
+* Arguments:   - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_ntt(poly *r)
+{
+  ntt(r->coeffs);
+  poly_reduce(r);
+}
+
+/*************************************************
+* Name:        poly_invntt
+*
+* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of
+*              a polynomial in place;
+*              inputs assumed to be in bitreversed order, output in normal order
+*
+* Arguments:   - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_invntt(poly *r)
+{
+  invntt(r->coeffs);
+}
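+
+/* The transforms compose as follows for a full multiplication in Rq
+   (illustrative sketch; poly_basemul is defined below):
+
+   poly a, b, r;
+   // ... fill a and b ...
+   poly_ntt(&a);              // standard order -> bitreversed NTT domain
+   poly_ntt(&b);
+   poly_basemul(&r, &a, &b);  // pairwise products mod (X^2 - zeta)
+   poly_invntt(&r);           // back to standard order
+   poly_frommont(&r);         // can remove a leftover Montgomery factor
+*/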
+
+/*************************************************
+* Name:        poly_basemul
+*
+* Description: Multiplication of two polynomials in NTT domain
+*
+* Arguments:   - poly *r:       pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_basemul(poly *r, const poly *a, const poly *b)
+{
+  unsigned int i;
+
+  for(i = 0; i < KYBER_N/4; ++i) {
+    basemul(r->coeffs + 4*i, a->coeffs + 4*i, b->coeffs + 4*i, zetas[64 + i]);
+    basemul(r->coeffs + 4*i + 2, a->coeffs + 4*i + 2, b->coeffs + 4*i + 2, -zetas[64 + i]);
+  }
+}
+
+/*************************************************
+* Name:        poly_frommont
+*
+* Description: Inplace conversion of all coefficients of a polynomial
+*              from Montgomery domain to normal domain
+*
+* Arguments:   - poly *r: pointer to input/output polynomial
+**************************************************/
+void poly_frommont(poly *r)
+{
+  int i;
+  const int16_t f = (1ULL << 32) % KYBER_Q;
+
+  for(i=0;i<KYBER_N;i++)
+    r->coeffs[i] = montgomery_reduce((int32_t)r->coeffs[i]*f);
+}
+
+/*************************************************
+* Name:        poly_reduce
+*
+* Description: Applies Barrett reduction to all coefficients of a polynomial;
+*              for details of the Barrett reduction see comments in reduce.c
+*
+* Arguments:   - poly *r: pointer to input/output polynomial
+**************************************************/
+void poly_reduce(poly *r)
+{
+  int i;
+
+  for(i=0;i<KYBER_N;i++)
+    r->coeffs[i] = barrett_reduce(r->coeffs[i]);
+}
+
+/*************************************************
+* Name:        poly_csubq
+*
+* Description: Applies conditional subtraction of q to each coefficient of a polynomial;
+*              for details of conditional subtraction of q see comments in reduce.c
+*
+* Arguments:   - poly *r: pointer to input/output polynomial
+**************************************************/
+void poly_csubq(poly *r)
+{
+  int i;
+
+  for(i=0;i<KYBER_N;i++)
+    r->coeffs[i] = csubq(r->coeffs[i]);
+}
+
+/*************************************************
+* Name:        poly_add
+*
+* Description: Add two polynomials
+*
+* Arguments:   - poly *r:       pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_add(poly *r, const poly *a, const poly *b)
+{
+  int i;
+  for(i=0;i<KYBER_N;i++)
+    r->coeffs[i] = a->coeffs[i] + b->coeffs[i];
+}
+
+/*************************************************
+* Name:        poly_sub
+*
+* Description: Subtract two polynomials
+*
+* Arguments:   - poly *r:       pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_sub(poly *r, const poly *a, const poly *b)
+{
+  int i;
+  for(i=0;i<KYBER_N;i++)
+    r->coeffs[i] = a->coeffs[i] - b->coeffs[i];
+}
+
+/*************************************************
+* Name:        poly_frommsg
+*
+* Description: Convert 32-byte message to polynomial
+*
+* Arguments:   - poly *r:                  pointer to output polynomial
+*              - const unsigned char *msg: pointer to input message
+**************************************************/
+void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES])
+{
+  int i,j;
+  uint16_t mask;
+
+  for(i=0;i<KYBER_SYMBYTES;i++)
+  {
+    for(j=0;j<8;j++)
+    {
+      mask = -((msg[i] >> j)&1);
+      r->coeffs[8*i+j] = mask & ((KYBER_Q+1)/2);
+    }
+  }
+}
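+
+/* poly_frommsg maps message bit 1 to (KYBER_Q+1)/2 = 1665 and bit 0 to 0;
+   poly_tomsg below recovers a bit by testing whether a coefficient lies
+   closer to q/2 than to 0 or q. The per-bit decision (illustrative
+   sketch):
+
+   static int decode_bit(uint16_t x) {   // x in [0, KYBER_Q)
+     return (int)(((((uint32_t)x << 1) + KYBER_Q/2) / KYBER_Q) & 1);
+   }
+*/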
+
+/*************************************************
+* Name:        poly_tomsg
+*
+* Description: Convert polynomial to 32-byte message
+*
+* Arguments:   - unsigned char *msg: pointer to output message
+*              - const poly *a:      pointer to input polynomial
+**************************************************/
+void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a)
+{
+  uint16_t t;
+  int i,j;
+
+  poly_csubq(a);
+
+  for(i=0;i<KYBER_SYMBYTES;i++)
+  {
+    msg[i] = 0;
+    for(j=0;j<8;j++)
+    {
+      t = (((a->coeffs[8*i+j] << 1) + KYBER_Q/2) / KYBER_Q) & 1;
+      msg[i] |= t << j;
+    }
+  }
+}
diff --git a/code/jasmin/mlkem_avx2/poly.h b/code/jasmin/mlkem_avx2/poly.h
new file mode 100644
index 00000000..d9c2f73d
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/poly.h
@@ -0,0 +1,68 @@
+#ifndef POLY_H
+#define POLY_H
+
+#include <stdint.h>
+#include "params.h"
+
+/*
+ * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
+ * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
+ */
+typedef struct{
+  int16_t __attribute__((aligned(32))) coeffs[KYBER_N];
+} poly;
+
+void poly_compress(unsigned char *r, poly *a);
+void poly_decompress(poly *r, const unsigned char *a);
+
+void poly_tobytes(unsigned char *r, poly *a);
+void poly_frombytes(poly *r, const unsigned char *a);
+
+void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]);
+void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *r);
+
+void poly_getnoise_eta1(poly *r, const unsigned char *seed, unsigned char nonce);
+void poly_getnoise_eta2(poly *r, const unsigned char *seed, unsigned char nonce);
+
+void poly_ntt(poly *r);
+void poly_invntt(poly *r);
+void poly_basemul(poly *r, const poly *a, const poly *b);
+void poly_frommont(poly *r);
+
+void poly_reduce(poly *r);
+void poly_csubq(poly *r);
+
+void poly_add(poly *r, const poly *a, const poly *b);
+void poly_sub(poly *r, const poly *a, const poly *b);
+
+
+void poly_compress_jazz(unsigned char *r, poly *a);
+void poly_decompress_jazz(poly *r, const unsigned char *a);
+
+void poly_tobytes_jazz(unsigned char *r, poly *a);
+void poly_frombytes_jazz(poly *r, const unsigned char *a);
+
+void poly_frommsg_jazz(poly *r, const unsigned char msg[KYBER_SYMBYTES]);
+void poly_tomsg_jazz(unsigned char msg[KYBER_SYMBYTES], poly *r);
+
+void poly_getnoise_eta1_4x_jazz(poly *r, const unsigned char *seed, unsigned char nonce);
+void poly_getnoise_eta1122_4x_jazz(poly *r, const unsigned char *seed, unsigned char nonce);
+
+void poly_ntt_jazz(poly *r);
+void poly_invntt_jazz(poly *r);
+void poly_basemul_jazz(poly *r, const poly *a, const poly *b);
+void poly_frommont_jazz(poly *r);
+
+void poly_reduce_jazz(poly *r);
+void poly_csubq_jazz(poly *r);
+
+void poly_add2_jazz(poly *r, const poly *b);
+void poly_sub_jazz(poly *r, const poly *a, const poly *b);
+
+
+#endif
diff --git a/code/jasmin/mlkem_avx2/poly.jinc b/code/jasmin/mlkem_avx2/poly.jinc
new file mode 100644
index 00000000..0947512c
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/poly.jinc
@@ -0,0 +1,1411 @@
+require "params.jinc"
+require "shuffle.jinc"
+require "consts.jinc"
+require "reduce.jinc"
+require "fips202.jinc"
+require "fips202_4x.jinc"
+
+fn _poly_add2(reg ptr u16[KYBER_N] rp bp) -> stack u16[KYBER_N]
+{
+  inline int i;
+  reg u256 a;
+  reg u256 b;
+  reg u256 r;
+
+  for i = 0 to 16 {
+    a = rp.[u256 32*i];
+    b = bp.[u256 32*i];
+    r = #VPADD_16u16(a, b);
+    rp.[u256 32*i] = r;
+  }
+
+  return rp;
+}
+
+fn _poly_csubq(reg ptr u16[KYBER_N] rp) -> reg ptr u16[KYBER_N]
+{
+  reg u256 r qx16;
+  inline int i;
+
+  qx16 = jqx16[u256 0];
+
+  for i=0 to 16 {
+    r = rp.[u256 32*i];
+    r = __csubq(r, qx16);
+    rp.[u256 32*i] = r;
+  }
+
+  return rp;
+}
+
+inline
+fn __w256_interleave_u16(reg u256 al ah) -> reg u256, reg u256 {
+  reg u256 a0 a1;
+
+  a0 = #VPUNPCKL_16u16(al, ah);
+  a1 = #VPUNPCKH_16u16(al, ah);
+
+  return a0, a1;
+}
+
+inline
+fn __w256_deinterleave_u16(reg u256 _zero a0 a1) -> reg u256, reg u256 {
+  reg u256 al ah;
+
+  al =
#VPBLEND_16u16(a0,_zero,0xAA); + ah = #VPBLEND_16u16(a1,_zero,0xAA); + al = #VPACKUS_8u32(al, ah); + a0 = #VPSRL_8u32(a0,16); + a1 = #VPSRL_8u32(a1,16); + ah = #VPACKUS_8u32(a0, a1); + + return al, ah; +} + +inline +fn __mont_red(reg u256 lo hi qx16 qinvx16) -> reg u256 { + reg u256 m; + + m = #VPMULL_16u16(lo, qinvx16); + m = #VPMULH_16u16(m, qx16); + lo = #VPSUB_16u16(hi, m); + + return lo; +} + +inline +fn __wmul_16u16(reg u256 x y) -> reg u256, reg u256 { + reg u256 xyL xyH xy0 xy1; + xyL = #VPMULL_16u16(x, y); + xyH = #VPMULH_16u16(x, y); + xy0, xy1 = __w256_interleave_u16(xyL, xyH); + + return xy0, xy1; +} + +inline +fn __schoolbook16x(reg u256 are aim bre bim zeta zetaqinv qx16 qinvx16, inline int sign) -> reg u256, reg u256 +{ reg u256 zaim ac0 ac1 zbd0 zbd1 ad0 ad1 bc0 bc1 x0 x1 y0 y1 _zero; + + zaim = __fqmulprecomp16x(aim, zetaqinv, zeta, qx16); + + ac0, ac1 = __wmul_16u16(are, bre); + ad0, ad1 = __wmul_16u16(are, bim); + bc0, bc1 = __wmul_16u16(aim, bre); + zbd0, zbd1 = __wmul_16u16(zaim, bim); + + if (sign == 0) { + x0 = #VPADD_8u32(ac0, zbd0); + x1 = #VPADD_8u32(ac1, zbd1); + } else { + x0 = #VPSUB_8u32(ac0, zbd0); + x1 = #VPSUB_8u32(ac1, zbd1); + } + y0 = #VPADD_8u32(bc0, ad0); + y1 = #VPADD_8u32(bc1, ad1); + + _zero = #set0_256(); + x0, x1 = __w256_deinterleave_u16(_zero, x0, x1); + y0, y1 = __w256_deinterleave_u16(_zero, y0, y1); + x0 = __mont_red(x0, x1, qx16, qinvx16); + y0 = __mont_red(y0, y1, qx16, qinvx16); + return x0, y0; +} + +fn _poly_basemul(reg ptr u16[KYBER_N] rp ap bp) -> reg ptr u16[KYBER_N] +{ + reg u256 zeta zetaqinv qx16 qinvx16 are aim bre bim; + + qx16 = jqx16.[u256 0]; + qinvx16 = jqinvx16.[u256 0]; + + zetaqinv = jzetas_exp.[u256 272]; + zeta = jzetas_exp.[u256 304]; + + are = ap.[u256 32*0]; + aim = ap.[u256 32*1]; + bre = bp.[u256 32*0]; + bim = bp.[u256 32*1]; + are, aim = __schoolbook16x(are, aim, bre, bim, zeta, zetaqinv, qx16, qinvx16, 0); + rp.[u256 32*0] = are; + rp.[u256 32*1] = aim; + + are = ap.[u256 32*2]; + aim = ap.[u256 32*3]; + bre = bp.[u256 32*2]; + bim = bp.[u256 32*3]; + are, aim = __schoolbook16x(are, aim, bre, bim, zeta, zetaqinv, qx16, qinvx16, 1); + rp.[u256 32*2] = are; + rp.[u256 32*3] = aim; + + zetaqinv = jzetas_exp.[u256 336]; + zeta = jzetas_exp.[u256 368]; + + are = ap.[u256 32*4]; + aim = ap.[u256 32*5]; + bre = bp.[u256 32*4]; + bim = bp.[u256 32*5]; + are, aim = __schoolbook16x(are, aim, bre, bim, zeta, zetaqinv, qx16, qinvx16, 0); + rp.[u256 32*4] = are; + rp.[u256 32*5] = aim; + + are = ap.[u256 32*6]; + aim = ap.[u256 32*7]; + bre = bp.[u256 32*6]; + bim = bp.[u256 32*7]; + are, aim = __schoolbook16x(are, aim, bre, bim, zeta, zetaqinv, qx16, qinvx16, 1); + rp.[u256 32*6] = are; + rp.[u256 32*7] = aim; + + zetaqinv = jzetas_exp.[u256 664]; + zeta = jzetas_exp.[u256 696]; + + are = ap.[u256 32*8]; + aim = ap.[u256 32*9]; + bre = bp.[u256 32*8]; + bim = bp.[u256 32*9]; + are, aim = __schoolbook16x(are, aim, bre, bim, zeta, zetaqinv, qx16, qinvx16, 0); + rp.[u256 32*8] = are; + rp.[u256 32*9] = aim; + + are = ap.[u256 32*10]; + aim = ap.[u256 32*11]; + bre = bp.[u256 32*10]; + bim = bp.[u256 32*11]; + are, aim = __schoolbook16x(are, aim, bre, bim, zeta, zetaqinv, qx16, qinvx16, 1); + rp.[u256 32*10] = are; + rp.[u256 32*11] = aim; + + zetaqinv = jzetas_exp.[u256 728]; + zeta = jzetas_exp.[u256 760]; + + are = ap.[u256 32*12]; + aim = ap.[u256 32*13]; + bre = bp.[u256 32*12]; + bim = bp.[u256 32*13]; + are, aim = __schoolbook16x(are, aim, bre, bim, zeta, zetaqinv, qx16, qinvx16, 0); + rp.[u256 32*12] = are; + rp.[u256 
32*13] = aim; + + are = ap.[u256 32*14]; + aim = ap.[u256 32*15]; + bre = bp.[u256 32*14]; + bim = bp.[u256 32*15]; + are, aim = __schoolbook16x(are, aim, bre, bim, zeta, zetaqinv, qx16, qinvx16, 1); + rp.[u256 32*14] = are; + rp.[u256 32*15] = aim; + + return rp; +} + +u16 pc_shift1_s = 0x200; +u16 pc_mask_s = 0x0F; +u16 pc_shift2_s = 0x1001; +u32[8] pc_permidx_s = {0,4,1,5,2,6,3,7}; + +fn _poly_compress(reg u64 rp, reg ptr u16[KYBER_N] a) -> reg ptr u16[KYBER_N] +{ + inline int i; + reg u256 f0 f1 f2 f3 v shift1 mask shift2 permidx; + reg ptr u16[16] x16p; + + a = _poly_csubq(a); + + x16p = jvx16; + v = x16p[u256 0]; + shift1 = #VPBROADCAST_16u16(pc_shift1_s); + mask = #VPBROADCAST_16u16(pc_mask_s); + shift2 = #VPBROADCAST_16u16(pc_shift2_s); + permidx = pc_permidx_s[u256 0]; + + for i=0 to KYBER_N/64 + { + f0 = a[u256 4*i]; + f1 = a[u256 4*i + 1]; + f2 = a[u256 4*i + 2]; + f3 = a[u256 4*i + 3]; + f0 = #VPMULH_16u16(f0, v); + f1 = #VPMULH_16u16(f1, v); + f2 = #VPMULH_16u16(f2, v); + f3 = #VPMULH_16u16(f3, v); + f0 = #VPMULHRS_16u16(f0, shift1); + f1 = #VPMULHRS_16u16(f1, shift1); + f2 = #VPMULHRS_16u16(f2, shift1); + f3 = #VPMULHRS_16u16(f3, shift1); + f0 = #VPAND_256(f0, mask); + f1 = #VPAND_256(f1, mask); + f2 = #VPAND_256(f2, mask); + f3 = #VPAND_256(f3, mask); + f0 = #VPACKUS_16u16(f0, f1); + f2 = #VPACKUS_16u16(f2, f3); + f0 = #VPMADDUBSW_256(f0, shift2); + f2 = #VPMADDUBSW_256(f2, shift2); + f0 = #VPACKUS_16u16(f0, f2); + f0 = #VPERMD(permidx, f0); + (u256)[rp + 32*i] = f0; + } + + return a; +} + +fn _poly_compress_1(reg ptr u8[KYBER_POLYCOMPRESSEDBYTES] rp, reg ptr u16[KYBER_N] a) -> reg ptr u8[KYBER_POLYCOMPRESSEDBYTES], reg ptr u16[KYBER_N] +{ + inline int i; + reg u256 f0 f1 f2 f3 v shift1 mask shift2 permidx; + reg ptr u16[16] x16p; + + a = _poly_csubq(a); + + x16p = jvx16; + v = x16p[u256 0]; + shift1 = #VPBROADCAST_16u16(pc_shift1_s); + mask = #VPBROADCAST_16u16(pc_mask_s); + shift2 = #VPBROADCAST_16u16(pc_shift2_s); + permidx = pc_permidx_s[u256 0]; + + for i=0 to KYBER_N/64 + { + f0 = a[u256 4*i]; + f1 = a[u256 4*i + 1]; + f2 = a[u256 4*i + 2]; + f3 = a[u256 4*i + 3]; + f0 = #VPMULH_16u16(f0, v); + f1 = #VPMULH_16u16(f1, v); + f2 = #VPMULH_16u16(f2, v); + f3 = #VPMULH_16u16(f3, v); + f0 = #VPMULHRS_16u16(f0, shift1); + f1 = #VPMULHRS_16u16(f1, shift1); + f2 = #VPMULHRS_16u16(f2, shift1); + f3 = #VPMULHRS_16u16(f3, shift1); + f0 = #VPAND_256(f0, mask); + f1 = #VPAND_256(f1, mask); + f2 = #VPAND_256(f2, mask); + f3 = #VPAND_256(f3, mask); + f0 = #VPACKUS_16u16(f0, f1); + f2 = #VPACKUS_16u16(f2, f3); + f0 = #VPMADDUBSW_256(f0, shift2); + f2 = #VPMADDUBSW_256(f2, shift2); + f0 = #VPACKUS_16u16(f0, f2); + f0 = #VPERMD(permidx, f0); + rp.[u256 32*i] = f0; + } + + return rp, a; +} + +u8[32] pd_jshufbidx = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3, + 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7}; +u32 pd_mask_s = 0x00F0000F; +u32 pd_shift_s = 0x800800; + +fn _poly_decompress(reg ptr u16[KYBER_N] rp, reg u64 ap) -> stack u16[KYBER_N] +{ + inline int i; + reg u256 f q shufbidx mask shift; + reg ptr u16[16] x16p; + reg ptr u8[32] x32p; + + x16p = jqx16; + q = x16p[u256 0]; + x32p = pd_jshufbidx; + shufbidx = x32p[u256 0]; + mask = #VPBROADCAST_8u32(pd_mask_s); + shift = #VPBROADCAST_8u32(pd_shift_s); + + f = #set0_256(); + + for i=0 to KYBER_N/16 + { + f = #VPBROADCAST_2u128((u128)[ap + 8*i]); + f = #VPSHUFB_256(f, shufbidx); + f = #VPAND_256(f, mask); + f = #VPMULL_16u16(f, shift); + f = #VPMULHRS_16u16(f, q); + rp[u256 i] = f; + } + + return rp; +} + + +fn _poly_frombytes(reg ptr u16[KYBER_N] rp, reg 
u64 ap) -> reg ptr u16[KYBER_N] +{ + inline int i; + reg u256 t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 tt mask; + reg ptr u16[16] maskp; + + maskp = maskx16; + mask = maskp[u256 0]; + + for i=0 to 2 + { + t0 = (u256)[ap + 192*i]; + t1 = (u256)[ap + 192*i + 32]; + t2 = (u256)[ap + 192*i + 64]; + t3 = (u256)[ap + 192*i + 96]; + t4 = (u256)[ap + 192*i + 128]; + t5 = (u256)[ap + 192*i + 160]; + + tt, t3 = __shuffle8(t0, t3); + t0, t4 = __shuffle8(t1, t4); + t1, t5 = __shuffle8(t2, t5); + + t2, t4 = __shuffle4(tt, t4); + tt, t1 = __shuffle4(t3, t1); + t3, t5 = __shuffle4(t0, t5); + + t0, t1 = __shuffle2(t2, t1); + t2, t3 = __shuffle2(t4, t3); + t4, t5 = __shuffle2(tt, t5); + + t6, t3 = __shuffle1(t0, t3); + t0, t4 = __shuffle1(t1, t4); + t1, t5 = __shuffle1(t2, t5); + + t7 = #VPSRL_16u16(t6, 12); + t8 = #VPSLL_16u16(t3, 4); + t7 = #VPOR_256(t7, t8); + t6 = #VPAND_256(mask, t6); + t7 = #VPAND_256(mask, t7); + + t8 = #VPSRL_16u16(t3, 8); + t9 = #VPSLL_16u16(t0, 8); + t8 = #VPOR_256(t8,t9); + t8 = #VPAND_256(mask,t8); + + t9 = #VPSRL_16u16(t0, 4); + t9 = #VPAND_256(mask, t9); + + t10 = #VPSRL_16u16(t4, 12); + t11 = #VPSLL_16u16(t1, 4); + t10 = #VPOR_256(t10, t11); + t4 = #VPAND_256(mask,t4); + t10 = #VPAND_256(mask, t10); + + t11 = #VPSRL_16u16(t1, 8); + tt = #VPSLL_16u16(t5, 8); + t11 = #VPOR_256(t11, tt); + t11 = #VPAND_256(mask, t11); + + tt = #VPSRL_16u16(t5, 4); + tt = #VPAND_256(mask, tt); + + rp[u256 8*i] = t6; + rp[u256 8*i + 1] = t7; + rp[u256 8*i + 2] = t8; + rp[u256 8*i + 3] = t9; + rp[u256 8*i + 4] = t4; + rp[u256 8*i + 5] = t10; + rp[u256 8*i + 6] = t11; + rp[u256 8*i + 7] = tt; + } + + return rp; +} + +param int DMONT = 1353; /* (1ULL << 32) % KYBER_Q */ + +fn _poly_frommont(reg ptr u16[KYBER_N] rp) -> reg ptr u16[KYBER_N] +{ + reg u256 t qx16 qinvx16 dmontx16; + inline int i; + reg ptr u16[16] x16p; + + x16p = jqx16; + qx16 = x16p[u256 0]; + x16p = jqinvx16; + qinvx16 = x16p[u256 0]; + x16p = jdmontx16; + dmontx16 = x16p[u256 0]; + + for i=0 to KYBER_N/16 + { + t = rp[u256 i]; + t = __fqmulx16(t, dmontx16, qx16, qinvx16); + rp[u256 i] = t; + } + + return rp; +} + +u32[4] pfm_shift_s = {3, 2, 1, 0}; +u8[16] pfm_idx_s = {0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15}; + +fn _poly_frommsg(reg ptr u16[KYBER_N] rp, reg u64 ap) -> stack u16[KYBER_N] +{ + inline int i; + reg u256 f g0 g1 g2 g3 h0 h1 h2 h3; + reg u256 shift idx hqs; + reg ptr u16[16] x16p; + + x16p = hqx16_p1; + hqs = x16p[u256 0]; + shift = #VPBROADCAST_2u128(pfm_shift_s[u128 0]); + idx = #VPBROADCAST_2u128(pfm_idx_s[u128 0]); + + f = (u256)[ap]; + + for i=0 to 4 + { + g3 = #VPSHUFD_256(f, 0x55*i); + g3 = #VPSLLV_8u32(g3, shift); + g3 = #VPSHUFB_256(g3, idx); + g0 = #VPSLL_16u16(g3,12); + g1 = #VPSLL_16u16(g3,8); + g2 = #VPSLL_16u16(g3,4); + g0 = #VPSRA_16u16(g0,15); + g1 = #VPSRA_16u16(g1,15); + g2 = #VPSRA_16u16(g2,15); + g3 = #VPSRA_16u16(g3,15); + g0 = #VPAND_256(g0,hqs); + g1 = #VPAND_256(g1,hqs); + g2 = #VPAND_256(g2,hqs); + g3 = #VPAND_256(g3,hqs); + h0 = #VPUNPCKL_4u64(g0,g1); + h2 = #VPUNPCKH_4u64(g0,g1); + h1 = #VPUNPCKL_4u64(g2,g3); + h3 = #VPUNPCKH_4u64(g2,g3); + g0 = #VPERM2I128(h0,h1,0x20); + g2 = #VPERM2I128(h0,h1,0x31); + g1 = #VPERM2I128(h2,h3,0x20); + g3 = #VPERM2I128(h2,h3,0x31); + rp[u256 2*i] = g0; + rp[u256 2*i + 1] = g1; + rp[u256 2*i + 8] = g2; + rp[u256 2*i + 8 + 1] = g3; + } + + return rp; +} + + +fn _poly_frommsg_1(reg ptr u16[KYBER_N] rp, reg ptr u8[32] ap) -> stack u16[KYBER_N] +{ + inline int i; + reg u256 f g0 g1 g2 g3 h0 h1 h2 h3; + reg u256 shift idx hqs; + reg ptr u16[16] x16p; + + 
x16p = hqx16_p1; + hqs = x16p[u256 0]; + shift = #VPBROADCAST_2u128(pfm_shift_s[u128 0]); + idx = #VPBROADCAST_2u128(pfm_idx_s[u128 0]); + + f = ap[u256 0]; + + for i=0 to 4 + { + g3 = #VPSHUFD_256(f, 0x55*i); + g3 = #VPSLLV_8u32(g3, shift); + g3 = #VPSHUFB_256(g3, idx); + g0 = #VPSLL_16u16(g3,12); + g1 = #VPSLL_16u16(g3,8); + g2 = #VPSLL_16u16(g3,4); + g0 = #VPSRA_16u16(g0,15); + g1 = #VPSRA_16u16(g1,15); + g2 = #VPSRA_16u16(g2,15); + g3 = #VPSRA_16u16(g3,15); + g0 = #VPAND_256(g0,hqs); + g1 = #VPAND_256(g1,hqs); + g2 = #VPAND_256(g2,hqs); + g3 = #VPAND_256(g3,hqs); + h0 = #VPUNPCKL_4u64(g0,g1); + h2 = #VPUNPCKH_4u64(g0,g1); + h1 = #VPUNPCKL_4u64(g2,g3); + h3 = #VPUNPCKH_4u64(g2,g3); + g0 = #VPERM2I128(h0,h1,0x20); + g2 = #VPERM2I128(h0,h1,0x31); + g1 = #VPERM2I128(h2,h3,0x20); + g3 = #VPERM2I128(h2,h3,0x31); + rp[u256 2*i] = g0; + rp[u256 2*i + 1] = g1; + rp[u256 2*i + 8] = g2; + rp[u256 2*i + 8 + 1] = g3; + } + + return rp; +} + + +param int NOISE_NBLOCKS = (KYBER_ETA1 * KYBER_N/4 + SHAKE256_RATE - 1)/SHAKE256_RATE; + +u8[32] cbd_jshufbidx = {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1, + 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1, 13, 14, 15, -1}; + +inline +fn __cbd3(reg ptr u16[KYBER_N] rp, reg ptr u8[KYBER_ETA1*KYBER_N/4+(KYBER_ETA1 - 2)*8] buf) -> reg ptr u16[KYBER_N]{ + inline int i; + reg u256 f0 f1 f2 f3; + reg u256 mask249 mask6DB mask07 mask70 mask3 shufbidx; + stack u32 mask249_s mask6DB_s mask07_s mask70_s; + stack u16 mask3_s; + + mask249_s = 0x249249; + mask6DB_s = 0x6DB6DB; + mask07_s = 7; + mask70_s = (7 << 16); + mask3_s = 3; + + mask249 = #VPBROADCAST_8u32(mask249_s); + mask6DB = #VPBROADCAST_8u32(mask6DB_s); + mask07 = #VPBROADCAST_8u32(mask07_s); + mask70 = #VPBROADCAST_8u32(mask70_s); + mask3 = #VPBROADCAST_16u16(mask3_s); + shufbidx = cbd_jshufbidx[u256 0]; + + for i=0 to KYBER_N/32 + { + f0 = buf.[u256 24*i]; + f0 = #VPERMQ(f0, 0x94); + f0 = #VPSHUFB_256(f0, shufbidx); + + f1 = #VPSRL_8u32(f0, 1); + f2 = #VPSRL_8u32(f0, 2); + f0 = #VPAND_256(mask249, f0); + f1 = #VPAND_256(mask249, f1); + f2 = #VPAND_256(mask249, f2); + f0 = #VPADD_8u32(f0, f1); + f0 = #VPADD_8u32(f0, f2); + + f1 = #VPSRL_8u32(f0, 3); + f0 = #VPADD_8u32(f0, mask6DB); + f0 = #VPSUB_8u32(f0, f1); + + f1 = #VPSLL_8u32(f0, 10); + f2 = #VPSRL_8u32(f0, 12); + f3 = #VPSRL_8u32(f0, 2); + f0 = #VPAND_256(f0, mask07); + f1 = #VPAND_256(f1, mask70); + f2 = #VPAND_256(f2, mask07); + f3 = #VPAND_256(f3, mask70); + f0 = #VPADD_16u16(f0, f1); + f1 = #VPADD_16u16(f2, f3); + f0 = #VPSUB_16u16(f0, mask3); + f1 = #VPSUB_16u16(f1, mask3); + + f2 = #VPUNPCKL_8u32(f0, f1); + f3 = #VPUNPCKH_8u32(f0, f1); + + f0 = #VPERM2I128(f2, f3, 0x20); + f1 = #VPERM2I128(f2, f3, 0x31); + + rp[u256 2*i] = f0; + rp[u256 2*i + 1] = f1; + } + + return rp; +} + + +inline +fn __cbd2(reg ptr u16[KYBER_N] rp, reg ptr u8[KYBER_ETA2*KYBER_N/4] buf) -> reg ptr u16[KYBER_N] +{ + inline int i; + reg u256 f0 f1 f2 f3; + reg u256 mask55 mask33 mask03 mask0F; + reg u128 t; + stack u32 mask55_s mask33_s mask03_s mask0F_s; + + mask55_s = 0x55555555; + mask33_s = 0x33333333; + mask03_s = 0x03030303; + mask0F_s = 0x0F0F0F0F; + + mask55 = #VPBROADCAST_8u32(mask55_s); + mask33 = #VPBROADCAST_8u32(mask33_s); + mask03 = #VPBROADCAST_8u32(mask03_s); + mask0F = #VPBROADCAST_8u32(mask0F_s); + + for i=0 to KYBER_N/64 + { + f0 = buf[u256 i]; + + f1 = #VPSRL_16u16(f0, 1); + f0 = #VPAND_256(mask55, f0); + f1 = #VPAND_256(mask55, f1); + f0 = #VPADD_32u8(f0, f1); + + f1 = #VPSRL_16u16(f0, 2); + f0 = #VPAND_256(mask33, f0); + f1 = #VPAND_256(mask33, f1); + f0 
= #VPADD_32u8(f0, mask33); + f0 = #VPSUB_32u8(f0, f1); + + f1 = #VPSRL_16u16(f0, 4); + f0 = #VPAND_256(mask0F, f0); + f1 = #VPAND_256(mask0F, f1); + f0 = #VPSUB_32u8(f0, mask03); + f1 = #VPSUB_32u8(f1, mask03); + + f2 = #VPUNPCKL_32u8(f0, f1); + f3 = #VPUNPCKH_32u8(f0, f1); + + t = (128u)f2; + f0 = #VPMOVSX_16u8_16u16(t); + t = #VEXTRACTI128(f2, 1); + f1 = #VPMOVSX_16u8_16u16(t); + t = (128u)f3; + f2 = #VPMOVSX_16u8_16u16(t); + t = #VEXTRACTI128(f3, 1); + f3 = #VPMOVSX_16u8_16u16(t); + rp[u256 4*i] = f0; + rp[u256 4*i + 1] = f2; + rp[u256 4*i + 2] = f1; + rp[u256 4*i + 3] = f3; + } + + return rp; +} + +/* buf 32 bytes longer for cbd3 (KYBER_ETA1 == 3) */ +inline +fn __poly_cbd_eta1(reg ptr u16[KYBER_N] rp, reg ptr u8[KYBER_ETA1*KYBER_N/4+(KYBER_ETA1 - 2)*8] buf) -> reg ptr u16[KYBER_N] +{ + if(KYBER_ETA1 == 2) { // resolved at compile-time + rp = __cbd2(rp, buf[0:KYBER_ETA2*KYBER_N/4]); + } else { + rp = __cbd3(rp, buf); + } + + return rp; +} + +inline +fn __poly_cbd_eta2(reg ptr u16[KYBER_N] rp, reg ptr u8[KYBER_ETA2*KYBER_N/4] buf) -> reg ptr u16[KYBER_N] +{ + if(KYBER_ETA2 == 2) { + rp = __cbd2(rp, buf); + } + return rp; +} + +/* +#[returnaddress="stack"] +fn _poly_getnoise(reg ptr u16[KYBER_N] rp, reg ptr u8[KYBER_SYMBYTES] seed, reg u8 nonce) -> reg ptr u16[KYBER_N] +{ + inline int i; + reg u256 f0 f1 f2 f3; + reg u256 mask55 mask33 mask03 mask0F; + reg u128 t; + reg u64 t64; + stack ptr u16[KYBER_N] srp; + stack u8[128] buf; + stack u8[33] extseed; + stack u32 mask55_s mask33_s mask03_s mask0F_s; + + mask55_s = 0x55555555; + mask33_s = 0x33333333; + mask03_s = 0x03030303; + mask0F_s = 0x0F0F0F0F; + + srp = rp; + + for i=0 to KYBER_SYMBYTES/8 + { + t64 = seed[u64 i]; + extseed[u64 i] = t64; + } + extseed[KYBER_SYMBYTES] = nonce; + + buf = _shake256_128_33(buf, extseed); + + mask55 = #VPBROADCAST_8u32(mask55_s); + mask33 = #VPBROADCAST_8u32(mask33_s); + mask03 = #VPBROADCAST_8u32(mask03_s); + mask0F = #VPBROADCAST_8u32(mask0F_s); + + rp = srp; + + for i=0 to KYBER_N/64 + { + f0 = buf[u256 i]; + + f1 = #VPSRL_16u16(f0, 1); + f0 = #VPAND_256(mask55, f0); + f1 = #VPAND_256(mask55, f1); + f0 = #VPADD_32u8(f0, f1); + + f1 = #VPSRL_16u16(f0, 2); + f0 = #VPAND_256(mask33, f0); + f1 = #VPAND_256(mask33, f1); + f0 = #VPADD_32u8(f0, mask33); + f0 = #VPSUB_32u8(f0, f1); + + f1 = #VPSRL_16u16(f0, 4); + f0 = #VPAND_256(mask0F, f0); + f1 = #VPAND_256(mask0F, f1); + f0 = #VPSUB_32u8(f0, mask03); + f1 = #VPSUB_32u8(f1, mask03); + + f2 = #VPUNPCKL_32u8(f0, f1); + f3 = #VPUNPCKH_32u8(f0, f1); + + t = (128u)f2; + f0 = #VPMOVSX_16u8_16u16(t); + t = #VEXTRACTI128(f2, 1); + f1 = #VPMOVSX_16u8_16u16(t); + t = (128u)f3; + f2 = #VPMOVSX_16u8_16u16(t); + t = #VEXTRACTI128(f3, 1); + f3 = #VPMOVSX_16u8_16u16(t); + rp[u256 4*i] = f0; + rp[u256 4*i + 1] = f2; + rp[u256 4*i + 2] = f1; + rp[u256 4*i + 3] = f3; + } + + return rp; +} +*/ + +inline +fn __shake256_squeezenblocks4x(reg ptr u256[25] state, reg ptr u8[NOISE_NBLOCKS * SHAKE256_RATE] buf0 buf1 buf2 buf3) -> reg ptr u256[25], reg ptr u8[NOISE_NBLOCKS*SHAKE256_RATE], reg ptr u8[NOISE_NBLOCKS*SHAKE256_RATE], reg ptr u8[NOISE_NBLOCKS*SHAKE256_RATE], reg ptr u8[NOISE_NBLOCKS*SHAKE256_RATE] +{ + inline int i; + + for i = 0 to NOISE_NBLOCKS + { + state, buf0[i*SHAKE256_RATE:SHAKE256_RATE], buf1[i*SHAKE256_RATE:SHAKE256_RATE], buf2[i*SHAKE256_RATE:SHAKE256_RATE], buf3[i*SHAKE256_RATE:SHAKE256_RATE] = __shake256_squeezeblock4x(state, buf0[i*SHAKE256_RATE:SHAKE256_RATE], buf1[i*SHAKE256_RATE:SHAKE256_RATE], buf2[i*SHAKE256_RATE:SHAKE256_RATE], 
buf3[i*SHAKE256_RATE:SHAKE256_RATE]); + } + + return state, buf0, buf1, buf2, buf3; +} + +#[returnaddress="stack"] +fn _poly_getnoise_eta1_4x(reg ptr u16[KYBER_N] r0 r1 r2 r3, reg ptr u8[KYBER_SYMBYTES] seed, reg u8 nonce) -> reg ptr u16[KYBER_N], reg ptr u16[KYBER_N], reg ptr u16[KYBER_N], reg ptr u16[KYBER_N] +{ + reg u256 f; + stack u256[25] state; + stack u8[NOISE_NBLOCKS * SHAKE256_RATE] buf0 buf1 buf2 buf3; + + f = seed[u256 0]; + buf0[u256 0] = f; + buf1[u256 0] = f; + buf2[u256 0] = f; + buf3[u256 0] = f; + + buf0.[32] = nonce; + nonce += 1; + buf1.[32] = nonce; + nonce += 1; + buf2.[32] = nonce; + nonce += 1; + buf3.[32] = nonce; + + state = _shake256_absorb4x_33(state, buf0[0:33], buf1[0:33], buf2[0:33], buf3[0:33]); + state, buf0, buf1, buf2, buf3 = __shake256_squeezenblocks4x(state, buf0, buf1, buf2, buf3); + + r0 = __poly_cbd_eta1(r0, buf0[0:KYBER_ETA1*KYBER_N/4+(KYBER_ETA1 - 2)*8]); + r1 = __poly_cbd_eta1(r1, buf1[0:KYBER_ETA1*KYBER_N/4+(KYBER_ETA1 - 2)*8]); + r2 = __poly_cbd_eta1(r2, buf2[0:KYBER_ETA1*KYBER_N/4+(KYBER_ETA1 - 2)*8]); + r3 = __poly_cbd_eta1(r3, buf3[0:KYBER_ETA1*KYBER_N/4+(KYBER_ETA1 - 2)*8]); + + return r0, r1, r2, r3; +} + +#[returnaddress="stack"] +fn _poly_getnoise_eta1122_4x(reg ptr u16[KYBER_N] r0 r1 r2 r3, reg ptr u8[KYBER_SYMBYTES] seed, reg u8 nonce) -> reg ptr u16[KYBER_N], reg ptr u16[KYBER_N], reg ptr u16[KYBER_N], reg ptr u16[KYBER_N] +{ + reg u256 f; + stack u256[25] state; + stack u8[NOISE_NBLOCKS * SHAKE256_RATE] buf0 buf1 buf2 buf3; + + f = seed[u256 0]; + buf0[u256 0] = f; + buf1[u256 0] = f; + buf2[u256 0] = f; + buf3[u256 0] = f; + + buf0.[32] = nonce; + nonce += 1; + buf1.[32] = nonce; + nonce += 1; + buf2.[32] = nonce; + nonce += 1; + buf3.[32] = nonce; + + state = _shake256_absorb4x_33(state, buf0[0:33], buf1[0:33], buf2[0:33], buf3[0:33]); + state, buf0, buf1, buf2, buf3 = __shake256_squeezenblocks4x(state, buf0, buf1, buf2, buf3); + + r0 = __poly_cbd_eta1(r0, buf0[0:KYBER_ETA1*KYBER_N/4+(KYBER_ETA1 - 2)*8]); + r1 = __poly_cbd_eta1(r1, buf1[0:KYBER_ETA1*KYBER_N/4+(KYBER_ETA1 - 2)*8]); + r2 = __poly_cbd_eta2(r2, buf2[0:KYBER_ETA2*KYBER_N/4]); + r3 = __poly_cbd_eta2(r3, buf3[0:KYBER_ETA2*KYBER_N/4]); + + return r0, r1, r2, r3; +} + + +inline +fn __invntt___butterfly64x(reg u256 rl0 rl1 rl2 rl3 rh0 rh1 rh2 rh3 zl0 zl1 zh0 zh1 qx16) + -> reg u256, reg u256, reg u256, reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 t0 t1 t2 t3; + + t0 = #VPSUB_16u16(rl0, rh0); + t1 = #VPSUB_16u16(rl1, rh1); + t2 = #VPSUB_16u16(rl2, rh2); + + rl0 = #VPADD_16u16(rh0, rl0); + rl1 = #VPADD_16u16(rh1, rl1); + rh0 = #VPMULL_16u16(zl0, t0); + + rl2 = #VPADD_16u16(rh2, rl2); + rh1 = #VPMULL_16u16(zl0, t1); + t3 = #VPSUB_16u16(rl3, rh3); + + rl3 = #VPADD_16u16(rh3, rl3); + rh2 = #VPMULL_16u16(zl1, t2); + rh3 = #VPMULL_16u16(zl1, t3); + + t0 = #VPMULH_16u16(zh0, t0); + t1 = #VPMULH_16u16(zh0, t1); + + t2 = #VPMULH_16u16(zh1, t2); + t3 = #VPMULH_16u16(zh1, t3); + + // Reduce + rh0 = #VPMULH_16u16(qx16, rh0); + rh1 = #VPMULH_16u16(qx16, rh1); + rh2 = #VPMULH_16u16(qx16, rh2); + rh3 = #VPMULH_16u16(qx16, rh3); + + rh0 = #VPSUB_16u16(t0, rh0); + rh1 = #VPSUB_16u16(t1, rh1); + rh2 = #VPSUB_16u16(t2, rh2); + rh3 = #VPSUB_16u16(t3, rh3); + + return rl0, rl1, rl2, rl3, rh0, rh1, rh2, rh3; +} + +fn _poly_invntt(reg ptr u16[KYBER_N] rp) -> reg ptr u16[KYBER_N] +{ + reg u256 zeta0 zeta1 zeta2 zeta3 r0 r1 r2 r3 r4 r5 r6 r7 qx16 vx16 flox16 fhix16; + reg ptr u16[400] zetasp; + reg ptr u16[16] qx16p; + inline int i; + + zetasp = jzetas_inv_exp; + qx16 = jqx16[u256 
0]; + + for i=0 to 2 + { + // level 0: + zeta0 = zetasp.[u256 0+392*i]; + zeta1 = zetasp.[u256 64+392*i]; + zeta2 = zetasp.[u256 32+392*i]; + zeta3 = zetasp.[u256 96+392*i]; + + r0 = rp.[u256 32*0+256*i]; + r1 = rp.[u256 32*1+256*i]; + r2 = rp.[u256 32*2+256*i]; + r3 = rp.[u256 32*3+256*i]; + r4 = rp.[u256 32*4+256*i]; + r5 = rp.[u256 32*5+256*i]; + r6 = rp.[u256 32*6+256*i]; + r7 = rp.[u256 32*7+256*i]; + + r0, r1, r4, r5, r2, r3, r6, r7 = __invntt___butterfly64x(r0, r1, r4, r5, r2, r3, r6, r7, zeta0, zeta1, zeta2, zeta3, qx16); + + // level 1: + vx16 = jvx16[u256 0]; + zeta0 = zetasp.[u256 128+392*i]; + zeta1 = zetasp.[u256 160+392*i]; + r0 = __red16x(r0, qx16, vx16); + r1 = __red16x(r1, qx16, vx16); + r4 = __red16x(r4, qx16, vx16); + r5 = __red16x(r5, qx16, vx16); + + r0, r1, r2, r3, r4, r5, r6, r7 = __invntt___butterfly64x(r0, r1, r2, r3, r4, r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + + r0, r1 = __shuffle1(r0, r1); + r2, r3 = __shuffle1(r2, r3); + r4, r5 = __shuffle1(r4, r5); + r6, r7 = __shuffle1(r6, r7); + + // level 2: + zeta0 = zetasp.[u256 192+392*i]; + zeta1 = zetasp.[u256 224+392*i]; + + + r0, r2, r4, r6, r1, r3, r5, r7 = __invntt___butterfly64x(r0, r2, r4, r6, r1, r3, r5, r7, zeta0, zeta0, zeta1, zeta1, qx16); + + r0 = __red16x(r0, qx16, vx16); + + r0, r2 = __shuffle2(r0, r2); + r4, r6 = __shuffle2(r4, r6); + r1, r3 = __shuffle2(r1, r3); + r5, r7 = __shuffle2(r5, r7); + + // level 3: + zeta0 = zetasp.[u256 256+392*i]; + zeta1 = zetasp.[u256 288+392*i]; + + r0, r4, r1, r5, r2, r6, r3, r7 = __invntt___butterfly64x(r0, r4, r1, r5, r2, r6, r3, r7, zeta0, zeta0, zeta1, zeta1, qx16); + + r0 = __red16x(r0, qx16, vx16); + + r0, r4 = __shuffle4(r0, r4); + r1, r5 = __shuffle4(r1, r5); + r2, r6 = __shuffle4(r2, r6); + r3, r7 = __shuffle4(r3, r7); + + // level 4: + zeta0 = zetasp.[u256 320+392*i]; + zeta1 = zetasp.[u256 352+392*i]; + + r0, r1, r2, r3, r4, r5, r6, r7 = __invntt___butterfly64x(r0, r1, r2, r3, r4, r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + + r0 = __red16x(r0, qx16, vx16); + + r0, r1 = __shuffle8(r0, r1); + r2, r3 = __shuffle8(r2, r3); + r4, r5 = __shuffle8(r4, r5); + r6, r7 = __shuffle8(r6, r7); + + // level 5: + zeta0 = #VPBROADCAST_8u32(zetasp.[u32 384+392*i]); + zeta1 = #VPBROADCAST_8u32(zetasp.[u32 388+392*i]); + + r0, r2, r4, r6, r1, r3, r5, r7 = __invntt___butterfly64x(r0, r2, r4, r6, r1, r3, r5, r7, zeta0, zeta0, zeta1, zeta1, qx16); + + r0 = __red16x(r0, qx16, vx16); + + if (i==0) { + rp.[u256 32*0+256*i] = r0; + rp.[u256 32*1+256*i] = r2; + rp.[u256 32*2+256*i] = r4; + rp.[u256 32*3+256*i] = r6; + } + rp.[u256 32*4+256*i] = r1; + rp.[u256 32*5+256*i] = r3; + rp.[u256 32*6+256*i] = r5; + rp.[u256 32*7+256*i] = r7; + } + + zeta0 = #VPBROADCAST_8u32(zetasp.[u32 784]); + zeta1 = #VPBROADCAST_8u32(zetasp.[u32 788]); + + for i=0 to 2 + { + if (i == 0) { + r7 = r6; + r6 = r4; + r5 = r2; + r4 = r0; + } else { + r4 = rp.[u256 32*8+128*i]; + r5 = rp.[u256 32*9+128*i]; + r6 = rp.[u256 32*10+128*i]; + r7 = rp.[u256 32*11+128*i]; + } + r0 = rp.[u256 32*0+128*i]; + r1 = rp.[u256 32*1+128*i]; + r2 = rp.[u256 32*2+128*i]; + r3 = rp.[u256 32*3+128*i]; + + r0, r1, r2, r3, r4, r5, r6, r7 = __invntt___butterfly64x(r0, r1, r2, r3, r4, r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + + flox16 = jflox16[u256 0]; + fhix16 = jfhix16[u256 0]; + + rp.[u256 32*8+128*i] = r4; + rp.[u256 32*9+128*i] = r5; + rp.[u256 32*10+128*i] = r6; + rp.[u256 32*11+128*i] = r7; + + r0 = __fqmulprecomp16x(r0, flox16, fhix16, qx16); + r1 = __fqmulprecomp16x(r1, flox16, fhix16, qx16); + r2 = 
__fqmulprecomp16x(r2, flox16, fhix16, qx16); + r3 = __fqmulprecomp16x(r3, flox16, fhix16, qx16); + + rp.[u256 32*0+128*i] = r0; + rp.[u256 32*1+128*i] = r1; + rp.[u256 32*2+128*i] = r2; + rp.[u256 32*3+128*i] = r3; + } + + return rp; +} + +inline +fn __butterfly64x(reg u256 rl0 rl1 rl2 rl3 rh0 rh1 rh2 rh3 zl0 zl1 zh0 zh1 qx16) + -> reg u256, reg u256, reg u256, reg u256, reg u256, reg u256, reg u256, reg u256 +{ + reg u256 t0 t1 t2 t3 t4 t5 t6 t7; + + t0 = #VPMULL_16u16(zl0, rh0); + t1 = #VPMULH_16u16(zh0, rh0); + t2 = #VPMULL_16u16(zl0, rh1); + t3 = #VPMULH_16u16(zh0, rh1); + t4 = #VPMULL_16u16(zl1, rh2); + t5 = #VPMULH_16u16(zh1, rh2); + t6 = #VPMULL_16u16(zl1, rh3); + t7 = #VPMULH_16u16(zh1, rh3); + + t0 = #VPMULH_16u16(t0, qx16); + t2 = #VPMULH_16u16(t2, qx16); + t4 = #VPMULH_16u16(t4, qx16); + t6 = #VPMULH_16u16(t6, qx16); + + //rh1 = #VPSUB_16u16(t3, rl1); + rh1 = #VPSUB_16u16(rl1, t3); + rl1 = #VPADD_16u16(t3, rl1); + //rh0 = #VPSUB_16u16(t1, rl0); + rh0 = #VPSUB_16u16(rl0, t1); + rl0 = #VPADD_16u16(t1, rl0); + //rh3 = #VPSUB_16u16(t7, rl3); + rh3 = #VPSUB_16u16(rl3, t7); + rl3 = #VPADD_16u16(t7, rl3); + //rh2 = #VPSUB_16u16(t5, rl2); + rh2 = #VPSUB_16u16(rl2, t5); + rl2 = #VPADD_16u16(t5, rl2); + + rh0 = #VPADD_16u16(t0, rh0); + //rl0 = #VPSUB_16u16(t0, rl0); + rl0 = #VPSUB_16u16(rl0, t0); + rh1 = #VPADD_16u16(t2, rh1); + //rl1 = #VPSUB_16u16(t2, rl1); + rl1 = #VPSUB_16u16(rl1, t2); + rh2 = #VPADD_16u16(t4, rh2); + //rl2 = #VPSUB_16u16(t4, rl2); + rl2 = #VPSUB_16u16(rl2, t4); + rh3 = #VPADD_16u16(t6, rh3); + //rl3 = #VPSUB_16u16(t6, rl3); + rl3 = #VPSUB_16u16(rl3, t6); + + return rl0, rl1, rl2, rl3, rh0, rh1, rh2, rh3; +} + +fn _poly_ntt(reg ptr u16[KYBER_N] rp) -> reg ptr u16[KYBER_N] +{ + reg u256 zeta0 zeta1 zeta2 zeta3 r0 r1 r2 r3 r4 r5 r6 r7 qx16 vx16; + reg u32 t; + reg u16 w; + reg ptr u16[400] zetasp; + inline int i; + + zetasp = jzetas_exp; + qx16 = jqx16[u256 0]; + + zeta0 = #VPBROADCAST_8u32(zetasp[u32 0]); + zeta1 = #VPBROADCAST_8u32(zetasp[u32 1]); + + r0 = rp.[u256 32*0]; + r1 = rp.[u256 32*1]; + r2 = rp.[u256 32*2]; + r3 = rp.[u256 32*3]; + r4 = rp.[u256 32*8]; + r5 = rp.[u256 32*9]; + r6 = rp.[u256 32*10]; + r7 = rp.[u256 32*11]; + + r0, r1, r2, r3, r4, r5, r6, r7 = __butterfly64x(r0, r1, r2, r3, r4, r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + + rp.[u256 32*0] = r0; + rp.[u256 32*1] = r1; + rp.[u256 32*2] = r2; + rp.[u256 32*3] = r3; + rp.[u256 32*8] = r4; + rp.[u256 32*9] = r5; + rp.[u256 32*10] = r6; + rp.[u256 32*11] = r7; + + r0 = rp.[u256 32*4]; + r1 = rp.[u256 32*5]; + r2 = rp.[u256 32*6]; + r3 = rp.[u256 32*7]; + r4 = rp.[u256 32*12]; + r5 = rp.[u256 32*13]; + r6 = rp.[u256 32*14]; + r7 = rp.[u256 32*15]; + + r0, r1, r2, r3, r4, r5, r6, r7 = __butterfly64x(r0, r1, r2, r3, r4, r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + + /* + rp.[u256 32*4] = r0; + rp.[u256 32*5] = r1; + rp.[u256 32*6] = r2; + rp.[u256 32*7] = r3; + */ + rp.[u256 32*12] = r4; + rp.[u256 32*13] = r5; + rp.[u256 32*14] = r6; + rp.[u256 32*15] = r7; + + for i=0 to 2 { + + // level 1 + zeta0 = #VPBROADCAST_8u32(zetasp.[u32 8 + 392*i]); + zeta1 = #VPBROADCAST_8u32(zetasp.[u32 12 + 392*i]); + + if ( i == 0) { + r4 = r0; + r5 = r1; + r6 = r2; + r7 = r3; + } else { + r4 = rp.[u256 32*4+256*i]; + r5 = rp.[u256 32*5+256*i]; + r6 = rp.[u256 32*6+256*i]; + r7 = rp.[u256 32*7+256*i]; + } + r0 = rp.[u256 32*0+256*i]; + r1 = rp.[u256 32*1+256*i]; + r2 = rp.[u256 32*2+256*i]; + r3 = rp.[u256 32*3+256*i]; + + r0, r1, r2, r3, r4, r5, r6, r7 = __butterfly64x(r0, r1, r2, r3, r4, r5, r6, r7, zeta0, 
zeta0, zeta1, zeta1, qx16); + + // level 2 + zeta0 = zetasp.[u256 16 + 392*i]; + zeta1 = zetasp.[u256 48 + 392*i]; + + r0, r4 = __shuffle8(r0, r4); + r1, r5 = __shuffle8(r1, r5); + r2, r6 = __shuffle8(r2, r6); + r3, r7 = __shuffle8(r3, r7); + + r0, r4, r1, r5, r2, r6, r3, r7 = __butterfly64x(r0, r4, r1, r5, r2, r6, r3, r7, zeta0, zeta0, zeta1, zeta1, qx16); + + // level 3 + zeta0 = zetasp.[u256 80 + 392*i]; + zeta1 = zetasp.[u256 112 + 392*i]; + + r0, r2 = __shuffle4(r0, r2); + r4, r6 = __shuffle4(r4, r6); + r1, r3 = __shuffle4(r1, r3); + r5, r7 = __shuffle4(r5, r7); + + r0, r2, r4, r6, r1, r3, r5, r7 = __butterfly64x(r0, r2, r4, r6, r1, r3, r5, r7, zeta0, zeta0, zeta1, zeta1, qx16); + + // level 4 + zeta0 = zetasp.[u256 144 + 392*i]; + zeta1 = zetasp.[u256 176 + 392*i]; + + r0, r1 = __shuffle2(r0, r1); + r2, r3 = __shuffle2(r2, r3); + r4, r5 = __shuffle2(r4, r5); + r6, r7 = __shuffle2(r6, r7); + + r0, r1, r2, r3, r4, r5, r6, r7 = __butterfly64x(r0, r1, r2, r3, r4, r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); + + // level 5 + zeta0 = zetasp.[u256 208 + 392*i]; + zeta1 = zetasp.[u256 240 + 392*i]; + + r0, r4 = __shuffle1(r0, r4); + r1, r5 = __shuffle1(r1, r5); + r2, r6 = __shuffle1(r2, r6); + r3, r7 = __shuffle1(r3, r7); + + r0, r4, r1, r5, r2, r6, r3, r7 = __butterfly64x(r0, r4, r1, r5, r2, r6, r3, r7, zeta0, zeta0, zeta1, zeta1, qx16); + + // level 6 + zeta0 = zetasp.[u256 272 + 392*i]; + zeta2 = zetasp.[u256 304 + 392*i]; + zeta1 = zetasp.[u256 336 + 392*i]; + zeta3 = zetasp.[u256 368 + 392*i]; + + r0, r4, r2, r6, r1, r5, r3, r7 = __butterfly64x(r0, r4, r2, r6, r1, r5, r3, r7, zeta0, zeta1, zeta2, zeta3, qx16); + + vx16 = jvx16[u256 0]; + + r0 = __red16x(r0, qx16, vx16); + r4 = __red16x(r4, qx16, vx16); + r2 = __red16x(r2, qx16, vx16); + r6 = __red16x(r6, qx16, vx16); + r1 = __red16x(r1, qx16, vx16); + r5 = __red16x(r5, qx16, vx16); + r3 = __red16x(r3, qx16, vx16); + r7 = __red16x(r7, qx16, vx16); + + rp.[u256 32*0+256*i] = r0; + rp.[u256 32*1+256*i] = r4; + rp.[u256 32*2+256*i] = r1; + rp.[u256 32*3+256*i] = r5; + rp.[u256 32*4+256*i] = r2; + rp.[u256 32*5+256*i] = r6; + rp.[u256 32*6+256*i] = r3; + rp.[u256 32*7+256*i] = r7; + } + + return rp; +} + +inline +fn __poly_reduce(reg ptr u16[KYBER_N] rp) -> reg ptr u16[KYBER_N] +{ + inline int i; + reg u256 r qx16 vx16; + + qx16 = jqx16[u256 0]; + vx16 = jvx16[u256 0]; + + for i=0 to 16 + { + r = rp.[u256 32*i]; + r = __red16x(r, qx16, vx16); + rp.[u256 32*i] = r; + } + return rp; +} + +fn _poly_sub(reg ptr u16[KYBER_N] rp ap bp) -> reg ptr u16[KYBER_N] +{ + inline int i; + reg u256 a; + reg u256 b; + reg u256 r; + + for i = 0 to 16 { + a = ap.[u256 32*i]; + b = bp.[u256 32*i]; + r = #VPSUB_16u16(a, b); + rp.[u256 32*i] = r; + } + + return rp; +} + +fn _poly_tobytes(reg u64 rp, reg ptr u16[KYBER_N] a) -> reg ptr u16[KYBER_N] +{ + inline int i; + reg u256 t0 t1 t2 t3 t4 t5 t6 t7 qx16 tt ttt; + reg ptr u16[16] jqx16_p; + + jqx16_p = jqx16; + qx16 = jqx16_p[u256 0]; + + a = _poly_csubq(a); + + for i = 0 to 2 + { + t0 = a[u256 8*i]; + t1 = a[u256 8*i + 1]; + t2 = a[u256 8*i + 2]; + t3 = a[u256 8*i + 3]; + t4 = a[u256 8*i + 4]; + t5 = a[u256 8*i + 5]; + t6 = a[u256 8*i + 6]; + t7 = a[u256 8*i + 7]; + + tt = #VPSLL_16u16(t1, 12); + tt |= t0; + + t0 = #VPSRL_16u16(t1, 4); + t1 = #VPSLL_16u16(t2, 8); + t0 |= t1; + + t1 = #VPSRL_16u16(t2, 8); + t2 = #VPSLL_16u16(t3, 4); + t1 |= t2; + + t2 = #VPSLL_16u16(t5, 12); + t2 |= t4; + + t3 = #VPSRL_16u16(t5, 4); + t4 = #VPSLL_16u16(t6, 8); + t3 |= t4; + + t4 = #VPSRL_16u16(t6, 8); + t5 = #VPSLL_16u16(t7, 
4);
+ t4 |= t5;
+
+ ttt, t0 = __shuffle1(tt, t0);
+ tt, t2 = __shuffle1(t1, t2);
+ t1, t4 = __shuffle1(t3, t4);
+
+ t3, tt = __shuffle2(ttt, tt);
+ ttt, t0 = __shuffle2(t1, t0);
+ t1, t4 = __shuffle2(t2, t4);
+
+ t2, ttt = __shuffle4(t3, ttt);
+ t3, tt = __shuffle4(t1, tt);
+ t1, t4 = __shuffle4(t0, t4);
+
+ t0, t3 = __shuffle8(t2, t3);
+ t2, ttt = __shuffle8(t1, ttt);
+ t1, t4 = __shuffle8(tt, t4);
+
+ (u256)[rp + 192*i] = t0;
+ (u256)[rp + 192*i + 32] = t2;
+ (u256)[rp + 192*i + 64] = t1;
+ (u256)[rp + 192*i + 96] = t3;
+ (u256)[rp + 192*i + 128] = ttt;
+ (u256)[rp + 192*i + 160] = t4;
+ }
+
+ return a;
+}
+
+fn _poly_tomsg(reg u64 rp, reg ptr u16[KYBER_N] a) -> reg ptr u16[KYBER_N]
+{
+ inline int i;
+ reg u256 f0 f1 g0 g1 hq hhq;
+ reg ptr u16[16] px16;
+ reg u32 c;
+
+ a = _poly_csubq(a);
+
+ px16 = hqx16_m1;
+ hq = px16[u256 0];
+
+ px16 = hhqx16;
+ hhq = px16[u256 0];
+
+ for i=0 to KYBER_N/32
+ {
+ f0 = a[u256 2*i];
+ f1 = a[u256 2*i + 1];
+ f0 = #VPSUB_16u16(hq, f0);
+ f1 = #VPSUB_16u16(hq, f1);
+ g0 = #VPSRA_16u16(f0, 15);
+ g1 = #VPSRA_16u16(f1, 15);
+ f0 = #VPXOR_256(f0, g0);
+ f1 = #VPXOR_256(f1, g1);
+ f0 = #VPSUB_16u16(f0, hhq);
+ f1 = #VPSUB_16u16(f1, hhq);
+ f0 = #VPACKSS_16u16(f0, f1);
+ f0 = #VPERMQ(f0, 0xD8);
+ c = #VPMOVMSKB_u256u32(f0);
+ (u32)[rp+4*i] = c;
+ }
+ return a;
+}
+
+fn _poly_tomsg_1(reg ptr u8[KYBER_INDCPA_MSGBYTES] rp, reg ptr u16[KYBER_N] a) -> reg ptr u8[KYBER_INDCPA_MSGBYTES], reg ptr u16[KYBER_N]
+{
+ inline int i;
+ reg u256 f0 f1 g0 g1 hq hhq;
+ reg ptr u16[16] px16;
+ reg u32 c;
+
+ a = _poly_csubq(a);
+
+ px16 = hqx16_m1;
+ hq = px16[u256 0];
+
+ px16 = hhqx16;
+ hhq = px16[u256 0];
+
+ for i=0 to KYBER_N/32
+ {
+ f0 = a[u256 2*i];
+ f1 = a[u256 2*i + 1];
+ f0 = #VPSUB_16u16(hq, f0);
+ f1 = #VPSUB_16u16(hq, f1);
+ g0 = #VPSRA_16u16(f0, 15);
+ g1 = #VPSRA_16u16(f1, 15);
+ f0 = #VPXOR_256(f0, g0);
+ f1 = #VPXOR_256(f1, g1);
+ f0 = #VPSUB_16u16(f0, hhq);
+ f1 = #VPSUB_16u16(f1, hhq);
+ f0 = #VPACKSS_16u16(f0, f1);
+ f0 = #VPERMQ(f0, 0xD8);
+ c = #VPMOVMSKB_u256u32(f0);
+ rp[u32 i] = c;
+ }
+ return rp, a;
+}
diff --git a/code/jasmin/mlkem_avx2/poly_ntt.c b/code/jasmin/mlkem_avx2/poly_ntt.c
new file mode 100644
index 00000000..83341812
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/poly_ntt.c
@@ -0,0 +1,10 @@
+#include "poly.h"
+#include "ntt.h"
+
+void poly_ntt_jazz(poly *r, int16_t *zetas)
+{
+ ntt(r->coeffs);
+ poly_reduce(r);
+}
+
+
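For orientation, the scalar butterfly that __butterfly64x above applies to 64 coefficients at a time; a minimal sketch in the shape of the reference C ntt, assuming fqmul wraps the montgomery_reduce from reduce.c in this patch (the zeta tables are premultiplied by R = 2^16):

#include <stdint.h>
#include "params.h"
#include "reduce.h"

static int16_t fqmul(int16_t a, int16_t b) {
  return montgomery_reduce((int32_t)a * b);  /* a*b*R^-1 mod q, R = 2^16 */
}

/* one forward-NTT layer; the AVX2 code fuses several such layers per load
   and keeps the zetas duplicated/interleaved for SIMD */
static void ntt_layer(int16_t r[KYBER_N], int len, const int16_t *zetas, int *k) {
  for(int start = 0; start < KYBER_N; start += 2*len) {
    int16_t zeta = zetas[(*k)++];
    for(int j = start; j < start + len; j++) {
      int16_t t = fqmul(zeta, r[j + len]);  /* twist the upper coefficient */
      r[j + len] = r[j] - t;                /* butterfly */
      r[j]       = r[j] + t;
    }
  }
}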
diff --git a/code/jasmin/mlkem_avx2/polyvec.c b/code/jasmin/mlkem_avx2/polyvec.c
new file mode 100644
index 00000000..316543a0
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/polyvec.c
@@ -0,0 +1,237 @@
+#include <stdint.h>
+#include "polyvec.h"
+#include "poly.h"
+
+/*************************************************
+* Name:        polyvec_compress
+*
+* Description: Compress and serialize vector of polynomials
+*
+* Arguments:   - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
+*              - const polyvec *a: pointer to input vector of polynomials
+**************************************************/
+void polyvec_compress(unsigned char *r, polyvec *a)
+{
+  int i,j,k;
+
+  polyvec_csubq(a);
+
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  uint16_t t[8];
+  for(i=0;i<KYBER_K;i++)
+  {
+    for(j=0;j<KYBER_N/8;j++)
+    {
+      for(k=0;k<8;k++)
+        t[k] = ((((uint32_t)a->vec[i].coeffs[8*j+k] << 11) + KYBER_Q/2) / KYBER_Q) & 0x7ff;
+
+      r[11*j+ 0] =  t[0] & 0xff;
+      r[11*j+ 1] = (t[0] >>  8) | ((t[1] & 0x1f) << 3);
+      r[11*j+ 2] = (t[1] >>  5) | ((t[2] & 0x03) << 6);
+      r[11*j+ 3] = (t[2] >>  2) & 0xff;
+      r[11*j+ 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1);
+      r[11*j+ 5] = (t[3] >>  7) | ((t[4] & 0x0f) << 4);
+      r[11*j+ 6] = (t[4] >>  4) | ((t[5] & 0x01) << 7);
+      r[11*j+ 7] = (t[5] >>  1) & 0xff;
+      r[11*j+ 8] = (t[5] >>  9) | ((t[6] & 0x3f) << 2);
+      r[11*j+ 9] = (t[6] >>  6) | ((t[7] & 0x07) << 5);
+      r[11*j+10] = (t[7] >>  3);
+    }
+    r += 352;
+  }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+  uint16_t t[4];
+  for(i=0;i<KYBER_K;i++)
+  {
+    for(j=0;j<KYBER_N/4;j++)
+    {
+      for(k=0;k<4;k++)
+        t[k] = ((((uint32_t)a->vec[i].coeffs[4*j+k] << 10) + KYBER_Q/2) / KYBER_Q) & 0x3ff;
+
+      r[5*j+ 0] =  t[0] & 0xff;
+      r[5*j+ 1] = (t[0] >> 8) | ((t[1] & 0x3f) << 2);
+      r[5*j+ 2] = (t[1] >> 6) | ((t[2] & 0x0f) << 4);
+      r[5*j+ 3] = (t[2] >> 4) | ((t[3] & 0x03) << 6);
+      r[5*j+ 4] = (t[3] >> 2);
+    }
+    r += 320;
+  }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+}
+
+/*************************************************
+* Name:        polyvec_decompress
+*
+* Description: De-serialize and decompress vector of polynomials;
+*              approximate inverse of polyvec_compress
+*
+* Arguments:   - polyvec *r:       pointer to output vector of polynomials
+*              - unsigned char *a: pointer to input byte array (of length KYBER_POLYVECCOMPRESSEDBYTES)
+**************************************************/
+void polyvec_decompress(polyvec *r, const unsigned char *a)
+{
+  int i,j;
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  for(i=0;i<KYBER_K;i++)
+  {
+    for(j=0;j<KYBER_N/8;j++)
+    {
+      r->vec[i].coeffs[8*j+0] = (((a[11*j+ 0] | (((uint32_t)a[11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+1] = ((((a[11*j+ 1] >> 3) | (((uint32_t)a[11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+2] = ((((a[11*j+ 2] >> 6) | (((uint32_t)a[11*j+ 3] & 0xff) << 2) | (((uint32_t)a[11*j+ 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+3] = ((((a[11*j+ 4] >> 1) | (((uint32_t)a[11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+4] = ((((a[11*j+ 5] >> 4) | (((uint32_t)a[11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+5] = ((((a[11*j+ 6] >> 7) | (((uint32_t)a[11*j+ 7] & 0xff) << 1) | (((uint32_t)a[11*j+ 8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+6] = ((((a[11*j+ 8] >> 2) | (((uint32_t)a[11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+7] = ((((a[11*j+ 9] >> 5) | (((uint32_t)a[11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11;
+    }
+    a += 352;
+  }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+  for(i=0;i<KYBER_K;i++)
+  {
+    for(j=0;j<KYBER_N/4;j++)
+    {
+      r->vec[i].coeffs[4*j+0] = (((a[5*j+ 0] | (((uint32_t)a[5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10;
+      r->vec[i].coeffs[4*j+1] = ((((a[5*j+ 1] >> 2) | (((uint32_t)a[5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10;
+      r->vec[i].coeffs[4*j+2] = ((((a[5*j+ 2] >> 4) | (((uint32_t)a[5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10;
+      r->vec[i].coeffs[4*j+3] = ((((a[5*j+ 3] >> 6) | (((uint32_t)a[5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10;
+    }
+    a += 320;
+  }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+}
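As a quick sanity check on the 10-bit branch just shown, one coefficient through compress and back (constants exactly as in the code above):

#include <stdint.h>
#include <stdio.h>

#define KYBER_Q 3329

int main(void) {
  uint16_t x = 1234;  /* coefficient in [0, KYBER_Q) */
  uint16_t c = ((((uint32_t)x << 10) + KYBER_Q/2) / KYBER_Q) & 0x3ff; /* compress */
  uint16_t y = (((uint32_t)c * KYBER_Q) + 512) >> 10;                 /* decompress */
  printf("%u -> %u -> %u\n", x, c, y);  /* prints 1234 -> 380 -> 1235 */
  return 0;
}

The round-trip error stays within a couple of units, which is what the scheme's decryption-noise budget accounts for.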
+
+/*************************************************
+* Name:        polyvec_tobytes
+*
+* Description: Serialize vector of polynomials
+*
+* Arguments:   - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYVECBYTES)
+*              - const polyvec *a: pointer to input vector of polynomials
+**************************************************/
+void polyvec_tobytes(unsigned char *r, polyvec *a)
+{
+  int i;
+  for(i=0;i<KYBER_K;i++)
+    poly_tobytes(r+i*KYBER_POLYBYTES, &a->vec[i]);
+}
+
+/*************************************************
+* Name:        polyvec_frombytes
+*
+* Description: De-serialize vector of polynomials;
+*              inverse of polyvec_tobytes
+*
+* Arguments:   - polyvec *r:             pointer to output vector of polynomials
+*              - const unsigned char *a: pointer to input byte array (of length KYBER_POLYVECBYTES)
+**************************************************/
+void polyvec_frombytes(polyvec *r, const unsigned char *a)
+{
+  int i;
+  for(i=0;i<KYBER_K;i++)
+    poly_frombytes(&r->vec[i], a+i*KYBER_POLYBYTES);
+}
+
+/*************************************************
+* Name:        polyvec_ntt
+*
+* Description: Apply forward NTT to all elements of a vector of polynomials
+*
+* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+**************************************************/
+void polyvec_ntt(polyvec *r)
+{
+  int i;
+  for(i=0;i<KYBER_K;i++)
+    poly_ntt(&r->vec[i]);
+}
+
+/*************************************************
+* Name:        polyvec_invntt
+*
+* Description: Apply inverse NTT to all elements of a vector of polynomials
+*
+* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+**************************************************/
+void polyvec_invntt(polyvec *r)
+{
+  int i;
+  for(i=0;i<KYBER_K;i++)
+    poly_invntt(&r->vec[i]);
+}
+
+/*************************************************
+* Name:        polyvec_pointwise_acc
+*
+* Description: Pointwise multiply elements of a and b and accumulate into r
+*
+* Arguments:   - poly *r:          pointer to output polynomial
+*              - const polyvec *a: pointer to first input vector of polynomials
+*              - const polyvec *b: pointer to second input vector of polynomials
+**************************************************/
+void polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b)
+{
+  int i;
+  poly t;
+
+  poly_basemul(r, &a->vec[0], &b->vec[0]);
+  for(i=1;i<KYBER_K;i++)
+  {
+    poly_basemul(&t, &a->vec[i], &b->vec[i]);
+    poly_add(r, r, &t);
+  }
+
+  poly_reduce(r);
+}
+
+/*************************************************
+* Name:        polyvec_reduce
+*
+* Description: Applies Barrett reduction to each coefficient
+*              of each element of a vector of polynomials
+*              for details of the Barrett reduction see comments in reduce.c
+*
+* Arguments:   - polyvec *r: pointer to input/output vector of polynomials
+**************************************************/
+void polyvec_reduce(polyvec *r)
+{
+  int i;
+  for(i=0;i<KYBER_K;i++)
+    poly_reduce(&r->vec[i]);
+}
+
+/*************************************************
+* Name:        polyvec_csubq
+*
+* Description: Applies conditional subtraction of q to each coefficient
+*              of each element of a vector of polynomials
+*              for details of conditional subtraction of q see comments in reduce.c
+*
+* Arguments:   - polyvec *r: pointer to input/output vector of polynomials
+**************************************************/
+void polyvec_csubq(polyvec *r)
+{
+  int i;
+  for(i=0;i<KYBER_K;i++)
+    poly_csubq(&r->vec[i]);
+}
+
+/*************************************************
+* Name:        polyvec_add
+*
+* Description: Add vectors of polynomials
+*
+* Arguments:   - polyvec *r:       pointer to output vector of polynomials
+*              - const polyvec *a: pointer to first input vector of polynomials
+*              - const polyvec *b: pointer to second input vector of polynomials
+**************************************************/
+void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b)
+{
+  int i;
+  for(i=0;i<KYBER_K;i++)
+    poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
+}
diff --git a/code/jasmin/mlkem_avx2/polyvec.h b/code/jasmin/mlkem_avx2/polyvec.h
new file mode 100644
index 00000000..9fbdb673
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/polyvec.h
@@ -0,0 +1,47 @@
+#ifndef POLYVEC_H
+#define POLYVEC_H
+
+#include "params.h"
+#include "poly.h"
+
+typedef struct{
+  poly vec[KYBER_K];
+} polyvec;
+
+void polyvec_compress(unsigned char *r, polyvec *a);
+void polyvec_decompress(polyvec *r, const unsigned char *a);
+
+void polyvec_tobytes(unsigned char *r, polyvec *a);
+void polyvec_frombytes(polyvec *r, const unsigned char *a);
+
+void
polyvec_ntt(polyvec *r); +void polyvec_invntt(polyvec *r); + +void polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); + +void polyvec_reduce(polyvec *r); +void polyvec_csubq(polyvec *r); + +void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); + + + + +void polyvec_compress_jazz(unsigned char *r, polyvec *a); +void polyvec_decompress_jazz(polyvec *r, const unsigned char *a); + +void polyvec_tobytes_jazz(unsigned char *r, polyvec *a); +void polyvec_frombytes_jazz(polyvec *r, const unsigned char *a); + +void polyvec_ntt_jazz(polyvec *r); +void polyvec_invntt_jazz(polyvec *r); + +void polyvec_pointwise_acc_jazz(poly *r, const polyvec *a, const polyvec *b); + +void polyvec_reduce_jazz(polyvec *r); +void polyvec_csubq_jazz(polyvec *r); + +void polyvec_add2_jazz(polyvec *r, const polyvec *b); + + +#endif diff --git a/code/jasmin/mlkem_avx2/polyvec.jinc b/code/jasmin/mlkem_avx2/polyvec.jinc new file mode 100644 index 00000000..d68af2ad --- /dev/null +++ b/code/jasmin/mlkem_avx2/polyvec.jinc @@ -0,0 +1,241 @@ +require "params.jinc" +require "poly.jinc" +require "shuffle.jinc" + +inline +fn __polyvec_add2(stack u16[KYBER_VECN] r, stack u16[KYBER_VECN] b) -> stack u16[KYBER_VECN] +{ + r[0:KYBER_N] = _poly_add2(r[0:KYBER_N], b[0:KYBER_N]); + r[KYBER_N:KYBER_N] = _poly_add2(r[KYBER_N:KYBER_N], b[KYBER_N:KYBER_N]); + r[2*KYBER_N:KYBER_N] = _poly_add2(r[2*KYBER_N:KYBER_N], b[2*KYBER_N:KYBER_N]); + + return r; +} + +inline +fn __polyvec_csubq(stack u16[KYBER_VECN] r) -> stack u16[KYBER_VECN] +{ + r[0:KYBER_N] = _poly_csubq(r[0:KYBER_N]); + r[KYBER_N:KYBER_N] = _poly_csubq(r[KYBER_N:KYBER_N]); + r[2*KYBER_N:KYBER_N] = _poly_csubq(r[2*KYBER_N:KYBER_N]); + + return r; +} + +u32 pvd_q_s = 0x0d013404; +u8[32] pvd_shufbdidx_s = {0, 1, 1, 2, 2, 3, 3, 4, + 5, 6, 6, 7, 7, 8, 8, 9, + 2, 3, 3, 4, 4, 5, 5, 6, + 7, 8, 8, 9, 9, 10, 10, 11}; +u64 pvd_sllvdidx_s = 0x04; +u32 pvd_mask_s = 0x7fe01ff8; + +inline +fn __polyvec_decompress(reg u64 rp) -> stack u16[KYBER_VECN] +{ + inline int i k; + reg u256 f q shufbidx sllvdidx mask; + stack u16[KYBER_VECN] r; + + q = #VPBROADCAST_8u32(pvd_q_s); + shufbidx = pvd_shufbdidx_s[u256 0]; + sllvdidx = #VPBROADCAST_4u64(pvd_sllvdidx_s); + mask = #VPBROADCAST_8u32(pvd_mask_s); + + for k=0 to KYBER_K + { + for i=0 to KYBER_N/16 + { + f = (u256)[rp + 320 * k + 20 * i]; + f = #VPERMQ(f, 0x94); + f = #VPSHUFB_256(f, shufbidx); + f = #VPSLLV_8u32(f, sllvdidx); + f = #VPSRL_16u16(f, 1); + f = #VPAND_256(f, mask); + f = #VPMULHRS_16u16(f, q); + r[u256 16*k + i] = f; + } + } + + return r; +} + +u16 pvc_off_s = 0x0f; +u16 pvc_shift1_s = 0x1000; +u16 pvc_mask_s = 0x03ff; +u64 pvc_shift2_s = 0x0400000104000001; +u64 pvc_sllvdidx_s = 0x0C; +u8[32] pvc_shufbidx_s = {0, 1, 2, 3, 4, 8, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, + 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 8}; + +inline +fn __polyvec_compress(reg u64 rp, stack u16[KYBER_VECN] a) +{ + inline int i; + reg u256 f0 f1 f2 v v8 off shift1 mask shift2 sllvdidx shufbidx; + reg u128 t0 t1; + reg ptr u16[16] x16p; + + a = __polyvec_csubq(a); + + x16p = jvx16; + v = x16p[u256 0]; + v8 = #VPSLL_16u16(v, 3); + off = #VPBROADCAST_16u16(pvc_off_s); + shift1 = #VPBROADCAST_16u16(pvc_shift1_s); + mask = #VPBROADCAST_16u16(pvc_mask_s); + shift2 = #VPBROADCAST_4u64(pvc_shift2_s); + sllvdidx = #VPBROADCAST_4u64(pvc_sllvdidx_s); + shufbidx = pvc_shufbidx_s[u256 0]; + + for i=0 to KYBER_VECN/16 + { + f0 = a[u256 i]; + f1 = #VPMULL_16u16(f0, v8); + f2 = #VPADD_16u16(f0, off); + f0 = #VPSLL_16u16(f0, 3); + f0 = 
#VPMULH_16u16(f0, v); + f2 = #VPSUB_16u16(f1, f2); + f1 = #VPANDN_256(f1, f2); + f1 = #VPSRL_16u16(f1, 15); + f0 = #VPSUB_16u16(f0, f1); + f0 = #VPMULHRS_16u16(f0, shift1); + f0 = #VPAND_256(f0, mask); + f0 = #VPMADDWD_256(f0, shift2); + f0 = #VPSLLV_8u32(f0, sllvdidx); + f0 = #VPSRL_4u64(f0, 12); + f0 = #VPSHUFB_256(f0, shufbidx); + t0 = (128u)f0; + t1 = #VEXTRACTI128(f0, 1); + t0 = #VPBLEND_8u16(t0, t1, 0xE0); + (u128)[rp + 20*i] = t0; + (u32)[rp + 20*i + 16] = #VPEXTR_32(t1, 0); + } +} + +inline +fn __polyvec_compress_1(reg ptr u8[KYBER_POLYVECCOMPRESSEDBYTES] rp, stack u16[KYBER_VECN] a) -> reg ptr u8[KYBER_POLYVECCOMPRESSEDBYTES] +{ + inline int i; + reg u256 f0 f1 f2 v v8 off shift1 mask shift2 sllvdidx shufbidx; + reg u128 t0 t1; + reg ptr u16[16] x16p; + + a = __polyvec_csubq(a); + + x16p = jvx16; + v = x16p[u256 0]; + v8 = #VPSLL_16u16(v, 3); + off = #VPBROADCAST_16u16(pvc_off_s); + shift1 = #VPBROADCAST_16u16(pvc_shift1_s); + mask = #VPBROADCAST_16u16(pvc_mask_s); + shift2 = #VPBROADCAST_4u64(pvc_shift2_s); + sllvdidx = #VPBROADCAST_4u64(pvc_sllvdidx_s); + shufbidx = pvc_shufbidx_s[u256 0]; + + for i=0 to KYBER_VECN/16 + { + f0 = a[u256 i]; + f1 = #VPMULL_16u16(f0, v8); + f2 = #VPADD_16u16(f0, off); + f0 = #VPSLL_16u16(f0, 3); + f0 = #VPMULH_16u16(f0, v); + f2 = #VPSUB_16u16(f1, f2); + f1 = #VPANDN_256(f1, f2); + f1 = #VPSRL_16u16(f1, 15); + f0 = #VPSUB_16u16(f0, f1); + f0 = #VPMULHRS_16u16(f0, shift1); + f0 = #VPAND_256(f0, mask); + f0 = #VPMADDWD_256(f0, shift2); + f0 = #VPSLLV_8u32(f0, sllvdidx); + f0 = #VPSRL_4u64(f0, 12); + f0 = #VPSHUFB_256(f0, shufbidx); + t0 = (128u)f0; + t1 = #VEXTRACTI128(f0, 1); + t0 = #VPBLEND_8u16(t0, t1, 0xE0); + rp.[u128 20*i] = t0; + rp.[u32 20*i + 16] = #VPEXTR_32(t1, 0); + } + + return rp; +} + +inline +fn __polyvec_frombytes(reg u64 ap) -> stack u16[KYBER_VECN] +{ + stack u16[KYBER_VECN] r; + reg u64 pp; + + pp = ap; + r[0:KYBER_N] = _poly_frombytes(r[0:KYBER_N], pp); + pp += KYBER_POLYBYTES; + r[KYBER_N:KYBER_N] = _poly_frombytes(r[KYBER_N:KYBER_N], pp); + pp += KYBER_POLYBYTES; + r[2*KYBER_N:KYBER_N] = _poly_frombytes(r[2*KYBER_N:KYBER_N], pp); + + return r; +} + + +inline +fn __polyvec_invntt(stack u16[KYBER_VECN] r) -> stack u16[KYBER_VECN] +{ + r[0:KYBER_N] = _poly_invntt(r[0:KYBER_N]); + r[KYBER_N:KYBER_N] = _poly_invntt(r[KYBER_N:KYBER_N]); + r[2*KYBER_N:KYBER_N] = _poly_invntt(r[2*KYBER_N:KYBER_N]); + + return r; +} + + +inline +fn __polyvec_ntt(stack u16[KYBER_VECN] r) -> stack u16[KYBER_VECN] +{ + r[0:KYBER_N] = _poly_ntt(r[0:KYBER_N]); + r[KYBER_N:KYBER_N] = _poly_ntt(r[KYBER_N:KYBER_N]); + r[2*KYBER_N:KYBER_N] = _poly_ntt(r[2*KYBER_N:KYBER_N]); + + return r; +} + + +inline +fn __polyvec_reduce(stack u16[KYBER_VECN] r) -> stack u16[KYBER_VECN] +{ + r[0:KYBER_N] = __poly_reduce(r[0:KYBER_N]); + r[KYBER_N:KYBER_N] = __poly_reduce(r[KYBER_N:KYBER_N]); + r[2*KYBER_N:KYBER_N] = __poly_reduce(r[2*KYBER_N:KYBER_N]); + + return r; +} + + +inline +fn __polyvec_pointwise_acc(stack u16[KYBER_N] r, stack u16[KYBER_VECN] a b) -> stack u16[KYBER_N] +{ + stack u16[KYBER_N] t; + + r = _poly_basemul(r, a[0:KYBER_N], b[0:KYBER_N]); + t = _poly_basemul(t, a[KYBER_N:KYBER_N], b[KYBER_N:KYBER_N]); + r = _poly_add2(r, t); + t = _poly_basemul(t, a[2*KYBER_N:KYBER_N], b[2*KYBER_N:KYBER_N]); + r = _poly_add2(r, t); + + // r = __poly_reduce(r); + + return r; +} + + +inline +fn __polyvec_tobytes(reg u64 rp, stack u16[KYBER_VECN] a) +{ + reg u64 pp; + + pp = rp; + a[0:KYBER_N] = _poly_tobytes(pp, a[0:KYBER_N]); + pp += KYBER_POLYBYTES; + a[KYBER_N:KYBER_N] = 
_poly_tobytes(pp, a[KYBER_N:KYBER_N]);
+ pp += KYBER_POLYBYTES;
+ a[2*KYBER_N:KYBER_N] = _poly_tobytes(pp, a[2*KYBER_N:KYBER_N]);
+}
diff --git a/code/jasmin/mlkem_avx2/reduce.c b/code/jasmin/mlkem_avx2/reduce.c
new file mode 100644
index 00000000..39264b09
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/reduce.c
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include "params.h"
+#include "reduce.h"
+
+/*************************************************
+* Name:        montgomery_reduce
+*
+* Description: Montgomery reduction; given a 32-bit integer a, computes
+*              16-bit integer congruent to a * R^-1 mod q,
+*              where R=2^16
+*
+* Arguments:   - int32_t a: input integer to be reduced; has to be in {-q2^15,...,q2^15-1}
+*
+* Returns:     integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q.
+**************************************************/
+int16_t montgomery_reduce(int32_t a)
+{
+  int32_t t;
+  int16_t u;
+
+// printf("a: %d\n", a);
+  u = a * QINV;
+  t = (int32_t)u * KYBER_Q;
+  t = a - t;
+  t >>= 16;
+  return t;
+}
+
+/*************************************************
+* Name:        barrett_reduce
+*
+* Description: Barrett reduction; given a 16-bit integer a, computes
+*              16-bit integer congruent to a mod q in {0,...,q}
+*
+* Arguments:   - int16_t a: input integer to be reduced
+*
+* Returns:     integer in {0,...,q} congruent to a modulo q.
+**************************************************/
+int16_t barrett_reduce(int16_t a) {
+  int32_t t;
+  const int32_t v = (1U << 26)/KYBER_Q + 1;
+
+  t = v*a;
+  t >>= 26;
+  t *= KYBER_Q;
+  return a - t;
+}
+
+/*************************************************
+* Name:        csubq
+*
+* Description: Conditionally subtract q
+*
+* Arguments:   - int16_t a: input integer
+*
+* Returns:     a - q if a >= q, else a
+**************************************************/
+int16_t csubq(int16_t a) {
+  a -= KYBER_Q;
+  a += (a >> 15) & KYBER_Q;
+  return a;
+}
diff --git a/code/jasmin/mlkem_avx2/reduce.h b/code/jasmin/mlkem_avx2/reduce.h
new file mode 100644
index 00000000..59ee6ef4
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/reduce.h
@@ -0,0 +1,15 @@
+#ifndef REDUCE_H
+#define REDUCE_H
+
+#include <stdint.h>
+
+#define MONT 2285 // 2^16 % Q
+#define QINV 62209 // q^(-1) mod 2^16
+
+int16_t montgomery_reduce(int32_t a);
+
+int16_t barrett_reduce(int16_t a);
+
+int16_t csubq(int16_t x);
+
+#endif
diff --git a/code/jasmin/mlkem_avx2/reduce.jinc b/code/jasmin/mlkem_avx2/reduce.jinc
new file mode 100644
index 00000000..f4070e6b
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/reduce.jinc
@@ -0,0 +1,95 @@
+require "params.jinc"
+
+param int QINV = 62209; /* q^(-1) mod 2^16 */
+param int MONT = 2285; /* 2^16 % Q */
+param int BARR = 20159; /* (1U << 26)/KYBER_Q + 1 */
+
+inline
+fn __csubq(reg u256 r qx16) -> reg u256
+{
+ reg u256 t;
+ r = #VPSUB_16u16(r, qx16);
+ t = #VPSRA_16u16(r, 15);
+ t = #VPAND_256(t, qx16);
+ r = #VPADD_16u16(t, r);
+ return r;
+}
+
+inline
+fn __red16x(reg u256 r qx16 vx16) -> reg u256
+{
+ reg u256 x;
+ x = #VPMULH_16u16(r, vx16);
+ x = #VPSRA_16u16(x, 10);
+ x = #VPMULL_16u16(x, qx16);
+ r = #VPSUB_16u16(r, x);
+ return r;
+}
+
+inline
+fn __fqmulprecomp16x(reg u256 b al ah qx16) -> reg u256
+{
+ reg u256 x;
+ x = #VPMULL_16u16(al, b);
+ b = #VPMULH_16u16(ah, b);
+ x = #VPMULH_16u16(x, qx16);
+ b = #VPSUB_16u16(b, x);
+ return b;
+}
+
+inline
+fn __fqmulx16(reg u256 a b qx16 qinvx16) -> reg u256
+{
+ reg u256 rd rhi rlo;
+ rhi = #VPMULH_16u16(a, b);
+ rlo = #VPMULL_16u16(a, b);
+
+ rlo = #VPMULL_16u16(rlo, qinvx16);
+ rlo = #VPMULH_16u16(rlo, qx16);
+ rd = #VPSUB_16u16(rhi, rlo);
+
+ return rd;
+}
+
+inline
+fn
__fqmul(reg u16 a, reg u16 b) -> reg u16 +{ + reg u32 ad; + reg u32 bd; + reg u32 c; + reg u32 t; + reg u16 r; + reg u32 u; + + ad = (32s)a; + bd = (32s)b; + + c = ad * bd; + + u = c * QINV; + u <<= 16; + //u = #SAR_32(u, 16); + u >>s= 16; + t = u * KYBER_Q; + t = c - t; + //t = #SAR_32(t, 16); + t >>s= 16; + r = t; + return r; +} + +inline +fn __barrett_reduce(reg u16 a) -> reg u16 +{ + reg u32 t; + reg u16 r; + t = (32s)a; + t = t * BARR; + //t = #SAR_32(t, 26); + t >>s= 26; + t *= KYBER_Q; + r = t; + r = a; + r -= t; + return r; +} diff --git a/code/jasmin/mlkem_avx2/shuffle.S b/code/jasmin/mlkem_avx2/shuffle.S new file mode 100644 index 00000000..46b676a1 --- /dev/null +++ b/code/jasmin/mlkem_avx2/shuffle.S @@ -0,0 +1,261 @@ +#include "consts.h" +.include "fq.inc" +.include "shuffle.inc" + +nttpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 +shuffle1 10,11,8,11 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +#store +vmovdqa %ymm7,(%rdi) +vmovdqa %ymm9,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm3,96(%rdi) +vmovdqa %ymm10,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm5,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +shuffle1 9,5,10,5 +shuffle1 8,4,9,4 +shuffle1 7,3,8,3 +shuffle1 6,11,7,11 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm9,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm3,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(nttunpack_avx) +cdecl(nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret + +.global cdecl(nttpack_avx) +cdecl(nttpack_avx): +call nttpack128_avx +add $256,%rdi +call nttpack128_avx +ret + + +ntttobytes128_avx: +#load +vmovdqa (%rsi),%ymm5 +vmovdqa 32(%rsi),%ymm6 +vmovdqa 64(%rsi),%ymm7 +vmovdqa 96(%rsi),%ymm8 +vmovdqa 128(%rsi),%ymm9 +vmovdqa 160(%rsi),%ymm10 +vmovdqa 192(%rsi),%ymm11 +vmovdqa 224(%rsi),%ymm12 + +#csubq +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,1 +csubq 9,13 +csubq 10,14 +csubq 11,15 +csubq 12,1 + +#bitpack +vpsllw $12,%ymm6,%ymm4 +vpor %ymm4,%ymm5,%ymm4 + +vpsrlw $4,%ymm6,%ymm5 +vpsllw $8,%ymm7,%ymm6 +vpor %ymm5,%ymm6,%ymm5 + +vpsrlw $8,%ymm7,%ymm6 +vpsllw $4,%ymm8,%ymm7 +vpor %ymm6,%ymm7,%ymm6 + +vpsllw $12,%ymm10,%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +vpsrlw $4,%ymm10,%ymm8 +vpsllw $8,%ymm11,%ymm9 +vpor %ymm8,%ymm9,%ymm8 + +vpsrlw $8,%ymm11,%ymm9 +vpsllw $4,%ymm12,%ymm10 +vpor %ymm9,%ymm10,%ymm9 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 + +shuffle2 3,4,8,4 +shuffle2 6,5,3,5 +shuffle2 7,9,6,9 + +shuffle4 8,3,7,3 +shuffle4 6,4,8,4 +shuffle4 5,9,6,9 + +shuffle8 7,8,5,8 +shuffle8 6,3,7,3 +shuffle8 4,9,6,9 + +#store +vmovdqu %ymm5,(%rdi) +vmovdqu 
%ymm7,32(%rdi) +vmovdqu %ymm6,64(%rdi) +vmovdqu %ymm8,96(%rdi) +vmovdqu %ymm3,128(%rdi) +vmovdqu %ymm9,160(%rdi) + +ret + +.global cdecl(ntttobytes_avx) +cdecl(ntttobytes_avx): +#consts +vmovdqa _16XQ*2(%rdx),%ymm0 +call ntttobytes128_avx +add $256,%rsi +add $192,%rdi +call ntttobytes128_avx +ret + +nttfrombytes128_avx: +#load +vmovdqu (%rsi),%ymm4 +vmovdqu 32(%rsi),%ymm5 +vmovdqu 64(%rsi),%ymm6 +vmovdqu 96(%rsi),%ymm7 +vmovdqu 128(%rsi),%ymm8 +vmovdqu 160(%rsi),%ymm9 + +shuffle8 4,7,3,7 +shuffle8 5,8,4,8 +shuffle8 6,9,5,9 + +shuffle4 3,8,6,8 +shuffle4 7,5,3,5 +shuffle4 4,9,7,9 + +shuffle2 6,5,4,5 +shuffle2 8,7,6,7 +shuffle2 3,9,8,9 + +shuffle1 4,7,10,7 +shuffle1 5,8,4,8 +shuffle1 6,9,5,9 + +#bitunpack +vpsrlw $12,%ymm10,%ymm11 +vpsllw $4,%ymm7,%ymm12 +vpor %ymm11,%ymm12,%ymm11 +vpand %ymm0,%ymm10,%ymm10 +vpand %ymm0,%ymm11,%ymm11 + +vpsrlw $8,%ymm7,%ymm12 +vpsllw $8,%ymm4,%ymm13 +vpor %ymm12,%ymm13,%ymm12 +vpand %ymm0,%ymm12,%ymm12 + +vpsrlw $4,%ymm4,%ymm13 +vpand %ymm0,%ymm13,%ymm13 + +vpsrlw $12,%ymm8,%ymm14 +vpsllw $4,%ymm5,%ymm15 +vpor %ymm14,%ymm15,%ymm14 +vpand %ymm0,%ymm8,%ymm8 +vpand %ymm0,%ymm14,%ymm14 + +vpsrlw $8,%ymm5,%ymm15 +vpsllw $8,%ymm9,%ymm1 +vpor %ymm15,%ymm1,%ymm15 +vpand %ymm0,%ymm15,%ymm15 + +vpsrlw $4,%ymm9,%ymm1 +vpand %ymm0,%ymm1,%ymm1 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm11,32(%rdi) +vmovdqa %ymm12,64(%rdi) +vmovdqa %ymm13,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm14,160(%rdi) +vmovdqa %ymm15,192(%rdi) +vmovdqa %ymm1,224(%rdi) + +ret + +.global cdecl(nttfrombytes_avx) +cdecl(nttfrombytes_avx): +#consts +vmovdqa _16XMASK*2(%rdx),%ymm0 +call nttfrombytes128_avx +add $256,%rdi +add $192,%rsi +call nttfrombytes128_avx +ret diff --git a/code/jasmin/mlkem_avx2/shuffle.inc b/code/jasmin/mlkem_avx2/shuffle.inc new file mode 100644 index 00000000..df352030 --- /dev/null +++ b/code/jasmin/mlkem_avx2/shuffle.inc @@ -0,0 +1,23 @@ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +vpsllq $32,%ymm\r1,%ymm12 +vpsrlq $32,%ymm\r0,%ymm13 +vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 +vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm12 +vpsrld $16,%ymm\r0,%ymm13 +vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 +vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 +.endm diff --git a/code/jasmin/mlkem_avx2/shuffle.jinc b/code/jasmin/mlkem_avx2/shuffle.jinc new file mode 100644 index 00000000..a187591f --- /dev/null +++ b/code/jasmin/mlkem_avx2/shuffle.jinc @@ -0,0 +1,192 @@ +inline +fn __shuffle8(reg u256 a b) -> reg u256, reg u256 +{ + reg u256 r0 r1; + r0 = #VPERM2I128(a,b,0x20); + r1 = #VPERM2I128(a,b,0x31); + return r0, r1; +} + +inline +fn __shuffle4(reg u256 a b) -> reg u256, reg u256 +{ + reg u256 r0 r1; + r0 = #VPUNPCKL_4u64(a,b); + r1 = #VPUNPCKH_4u64(a,b); + return r0, r1; +} + +inline +fn __shuffle2(reg u256 a b) -> reg u256, reg u256 +{ + reg u256 t0 t1; + t0 = #VMOVSLDUP_8u32(b); + t0 = #VPBLEND_8u32(a, t0, 0xAA); + a = #VPSRL_4u64(a,32); + t1 = #VPBLEND_8u32(a, b, 0xAA); + return t0, t1; +} + + +inline +fn __shuffle1(reg u256 a b) -> reg u256, reg u256 +{ + reg u256 r0 r1 t0 t1; + t0 = #VPSLL_8u32(b,16); + r0 = #VPBLEND_16u16(a,t0,0xAA); + t1 = #VPSRL_8u32(a,16); + r1 = #VPBLEND_16u16(t1,b,0xAA); + return r0, r1; +} + + +// Transform from AVX order to bitreversed order +inline +fn __nttpack128(reg u256 r0 r1 r2 r3 r4 r5 r6 r7) 
+ -> reg u256, reg u256, reg u256, reg u256, reg u256, reg u256, reg u256, reg u256 +{ + r0, r1 = __shuffle1(r0, r1); + r2, r3 = __shuffle1(r2, r3); + r4, r5 = __shuffle1(r4, r5); + r6, r7 = __shuffle1(r6, r7); + + r0, r2 = __shuffle2(r0, r2); + r4, r6 = __shuffle2(r4, r6); + r1, r3 = __shuffle2(r1, r3); + r5, r7 = __shuffle2(r5, r7); + + r0, r4 = __shuffle4(r0, r4); + r1, r5 = __shuffle4(r1, r5); + r2, r6 = __shuffle4(r2, r6); + r3, r7 = __shuffle4(r3, r7); + + r0, r1 = __shuffle8(r0, r1); + r2, r3 = __shuffle8(r2, r3); + r4, r5 = __shuffle8(r4, r5); + r6, r7 = __shuffle8(r6, r7); + + return r0, r2, r4, r6, r1, r3, r5, r7; +} + + +// Transform from bitreversed order to AVX order +inline +fn __nttunpack128(reg u256 r0 r1 r2 r3 r4 r5 r6 r7) + -> reg u256, reg u256, reg u256, reg u256, reg u256, reg u256, reg u256, reg u256 +{ + r0, r4 = __shuffle8(r0, r4); + r1, r5 = __shuffle8(r1, r5); + r2, r6 = __shuffle8(r2, r6); + r3, r7 = __shuffle8(r3, r7); + + r0, r2 = __shuffle4(r0, r2); + r4, r6 = __shuffle4(r4, r6); + r1, r3 = __shuffle4(r1, r3); + r5, r7 = __shuffle4(r5, r7); + + r0, r1 = __shuffle2(r0, r1); + r2, r3 = __shuffle2(r2, r3); + r4, r5 = __shuffle2(r4, r5); + r6, r7 = __shuffle2(r6, r7); + + r0, r4 = __shuffle1(r0, r4); + r1, r5 = __shuffle1(r1, r5); + r2, r6 = __shuffle1(r2, r6); + r3, r7 = __shuffle1(r3, r7); + + return r0, r4, r1, r5, r2, r6, r3, r7; +} + +fn _nttpack(reg ptr u16[KYBER_N] rp) -> reg ptr u16[KYBER_N] +{ + reg u256 r0 r1 r2 r3 r4 r5 r6 r7; + + r0 = rp.[u256 32*0]; + r1 = rp.[u256 32*1]; + r2 = rp.[u256 32*2]; + r3 = rp.[u256 32*3]; + r4 = rp.[u256 32*4]; + r5 = rp.[u256 32*5]; + r6 = rp.[u256 32*6]; + r7 = rp.[u256 32*7]; + + r0, r1, r2, r3, r4, r5, r6, r7 = __nttpack128(r0, r1, r2, r3, r4, r5, r6, r7); + + rp.[u256 32*0] = r0; + rp.[u256 32*1] = r1; + rp.[u256 32*2] = r2; + rp.[u256 32*3] = r3; + rp.[u256 32*4] = r4; + rp.[u256 32*5] = r5; + rp.[u256 32*6] = r6; + rp.[u256 32*7] = r7; + + r0 = rp.[u256 32*8]; + r1 = rp.[u256 32*9]; + r2 = rp.[u256 32*10]; + r3 = rp.[u256 32*11]; + r4 = rp.[u256 32*12]; + r5 = rp.[u256 32*13]; + r6 = rp.[u256 32*14]; + r7 = rp.[u256 32*15]; + + r0, r1, r2, r3, r4, r5, r6, r7 = __nttpack128(r0, r1, r2, r3, r4, r5, r6, r7); + + rp.[u256 32*8] = r0; + rp.[u256 32*9] = r1; + rp.[u256 32*10] = r2; + rp.[u256 32*11] = r3; + rp.[u256 32*12] = r4; + rp.[u256 32*13] = r5; + rp.[u256 32*14] = r6; + rp.[u256 32*15] = r7; + + return rp; +} + +fn _nttunpack(reg ptr u16[KYBER_N] rp) -> reg ptr u16[KYBER_N] +{ + reg u256 r0 r1 r2 r3 r4 r5 r6 r7; + + r0 = rp.[u256 32*0]; + r1 = rp.[u256 32*1]; + r2 = rp.[u256 32*2]; + r3 = rp.[u256 32*3]; + r4 = rp.[u256 32*4]; + r5 = rp.[u256 32*5]; + r6 = rp.[u256 32*6]; + r7 = rp.[u256 32*7]; + + r0, r1, r2, r3, r4, r5, r6, r7 = __nttunpack128(r0, r1, r2, r3, r4, r5, r6, r7); + + rp.[u256 32*0] = r0; + rp.[u256 32*1] = r1; + rp.[u256 32*2] = r2; + rp.[u256 32*3] = r3; + rp.[u256 32*4] = r4; + rp.[u256 32*5] = r5; + rp.[u256 32*6] = r6; + rp.[u256 32*7] = r7; + + r0 = rp.[u256 32*8]; + r1 = rp.[u256 32*9]; + r2 = rp.[u256 32*10]; + r3 = rp.[u256 32*11]; + r4 = rp.[u256 32*12]; + r5 = rp.[u256 32*13]; + r6 = rp.[u256 32*14]; + r7 = rp.[u256 32*15]; + + r0, r1, r2, r3, r4, r5, r6, r7 = __nttunpack128(r0, r1, r2, r3, r4, r5, r6, r7); + + rp.[u256 32*8] = r0; + rp.[u256 32*9] = r1; + rp.[u256 32*10] = r2; + rp.[u256 32*11] = r3; + rp.[u256 32*12] = r4; + rp.[u256 32*13] = r5; + rp.[u256 32*14] = r6; + rp.[u256 32*15] = r7; + + return rp; +} diff --git a/code/jasmin/mlkem_avx2/speed.h b/code/jasmin/mlkem_avx2/speed.h 
new file mode 100644
index 00000000..b4b917c5
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/speed.h
@@ -0,0 +1,62 @@
+#ifndef SPEED_H
+#define SPEED_H
+
+#include <stdint.h>
+#include "params.h"
+
+typedef struct{
+  int16_t __attribute__((aligned(32))) coeffs[KYBER_N];
+} poly;
+
+typedef struct{
+  poly vec[KYBER_K];
+} polyvec;
+
+void gen_matrix_jazz(polyvec *a, unsigned char *seed);
+
+/*Poly functions*/
+void poly_compress_jazz(unsigned char *r, poly *a);
+void poly_decompress_jazz(poly *r, const unsigned char *a);
+
+void poly_frommsg_jazz(poly *r, const unsigned char msg[KYBER_SYMBYTES]);
+void poly_tomsg_jazz(unsigned char msg[KYBER_SYMBYTES], poly *r);
+
+void poly_getnoise_jazz(poly *r,const unsigned char *seed, unsigned char nonce);
+void poly_getnoise_4x_jazz(poly *r0, poly *r1, poly *r2, poly *r3,const unsigned char *seed, unsigned char nonce);
+
+void poly_ntt_jazz(poly *r);
+void poly_invntt_jazz(poly *r);
+
+/*Polyvec functions*/
+void polyvec_compress_jazz(unsigned char *r, polyvec *a);
+void polyvec_decompress_jazz(polyvec *r, const unsigned char *a);
+
+void polyvec_pointwise_acc_jazz(poly *r, const polyvec *a, const polyvec *b);
+
+/* Indcpa functions*/
+void indcpa_keypair_jazz(unsigned char *pk,
+                         unsigned char *sk,
+                         const unsigned char *randomness);
+
+void indcpa_enc_jazz(unsigned char *c,
+                     const unsigned char *m,
+                     const unsigned char *pk,
+                     const unsigned char *coins);
+
+void indcpa_dec_jazz(unsigned char *m,
+                     const unsigned char *c,
+                     const unsigned char *sk);
+
+/* KEM functions */
+void crypto_kem_keypair_jazz(unsigned char *pk,
+                             unsigned char *sk,
+                             const unsigned char *randomness);
+
+void crypto_kem_enc_jazz(unsigned char *c,
+                         const unsigned char *m,
+                         const unsigned char *pk,
+                         const unsigned char *coins);
+void crypto_kem_dec_jazz(unsigned char *m,
+                         const unsigned char *c,
+                         const unsigned char *sk);
+#endif
diff --git a/code/jasmin/mlkem_avx2/symmetric-fips202.c b/code/jasmin/mlkem_avx2/symmetric-fips202.c
new file mode 100644
index 00000000..75e885ad
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/symmetric-fips202.c
@@ -0,0 +1,77 @@
+#include <stddef.h>
+#include "symmetric.h"
+#include "fips202.h"
+
+/*************************************************
+* Name:        kyber_shake128_absorb
+*
+* Description: Absorb step of the SHAKE128 specialized for the Kyber context.
+*
+* Arguments:   - uint64_t *s:                pointer to (uninitialized) output Keccak state
+*              - const unsigned char *input: pointer to KYBER_SYMBYTES input to be absorbed into s
+*              - unsigned char x:            additional byte of input
+*              - unsigned char y:            additional byte of input
+**************************************************/
+void kyber_shake128_absorb(keccak_state *s, const unsigned char *input, unsigned char x, unsigned char y)
+{
+  unsigned char extseed[KYBER_SYMBYTES+2];
+  int i;
+
+  for(i=0;i<KYBER_SYMBYTES;i++)
+    extseed[i] = input[i];
+  extseed[i++] = x;
+  extseed[i]   = y;
+
+  shake128_absorb(s->s, extseed, KYBER_SYMBYTES+2);
+}
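A typical driver for the absorb step just defined: one domain-separated XOF stream per matrix entry. A sketch only — the two extra bytes are the matrix indices, and their order depends on whether the matrix is sampled transposed, as in gen_matrix:

keccak_state st;
unsigned char buf[SHAKE128_RATE];  /* SHAKE128_RATE = 168, from fips202.h */

kyber_shake128_absorb(&st, seed, j, i);     /* absorbs seed || j || i (34 bytes) */
kyber_shake128_squeezeblocks(buf, 1, &st);  /* one full 168-byte block */
/* ... rejection-sample coefficients of A[i][j] from buf ... */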
+/*************************************************
+* Name:        kyber_shake128_squeezeblocks
+*
+* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each.
+*              Modifies the state. Can be called multiple times to keep squeezing,
+*              i.e., is incremental.
+*
+* Arguments:   - unsigned char *output:      pointer to output blocks
+*              - unsigned long long nblocks: number of blocks to be squeezed (written to output)
+*              - keccak_state *s:            pointer to in/output Keccak state
+**************************************************/
+void kyber_shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, keccak_state *s)
+{
+  shake128_squeezeblocks(output, nblocks, s->s);
+}
+
+/*************************************************
+* Name:        shake256_prf
+*
+* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
+*              and then generates outlen bytes of SHAKE256 output
+*
+* Arguments:   - unsigned char *output:      pointer to output
+*              - unsigned long long outlen:  number of requested output bytes
+*              - const unsigned char * key:  pointer to the key (of length KYBER_SYMBYTES)
+*              - const unsigned char nonce:  single-byte nonce (public PRF input)
+**************************************************/
+void shake256_prf(unsigned char *output, unsigned long long outlen, const unsigned char *key, const unsigned char nonce)
+{
+  unsigned char extkey[KYBER_SYMBYTES+1];
+  size_t i;
+
+  for(i=0;i<KYBER_SYMBYTES;i++)
+    extkey[i] = key[i];
+  extkey[i] = nonce;
+
+  shake256(output, outlen, extkey, KYBER_SYMBYTES+1);
+}
+#include
+#include
+#include
+
+#include "../params.h"
+#include "../ntt.h"
+#include "../indcpa.h"
+
+#define NRUNS 100
+
+static inline uint64_t cpucycles(void) {
+  uint64_t result;
+
+  asm volatile("rdtsc; shlq $32,%%rdx; orq %%rdx,%%rax"
+    : "=a" (result) : : "%rdx");
+
+  return result;
+}
+
+static int cmp_uint64(const void *a, const void *b) {
+  if(*(uint64_t *)a < *(uint64_t *)b) return -1;
+  if(*(uint64_t *)a > *(uint64_t *)b) return 1;
+  return 0;
+}
+
+static uint64_t median(uint64_t *l, size_t llen) {
+  qsort(l,llen,sizeof(uint64_t),cmp_uint64);
+
+  if(llen%2) return l[llen/2];
+  else return (l[llen/2-1]+l[llen/2])/2;
+}
+
+static uint64_t average(uint64_t *t, size_t tlen) {
+  size_t i;
+  uint64_t acc=0;
+
+  for(i=0;i<tlen;i++)
+    acc += t[i];
+
+  return acc/tlen;
+}
+#include
+#include
+#include
+#include
+
+#include "../params.h"
+#include "../speed.h"
+
+#define NRUNS 1000
+
+static inline uint64_t cpucycles(void) {
+  uint64_t result;
+
+  asm volatile("rdtsc; shlq $32,%%rdx; orq %%rdx,%%rax"
+    : "=a" (result) : : "%rdx");
+
+  return result;
+}
+
+static int cmp_uint64(const void *a, const void *b) {
+  if(*(uint64_t *)a < *(uint64_t *)b) return -1;
+  if(*(uint64_t *)a > *(uint64_t *)b) return 1;
+  return 0;
+}
+
+static uint64_t median(uint64_t *l, size_t llen) {
+  qsort(l,llen,sizeof(uint64_t),cmp_uint64);
+
+  if(llen%2) return l[llen/2];
+  else return (l[llen/2-1]+l[llen/2])/2;
+}
+
+static uint64_t average(uint64_t *t, size_t tlen) {
+  size_t i;
+  uint64_t acc=0;
+
+  for(i=0;i<tlen;i++)
+    acc += t[i];
+
+  return acc/tlen;
+}
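Helpers like these are driven in the usual rdtsc pattern; a sketch under assumed names (indcpa_keypair_jazz is just a stand-in workload, not the harness's actual body):

uint64_t t[NRUNS];
unsigned char pk[KYBER_INDCPA_PUBLICKEYBYTES];
unsigned char sk[KYBER_INDCPA_SECRETKEYBYTES];
unsigned char randomness[KYBER_SYMBYTES] = {0};
size_t i;

for(i = 0; i < NRUNS; i++) {
  t[i] = cpucycles();
  indcpa_keypair_jazz(pk, sk, randomness);
}
for(i = 0; i < NRUNS-1; i++)
  t[i] = t[i+1] - t[i];  /* per-call cycle deltas */
printf("indcpa_keypair: median %lu, average %lu cycles\n",
       median(t, NRUNS-1), average(t, NRUNS-1));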
+#include <stdio.h>
+#include "../fips202.h"
+
+#define MAXINLEN 33
+#define MAXOUTLEN 168
+
+int main(void)
+{
+  unsigned char in[MAXINLEN];
+  unsigned char out0[MAXOUTLEN];
+  unsigned char out1[MAXOUTLEN];
+  uint64_t state0[25];
+  uint64_t state1[25];
+  int k;
+
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(in, 1, sizeof(in), urandom);
+
+  shake256(out0, 128, in, 33);
+  shake256_128_33_jazz(out1, in);
+
+  for(k=0;k<128;k++)
+    if(out0[k] != out1[k]) printf("error shake256 at %d: %d %d\n", k, out0[k], out1[k]);
+
+  sha3_512(out0, in, 32);
+  sha3_512_32_jazz(out1, in);
+
+  for(k=0;k<64;k++)
+    if(out0[k] != out1[k]) printf("error sha3512 at %d: %d %d\n", k, out0[k], out1[k]);
+
+  shake128_absorb(state0, in, 34);
+  shake128_absorb34_jazz(state1, in);
+
+  for(k=0;k<25;k++)
+    if(state0[k] != state1[k]) printf("error shake128_absorb at %d: %lu %lu\n", k, state0[k], state1[k]);
+
+  shake128_squeezeblocks(out0, 1, state0);
+  shake128_squeezeblock_jazz(out1, state1);
+
+  for(k=0;k<25;k++)
+    if(state0[k] != state1[k]) printf("error shake128_squeezeblock (state) at %d: %lu %lu\n", k, state0[k], state1[k]);
+
+  for(k=0;k
+#include <stdio.h>
+
+#include "../params.h"
+#include "../ntt.h"
+#include "../indcpa.h"
+
+int main(void)
+{
+  unsigned char sk0[KYBER_INDCPA_SECRETKEYBYTES];
+  unsigned char sk1[KYBER_INDCPA_SECRETKEYBYTES];
+  unsigned char pk0[KYBER_INDCPA_PUBLICKEYBYTES];
+  unsigned char pk1[KYBER_INDCPA_PUBLICKEYBYTES];
+  unsigned char ct0[KYBER_INDCPA_BYTES];
+  unsigned char ct1[KYBER_INDCPA_BYTES];
+
+  unsigned char randomness0[KYBER_SYMBYTES];
+  unsigned char randomness1[KYBER_SYMBYTES];
+  unsigned char message[KYBER_INDCPA_MSGBYTES];
+
+  /*
+  unsigned char outmsg0[KYBER_INDCPA_MSGBYTES];
+  unsigned char outmsg1[KYBER_INDCPA_MSGBYTES];
+  */
+  unsigned char outmsg0[KYBER_POLYVECBYTES];
+  unsigned char outmsg1[KYBER_POLYVECBYTES];
+
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(randomness0, KYBER_SYMBYTES, 1, urandom);
+  fread(randomness1, KYBER_SYMBYTES, 1, urandom);
+  fread(message, KYBER_SYMBYTES, 1, urandom);
+  fclose(urandom);
+
+  /* TEST KEYPAIR */
+  indcpa_keypair_jazz(pk1, sk1, randomness0);
+  indcpa_keypair(pk0, sk0, randomness0);
+
+  for(int i=0;i
+#include <stdio.h>
+#include
+
+#include "../params.h"
+#include "../ntt.h"
+#include "../kem.h"
+
+int main(void)
+{
+  unsigned char sk0[KYBER_SECRETKEYBYTES];
+  unsigned char sk1[KYBER_SECRETKEYBYTES];
+  unsigned char pk0[KYBER_PUBLICKEYBYTES];
+  unsigned char pk1[KYBER_PUBLICKEYBYTES];
+  unsigned char ct0[KYBER_CIPHERTEXTBYTES];
+  unsigned char ct1[KYBER_CIPHERTEXTBYTES];
+  unsigned char shk0[KYBER_SSBYTES];
+  unsigned char shk1[KYBER_SSBYTES];
+
+  unsigned char randomness0[2*KYBER_SYMBYTES];
+  unsigned char randomness1[2*KYBER_SYMBYTES];
+
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(randomness0, 2*KYBER_SYMBYTES, 1, urandom);
+  fread(randomness1, 2*KYBER_SYMBYTES, 1, urandom);
+  fclose(urandom);
+
+  /* TEST KEYPAIR */
+  jade_kem_kyber_kyber768_amd64_avx2v_keypair_derand(pk1, sk1, randomness0);
+  crypto_kem_keypair(pk0, sk0, randomness0);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../poly.h"
+
+void poly_setrandom(poly *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(r->coeffs, sizeof(int16_t), KYBER_N, urandom);
+  for(int i=0;i<KYBER_N;i++) r->coeffs[i] %= KYBER_Q;
+  fclose(urandom);
+}
+
+int main(void)
+{
+  poly a, b, r0;
+
+  poly_setrandom(&a);
+  poly_setrandom(&b);
+
+  poly_add(&r0, &a, &b);
+
+  poly_add2_jazz(&a, &b);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../poly.h"
+#include "../ntt.h"
+
+void poly_setrandom(poly *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(r->coeffs, sizeof(int16_t), KYBER_N, urandom);
+  for(int i=0;i<KYBER_N;i++) r->coeffs[i] %= KYBER_Q;
+  fclose(urandom);
+}
+
+int main(void)
+{
+  poly a, b, r0, r1;
+
+  poly_setrandom(&a);
+  poly_setrandom(&b);
+
+  poly_basemul(&r0, &a, &b);
+
+  poly_basemul_jazz(&r1, &a, &b);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../poly.h"
+#include "../ntt.h"
+
+void poly_setrandom(poly *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(r->coeffs, sizeof(int16_t), KYBER_N, urandom);
+  fclose(urandom);
+  poly_reduce(r);
+}
+
+int main(void)
+{
+  unsigned char out0[128];
+  unsigned char out1[128];
+  poly a;
+
+  poly_setrandom(&a);
+
+  poly_compress(out0, &a);
+  poly_compress_jazz(out1, &a);
+
+  for(int i=0;i<128;i++)
+  {
+    if(out0[i] != out1[i])
+      printf("error compress %d, %d, %d\n", i, out0[i], out1[i]);
+  }
+
+  return 0;
+}
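test_poly_compress above exercises the 4-bit poly path (the 128-byte buffers are KYBER_N/2 bytes); per coefficient the mapping and its approximate inverse look like this — a sketch with the usual constants, not code from this tree:

uint16_t x = 2000;                                              /* in [0, KYBER_Q) */
uint8_t  c = ((((uint16_t)x << 4) + KYBER_Q/2) / KYBER_Q) & 15; /* compress: c = 10 */
uint16_t y = (((uint32_t)c * KYBER_Q) + 8) >> 4;                /* decompress: y = 2081 */
/* |y - x| stays within roughly KYBER_Q/32 */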
diff --git a/code/jasmin/mlkem_avx2/test/test_poly_csubq.c b/code/jasmin/mlkem_avx2/test/test_poly_csubq.c
new file mode 100644
index 00000000..87f28e08
--- /dev/null
+++ b/code/jasmin/mlkem_avx2/test/test_poly_csubq.c
@@ -0,0 +1,32 @@
+#include <stdio.h>
+#include "../poly.h"
+#include "../ntt.h"
+
+void poly_setrandom(poly *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(r->coeffs, sizeof(int16_t), KYBER_N, urandom);
+  fclose(urandom);
+  poly_reduce(r);
+}
+
+int main(void)
+{
+  poly r0, r1;
+
+  poly_setrandom(&r0);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../poly.h"
+#include "../ntt.h"
+
+int main(void)
+{
+  unsigned char in[KYBER_POLYCOMPRESSEDBYTES];
+  poly r0, r1;
+
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(in, 1, KYBER_POLYCOMPRESSEDBYTES, urandom);
+  fclose(urandom);
+
+  poly_decompress(&r0, in);
+  poly_decompress_jazz(&r1, in);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../poly.h"
+#include "../ntt.h"
+
+int main(void)
+ {
+  unsigned char in[KYBER_POLYBYTES];
+  poly r0, r1;
+
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(in, 1, KYBER_POLYBYTES, urandom);
+  fclose(urandom);
+
+  poly_frombytes(&r0, in);
+  poly_frombytes_jazz(&r1, in);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../poly.h"
+#include "../ntt.h"
+
+void poly_setrandom(poly *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(r->coeffs, sizeof(int16_t), KYBER_N, urandom);
+  for(int i=0;i<KYBER_N;i++)
+  {
+    r->coeffs[i] %= KYBER_Q;
+  }
+  fclose(urandom);
+}
+
+int main(void)
+{
+  poly r0, r1;
+
+  poly_setrandom(&r0);
+
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../poly.h"
+#include "../ntt.h"
+
+int main(void)
+{
+  unsigned char in[32];
+  poly r0, r1;
+
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(in, 1, 32, urandom);
+  fclose(urandom);
+
+  poly_frommsg(&r0, in);
+  poly_frommsg_jazz(&r1, in);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../poly.h"
+#include "../ntt.h"
+#include "../params.h"
+
+
+int main(void)
+{
+  poly r0[4], r1[4];
+  unsigned char seed[KYBER_SYMBYTES];
+
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(seed, 1, KYBER_SYMBYTES, urandom);
+  fclose(urandom);
+
+  poly_getnoise_eta1(r0, seed, 0);
+  poly_getnoise_eta1(&r0[1], seed, 1);
+  poly_getnoise_eta1(&r0[2], seed, 2);
+  poly_getnoise_eta1(&r0[3], seed, 3);
+  poly_getnoise_eta1_4x_jazz(r1, seed, 0);
+
+  for(int i=0;i<4;i++)
+  {
+    for(int j=0;j
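For reference, the 0x55555555 mask-and-add used by __cbd2/__cbd3 earlier in this patch is a pairwise popcount; the scalar shape for eta = 2, mirroring the reference cbd computation:

#include <stdint.h>

/* fills r[0..7] from 4 bytes of PRF output: each coefficient is a - b with
   a, b two-bit popcounts, hence centered binomial in {-2,...,2} */
static void cbd2_chunk(int16_t r[8], const unsigned char buf[4]) {
  uint32_t t = (uint32_t)buf[0] | ((uint32_t)buf[1] << 8) |
               ((uint32_t)buf[2] << 16) | ((uint32_t)buf[3] << 24);
  uint32_t d = (t & 0x55555555) + ((t >> 1) & 0x55555555); /* pairwise popcount */
  for(int j = 0; j < 8; j++) {
    int16_t a = (d >> (4*j))     & 0x3;
    int16_t b = (d >> (4*j + 2)) & 0x3;
    r[j] = a - b;
  }
}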
+#include <stdio.h>
+#include "../poly.h"
+#include "../ntt.h"
+
+void poly_setrandom(poly *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(r->coeffs, sizeof(int16_t), KYBER_N, urandom);
+  fclose(urandom);
+  poly_reduce(r);
+}
+
+int main(void)
+{
+  unsigned char out0[KYBER_POLYBYTES];
+  unsigned char out1[KYBER_POLYBYTES];
+  poly a;
+
+  poly_setrandom(&a);
+
+  poly_tobytes(out0, &a);
+  poly_tobytes_jazz(out1, &a);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../poly.h"
+#include "../ntt.h"
+
+void poly_setrandom(poly *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(r->coeffs, sizeof(int16_t), KYBER_N, urandom);
+  fclose(urandom);
+  poly_reduce(r);
+}
+
+int main(void)
+{
+  unsigned char out0[KYBER_INDCPA_MSGBYTES];
+  unsigned char out1[KYBER_INDCPA_MSGBYTES];
+  poly a;
+
+  poly_setrandom(&a);
+
+  poly_tomsg(out0, &a);
+  poly_tomsg_jazz(out1, &a);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../polyvec.h"
+
+void polyvec_setrandom(polyvec *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  for(int i=0;i<KYBER_K;i++)
+    fread(r->vec[i].coeffs, sizeof(int16_t), KYBER_N, urandom);
+  for(int i=0;i<KYBER_K;i++)
+    for(int j=0;j<KYBER_N;j++)
+      r->vec[i].coeffs[j] %= KYBER_Q;
+  fclose(urandom);
+}
+
+int main(void)
+{
+  polyvec a, b, r0;
+
+  polyvec_setrandom(&a);
+  polyvec_setrandom(&b);
+
+  polyvec_add(&r0, &a, &b);
+  polyvec_add2_jazz(&a, &b);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../polyvec.h"
+#include "../ntt.h"
+
+void polyvec_setrandom(polyvec *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  for(int i=0;i<KYBER_K;i++)
+    fread(r->vec[i].coeffs, sizeof(int16_t), KYBER_N, urandom);
+
+  polyvec_reduce(r);
+  fclose(urandom);
+}
+
+int main(void)
+{
+  unsigned char out0[KYBER_POLYVECCOMPRESSEDBYTES];
+  unsigned char out1[KYBER_POLYVECCOMPRESSEDBYTES];
+  polyvec a;
+
+  polyvec_setrandom(&a);
+
+  polyvec_compress(out0, &a);
+  polyvec_compress_jazz(out1, &a);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../poly.h"
+#include "../polyvec.h"
+
+void polyvec_setrandom(polyvec *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  for(int i=0;i<KYBER_K;i++)
+    fread(r->vec[i].coeffs, sizeof(int16_t), KYBER_N, urandom);
+  fclose(urandom);
+  polyvec_reduce(r);
+}
+
+int main(void)
+{
+  polyvec r0, r1;
+
+  polyvec_setrandom(&r0);
+
+  for(int i = 0;i
+#include <stdio.h>
+#include "../polyvec.h"
+
+int main(void)
+{
+  unsigned char in[KYBER_POLYVECCOMPRESSEDBYTES];
+  polyvec r0, r1;
+
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(in, 1, KYBER_POLYVECCOMPRESSEDBYTES, urandom);
+  fclose(urandom);
+
+  polyvec_decompress(&r0, in);
+  polyvec_decompress_jazz(&r1, in);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../polyvec.h"
+
+int main(void)
+{
+  unsigned char in[KYBER_POLYVECBYTES];
+  polyvec r0, r1;
+
+  FILE *urandom = fopen("/dev/urandom", "r");
+  fread(in, 1, KYBER_POLYVECBYTES, urandom);
+  fclose(urandom);
+
+  polyvec_frombytes(&r0, in);
+  polyvec_frombytes_jazz(&r1, in);
+
+  for(int i=0;i
+#include <stdio.h>
+#include "../ntt.h"
+#include "../poly.h"
+#include "../polyvec.h"
+
+void polyvec_setrandom(polyvec *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  for(int i=0;i<KYBER_K;i++)
+    fread(r->vec[i].coeffs, sizeof(int16_t), KYBER_N, urandom);
+  for(int i=0;i<KYBER_K;i++)
+    for(int j=0;j<KYBER_N;j++)
+      r->vec[i].coeffs[j] %= KYBER_Q;
+  fclose(urandom);
+}
+
+int main(void)
+{
+  polyvec r0, r1;
+
+  polyvec_setrandom(&r0);
+
+  for(int i = 0;i
+#include <stdio.h>
+#include "../ntt.h"
+#include "../poly.h"
+#include "../polyvec.h"
+
+void polyvec_setrandom(polyvec *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  for(int i=0;i<KYBER_K;i++)
+    fread(r->vec[i].coeffs, sizeof(int16_t), KYBER_N, urandom);
+  for(int i=0;i<KYBER_K;i++)
+    for(int j=0;j<KYBER_N;j++)
+      r->vec[i].coeffs[j] %= 2*KYBER_Q;
+  fclose(urandom);
+}
+
+int main(void)
+{
+  polyvec r0, r1;
+
+  polyvec_setrandom(&r0);
+
+  for(int i = 0;i
+#include <stdio.h>
+#include "../ntt.h"
+#include "../polyvec.h"
+
+void polyvec_setrandom(polyvec *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  for(int i=0;i<KYBER_K;i++)
+    fread(r->vec[i].coeffs,
sizeof(int16_t), KYBER_N, urandom);
+  for(int i=0;i<KYBER_K;i++)
+    for(int j=0;j<KYBER_N;j++)
+      r->vec[i].coeffs[j] %= KYBER_Q;
+  fclose(urandom);
+}
+
+int main(void)
+{
+  polyvec a, b;
+  poly r0, r1;
+
+  polyvec_setrandom(&a);
+  polyvec_setrandom(&b);
+
+  polyvec_pointwise_acc(&r0, &a, &b);
+  polyvec_pointwise_acc_jazz(&r1, &a, &b);
+
+  for(int j=0;j
+#include <stdio.h>
+#include "../poly.h"
+#include "../polyvec.h"
+
+void polyvec_setrandom(polyvec *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  for(int i=0;i<KYBER_K;i++)
+    fread(r->vec[i].coeffs, sizeof(int16_t), KYBER_N, urandom);
+  fclose(urandom);
+}
+
+int main(void)
+{
+  polyvec r0, r1;
+
+  polyvec_setrandom(&r0);
+
+  for(int i = 0;i
+#include <stdio.h>
+#include "../polyvec.h"
+#include "../ntt.h"
+#include "../reduce.h"
+
+void polyvec_setrandom(polyvec *r)
+{
+  FILE *urandom = fopen("/dev/urandom", "r");
+  for(int i=0;i<KYBER_K;i++)
+    fread(r->vec[i].coeffs, sizeof(int16_t), KYBER_N, urandom);
+
+  polyvec_reduce(r);
+
+  fclose(urandom);
+}
+
+int main(void)
+{
+  unsigned char out0[KYBER_POLYVECBYTES];
+  unsigned char out1[KYBER_POLYVECBYTES];
+  polyvec a;
+
+  polyvec_setrandom(&a);
+
+  polyvec_tobytes(out0, &a);
+  polyvec_tobytes_jazz(out1, &a);
+
+  for(int i=0;i
+inline
+fn __verify(reg u64 ctp, reg ptr u8[KYBER_INDCPA_CIPHERTEXTBYTES] ctpc) -> reg u64
+{
+ reg u256 f g h;
+ reg u64 cnd t64;
+ reg u8 t1 t2;
+ reg bool zf;
+ inline int i off;
+
+ cnd = 0;
+ t64 = 1;
+ h = #set0_256();
+
+ for i=0 to KYBER_INDCPA_CIPHERTEXTBYTES/32
+ {
+ f = ctpc.[u256 32*i];
+ g = (u256)[ctp + 32*i];
+ f = #VPXOR_256(f, g);
+ h = #VPOR_256(h, f);
+ }
+
+ _, _, _, _, zf = #VPTEST_256(h, h);
+
+ cnd = t64 if !zf;
+
+ off = KYBER_INDCPA_CIPHERTEXTBYTES/32 * 32;
+
+ for i=off to KYBER_INDCPA_CIPHERTEXTBYTES
+ {
+ t1 = ctpc.[i];
+ t2 = (u8)[ctp + i];
+ t1 ^= t2;
+ t64 = (64u)t1;
+ cnd |= t64;
+ }
+
+ cnd = -cnd;
+ cnd >>= 63;
+
+ return cnd;
+}
+
+inline
+fn __cmov(reg u64 dst, reg ptr u8[KYBER_SYMBYTES] src, reg u64 cnd) -> reg u64
+{
+ reg u256 f g m;
+ stack u64 scnd;
+ reg u8 t1 t2 bcond;
+ inline int i off;
+
+ cnd = -cnd;
+ scnd = cnd;
+
+ m = #VPBROADCAST_4u64(scnd);
+
+ for i=0 to KYBER_SYMBYTES/32
+ {
+ f = src.[u256 32*i];
+ g = (u256)[dst + 32*i];
+ f = #VPBLENDVB_256(f, g, m);
+ (u256)[dst + 32*i] = f;
+ }
+
+ off = KYBER_SYMBYTES/32 * 32;
+
+ /* fixme: unused in 768, hence untested */
+ bcond = (8u)cnd;
+ for i=off to KYBER_SYMBYTES
+ {
+ t2 = (u8)[dst + i];
+ t1 = src[i];
+ t2 = t2 ^ t1;
+ t2 = t2 & bcond;
+ t1 ^= t2;
+ (u8)[dst + i] = t1;
+ }
+
+ return dst;
+}
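A scalar model of the pair above, for orientation: __verify folds a difference flag over the re-encrypted ciphertext, and __cmov takes src only when that flag is clear, with no secret-dependent branches (a sketch, not the verified code):

#include <stdint.h>
#include <stddef.h>

/* dst := src when ct == ctcmp (cnd ends up 0); dst is left alone otherwise */
static void verify_cmov(uint8_t *dst, const uint8_t *src, size_t klen,
                        const uint8_t *ct, const uint8_t *ctcmp, size_t ctlen) {
  uint64_t cnd = 0;
  uint8_t mask;
  size_t i;

  for(i = 0; i < ctlen; i++)
    cnd |= (uint64_t)(ct[i] ^ ctcmp[i]);  /* nonzero iff any byte differs */
  cnd = (0 - cnd) >> 63;                  /* 1 if different, 0 if equal */
  mask = (uint8_t)(0 - cnd);              /* 0xff keeps dst, 0x00 takes src */
  for(i = 0; i < klen; i++)
    dst[i] = src[i] ^ (mask & (src[i] ^ dst[i]));
}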