diff --git a/include/wally_core.h b/include/wally_core.h index 7ef5e17d6..8f1a722cb 100644 --- a/include/wally_core.h +++ b/include/wally_core.h @@ -28,6 +28,16 @@ extern "C" { #define WALLY_EINVAL -2 /** Invalid argument */ #define WALLY_ENOMEM -3 /** malloc() failed */ +/** + * Initialize wally. Currently this enables optimised SHA256 where supported. + * + * As wally is not currently threadsafe, this function should be called once + * before threads are created by the application. + * + * :param flags: Flags controlling what to initialize. Currently must be zero. + */ +WALLY_CORE_API int wally_init(uint32_t flags); + /** * Free any internally allocated memory. * @@ -74,6 +84,10 @@ WALLY_CORE_API int wally_free_string( * The caller should call this function before using any functions that rely on * libsecp256k1 (i.e. Anything using public/private keys). * + * As wally is not currently threadsafe, this function should either be + * called before threads are created, or access to wally functions should be + * wrapped in an application-level mutex. + * * :param bytes: Entropy to use. * :param bytes_len: Size of ``bytes`` in bytes. Must be ``WALLY_SECP_RANDOMISE_LEN``. 
*/ diff --git a/src/ccan/ccan/crypto/sha256/sha256.c b/src/ccan/ccan/crypto/sha256/sha256.c index f5567fca8..95c6927ed 100644 --- a/src/ccan/ccan/crypto/sha256/sha256.c +++ b/src/ccan/ccan/crypto/sha256/sha256.c @@ -13,25 +13,17 @@ #include #include +#ifdef CCAN_CRYPTO_SHA256_USE_OPENSSL static void invalidate_sha256(struct sha256_ctx *ctx) { -#ifdef CCAN_CRYPTO_SHA256_USE_OPENSSL ctx->c.md_len = 0; -#else - ctx->bytes = (size_t)-1; -#endif } static void check_sha256(struct sha256_ctx *ctx UNUSED) { -#ifdef CCAN_CRYPTO_SHA256_USE_OPENSSL assert(ctx->c.md_len != 0); -#else - assert(ctx->bytes != (size_t)-1); -#endif } -#ifdef CCAN_CRYPTO_SHA256_USE_OPENSSL void sha256_init(struct sha256_ctx *ctx) { SHA256_Init(&ctx->c); @@ -49,6 +41,16 @@ void sha256_done(struct sha256_ctx *ctx, struct sha256 *res) invalidate_sha256(ctx); } #else +static void invalidate_sha256(struct sha256_ctx *ctx) +{ + ctx->bytes = (size_t)-1; /* Sentinel value that check_sha256() asserts against */ +} + +static void check_sha256(struct sha256_ctx *ctx UNUSED) +{ + assert(ctx->bytes != (size_t)-1); +} + static uint32_t Ch(uint32_t x, uint32_t y, uint32_t z) { return z ^ (x & (y ^ z)); @@ -83,8 +85,8 @@ static void Round(uint32_t a, uint32_t b, uint32_t c, uint32_t *d, uint32_t e, u *h = t1 + t2; } -/** Perform one SHA-256 transformation, processing a 64-byte chunk. */ -static void Transform(uint32_t *s, const uint32_t *chunk, size_t blocks) +/** Perform a number of SHA-256 transformations, processing 64-byte chunks. 
*/ +static void TransformDefault(uint32_t *s, const uint32_t *chunk, size_t blocks) { while (blocks--) { uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f = s[5], g = s[6], h = s[7]; @@ -170,6 +172,25 @@ static void Transform(uint32_t *s, const uint32_t *chunk, size_t blocks) } } +#if defined(__x86_64__) || defined(__amd64__) +#include + +#include "sha256_sse4.c" + +static int use_optimized_transform = 0; /* Set to 1 by sha256_optimize() when the CPUID check passes */ +#endif + +static inline void Transform(uint32_t *s, const uint32_t *chunk, size_t blocks) +{ +#if defined(__x86_64__) || defined(__amd64__) + if (use_optimized_transform) { + TransformSSE4(s, chunk, blocks); + return; + } +#endif + TransformDefault(s, chunk, blocks); /* Portable C fallback */ +} + static void add(struct sha256_ctx *ctx, const void *p, size_t len) { const unsigned char *data = p; @@ -209,6 +230,16 @@ static void add(struct sha256_ctx *ctx, const void *p, size_t len) } } +void sha256_optimize(void) +{ +#if defined(__x86_64__) || defined(__amd64__) + uint32_t eax, ebx, ecx, edx; + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx >> 19) & 1) { /* CPUID leaf 1, ECX bit 19 = SSE4.1 */ + use_optimized_transform = 1; + } +#endif +} + void sha256_init(struct sha256_ctx *ctx) { struct sha256_ctx init = SHA256_INIT; diff --git a/src/ccan/ccan/crypto/sha256/sha256.h b/src/ccan/ccan/crypto/sha256/sha256.h index fe44aa645..47d162a31 100644 --- a/src/ccan/ccan/crypto/sha256/sha256.h +++ b/src/ccan/ccan/crypto/sha256/sha256.h @@ -26,6 +26,11 @@ struct sha256 { } u; }; +/** + * sha256_optimize - check for and enable optimised functionality if possible. + */ +void sha256_optimize(void); + /** * sha256 - return sha256 of an object. 
* @sha256: the sha256 to fill in diff --git a/src/ccan/ccan/crypto/sha256/sha256_sse4.c b/src/ccan/ccan/crypto/sha256/sha256_sse4.c new file mode 100644 index 000000000..8ead1dbf6 --- /dev/null +++ b/src/ccan/ccan/crypto/sha256/sha256_sse4.c @@ -0,0 +1,958 @@ +/* Copyright (c) 2017 The Bitcoin Core developers + * Distributed under the MIT software license, see the accompanying + * file COPYING or http://www.opensource.org/licenses/mit-license.php. + * + * (translated to c from Bitcoin Cores src/crypto/sha256_sse4.cpp). + */ + +#include +#include + +#if defined(__x86_64__) || defined(__amd64__) +/* TODO: Support alignment in compiler.h */ +#define ALIGNED(N) __attribute__((aligned(N))) + +void TransformSSE4(uint32_t* s, const uint32_t* chunk, size_t blocks) +{ + static const uint32_t K256[] ALIGNED(16) = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, + }; + static const uint32_t FLIP_MASK[] ALIGNED(16) = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f}; + static const uint32_t SHUF_00BA[] ALIGNED(16) = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff}; + static const uint32_t SHUF_DC00[] ALIGNED(16) = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908}; + uint32_t a, b, c, d, f, g, h, y0, y1, y2; + uint64_t tbl; + uint64_t 
inp_end, inp; + uint32_t xfer[4] ALIGNED(16); + + __asm__ __volatile__( + "shl $0x6,%2;" + "je Ldone_hash_%=;" + "add %1,%2;" + "mov %2,%14;" + "mov (%0),%3;" + "mov 0x4(%0),%4;" + "mov 0x8(%0),%5;" + "mov 0xc(%0),%6;" + "mov 0x10(%0),%k2;" + "mov 0x14(%0),%7;" + "mov 0x18(%0),%8;" + "mov 0x1c(%0),%9;" + "movdqa %18,%%xmm12;" + "movdqa %19,%%xmm10;" + "movdqa %20,%%xmm11;" + + "Lloop0_%=:" + "lea %17,%13;" + "movdqu (%1),%%xmm4;" + "pshufb %%xmm12,%%xmm4;" + "movdqu 0x10(%1),%%xmm5;" + "pshufb %%xmm12,%%xmm5;" + "movdqu 0x20(%1),%%xmm6;" + "pshufb %%xmm12,%%xmm6;" + "movdqu 0x30(%1),%%xmm7;" + "pshufb %%xmm12,%%xmm7;" + "mov %1,%15;" + "mov $3,%1;" + + "Lloop1_%=:" + "movdqa 0x0(%13),%%xmm9;" + "paddd %%xmm4,%%xmm9;" + "movdqa %%xmm9,%16;" + "movdqa %%xmm7,%%xmm0;" + "mov %k2,%10;" + "ror $0xe,%10;" + "mov %3,%11;" + "palignr $0x4,%%xmm6,%%xmm0;" + "ror $0x9,%11;" + "xor %k2,%10;" + "mov %7,%12;" + "ror $0x5,%10;" + "movdqa %%xmm5,%%xmm1;" + "xor %3,%11;" + "xor %8,%12;" + "paddd %%xmm4,%%xmm0;" + "xor %k2,%10;" + "and %k2,%12;" + "ror $0xb,%11;" + "palignr $0x4,%%xmm4,%%xmm1;" + "xor %3,%11;" + "ror $0x6,%10;" + "xor %8,%12;" + "movdqa %%xmm1,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add %16,%12;" + "movdqa %%xmm1,%%xmm3;" + "mov %3,%10;" + "add %12,%9;" + "mov %3,%12;" + "pslld $0x19,%%xmm1;" + "or %5,%10;" + "add %9,%6;" + "and %5,%12;" + "psrld $0x7,%%xmm2;" + "and %4,%10;" + "add %11,%9;" + "por %%xmm2,%%xmm1;" + "or %12,%10;" + "add %10,%9;" + "movdqa %%xmm3,%%xmm2;" + "mov %6,%10;" + "mov %9,%11;" + "movdqa %%xmm3,%%xmm8;" + "ror $0xe,%10;" + "xor %6,%10;" + "mov %k2,%12;" + "ror $0x9,%11;" + "pslld $0xe,%%xmm3;" + "xor %9,%11;" + "ror $0x5,%10;" + "xor %7,%12;" + "psrld $0x12,%%xmm2;" + "ror $0xb,%11;" + "xor %6,%10;" + "and %6,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm1;" + "xor %9,%11;" + "xor %7,%12;" + "psrld $0x3,%%xmm8;" + "add %10,%12;" + "add 4+%16,%12;" + "ror $0x2,%11;" + "pxor %%xmm2,%%xmm1;" + "mov %9,%10;" + "add %12,%8;" + "mov %9,%12;" 
+ "pxor %%xmm8,%%xmm1;" + "or %4,%10;" + "add %8,%5;" + "and %4,%12;" + "pshufd $0xfa,%%xmm7,%%xmm2;" + "and %3,%10;" + "add %11,%8;" + "paddd %%xmm1,%%xmm0;" + "or %12,%10;" + "add %10,%8;" + "movdqa %%xmm2,%%xmm3;" + "mov %5,%10;" + "mov %8,%11;" + "ror $0xe,%10;" + "movdqa %%xmm2,%%xmm8;" + "xor %5,%10;" + "ror $0x9,%11;" + "mov %6,%12;" + "xor %8,%11;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %k2,%12;" + "psrlq $0x13,%%xmm3;" + "xor %5,%10;" + "and %5,%12;" + "psrld $0xa,%%xmm8;" + "ror $0xb,%11;" + "xor %8,%11;" + "xor %k2,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm2;" + "add %10,%12;" + "ror $0x2,%11;" + "add 8+%16,%12;" + "pxor %%xmm2,%%xmm8;" + "mov %8,%10;" + "add %12,%7;" + "mov %8,%12;" + "pshufb %%xmm10,%%xmm8;" + "or %3,%10;" + "add %7,%4;" + "and %3,%12;" + "paddd %%xmm8,%%xmm0;" + "and %9,%10;" + "add %11,%7;" + "pshufd $0x50,%%xmm0,%%xmm2;" + "or %12,%10;" + "add %10,%7;" + "movdqa %%xmm2,%%xmm3;" + "mov %4,%10;" + "ror $0xe,%10;" + "mov %7,%11;" + "movdqa %%xmm2,%%xmm4;" + "ror $0x9,%11;" + "xor %4,%10;" + "mov %5,%12;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %7,%11;" + "xor %6,%12;" + "psrlq $0x13,%%xmm3;" + "xor %4,%10;" + "and %4,%12;" + "ror $0xb,%11;" + "psrld $0xa,%%xmm4;" + "xor %7,%11;" + "ror $0x6,%10;" + "xor %6,%12;" + "pxor %%xmm3,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add 12+%16,%12;" + "pxor %%xmm2,%%xmm4;" + "mov %7,%10;" + "add %12,%k2;" + "mov %7,%12;" + "pshufb %%xmm11,%%xmm4;" + "or %9,%10;" + "add %k2,%3;" + "and %9,%12;" + "paddd %%xmm0,%%xmm4;" + "and %8,%10;" + "add %11,%k2;" + "or %12,%10;" + "add %10,%k2;" + "movdqa 0x10(%13),%%xmm9;" + "paddd %%xmm5,%%xmm9;" + "movdqa %%xmm9,%16;" + "movdqa %%xmm4,%%xmm0;" + "mov %3,%10;" + "ror $0xe,%10;" + "mov %k2,%11;" + "palignr $0x4,%%xmm7,%%xmm0;" + "ror $0x9,%11;" + "xor %3,%10;" + "mov %4,%12;" + "ror $0x5,%10;" + "movdqa %%xmm6,%%xmm1;" + "xor %k2,%11;" + "xor %5,%12;" + "paddd %%xmm5,%%xmm0;" + "xor %3,%10;" + "and %3,%12;" + "ror $0xb,%11;" + "palignr 
$0x4,%%xmm5,%%xmm1;" + "xor %k2,%11;" + "ror $0x6,%10;" + "xor %5,%12;" + "movdqa %%xmm1,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add %16,%12;" + "movdqa %%xmm1,%%xmm3;" + "mov %k2,%10;" + "add %12,%6;" + "mov %k2,%12;" + "pslld $0x19,%%xmm1;" + "or %8,%10;" + "add %6,%9;" + "and %8,%12;" + "psrld $0x7,%%xmm2;" + "and %7,%10;" + "add %11,%6;" + "por %%xmm2,%%xmm1;" + "or %12,%10;" + "add %10,%6;" + "movdqa %%xmm3,%%xmm2;" + "mov %9,%10;" + "mov %6,%11;" + "movdqa %%xmm3,%%xmm8;" + "ror $0xe,%10;" + "xor %9,%10;" + "mov %3,%12;" + "ror $0x9,%11;" + "pslld $0xe,%%xmm3;" + "xor %6,%11;" + "ror $0x5,%10;" + "xor %4,%12;" + "psrld $0x12,%%xmm2;" + "ror $0xb,%11;" + "xor %9,%10;" + "and %9,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm1;" + "xor %6,%11;" + "xor %4,%12;" + "psrld $0x3,%%xmm8;" + "add %10,%12;" + "add 4+%16,%12;" + "ror $0x2,%11;" + "pxor %%xmm2,%%xmm1;" + "mov %6,%10;" + "add %12,%5;" + "mov %6,%12;" + "pxor %%xmm8,%%xmm1;" + "or %7,%10;" + "add %5,%8;" + "and %7,%12;" + "pshufd $0xfa,%%xmm4,%%xmm2;" + "and %k2,%10;" + "add %11,%5;" + "paddd %%xmm1,%%xmm0;" + "or %12,%10;" + "add %10,%5;" + "movdqa %%xmm2,%%xmm3;" + "mov %8,%10;" + "mov %5,%11;" + "ror $0xe,%10;" + "movdqa %%xmm2,%%xmm8;" + "xor %8,%10;" + "ror $0x9,%11;" + "mov %9,%12;" + "xor %5,%11;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %3,%12;" + "psrlq $0x13,%%xmm3;" + "xor %8,%10;" + "and %8,%12;" + "psrld $0xa,%%xmm8;" + "ror $0xb,%11;" + "xor %5,%11;" + "xor %3,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm2;" + "add %10,%12;" + "ror $0x2,%11;" + "add 8+%16,%12;" + "pxor %%xmm2,%%xmm8;" + "mov %5,%10;" + "add %12,%4;" + "mov %5,%12;" + "pshufb %%xmm10,%%xmm8;" + "or %k2,%10;" + "add %4,%7;" + "and %k2,%12;" + "paddd %%xmm8,%%xmm0;" + "and %6,%10;" + "add %11,%4;" + "pshufd $0x50,%%xmm0,%%xmm2;" + "or %12,%10;" + "add %10,%4;" + "movdqa %%xmm2,%%xmm3;" + "mov %7,%10;" + "ror $0xe,%10;" + "mov %4,%11;" + "movdqa %%xmm2,%%xmm5;" + "ror $0x9,%11;" + "xor %7,%10;" + "mov %8,%12;" + "ror 
$0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %4,%11;" + "xor %9,%12;" + "psrlq $0x13,%%xmm3;" + "xor %7,%10;" + "and %7,%12;" + "ror $0xb,%11;" + "psrld $0xa,%%xmm5;" + "xor %4,%11;" + "ror $0x6,%10;" + "xor %9,%12;" + "pxor %%xmm3,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add 12+%16,%12;" + "pxor %%xmm2,%%xmm5;" + "mov %4,%10;" + "add %12,%3;" + "mov %4,%12;" + "pshufb %%xmm11,%%xmm5;" + "or %6,%10;" + "add %3,%k2;" + "and %6,%12;" + "paddd %%xmm0,%%xmm5;" + "and %5,%10;" + "add %11,%3;" + "or %12,%10;" + "add %10,%3;" + "movdqa 0x20(%13),%%xmm9;" + "paddd %%xmm6,%%xmm9;" + "movdqa %%xmm9,%16;" + "movdqa %%xmm5,%%xmm0;" + "mov %k2,%10;" + "ror $0xe,%10;" + "mov %3,%11;" + "palignr $0x4,%%xmm4,%%xmm0;" + "ror $0x9,%11;" + "xor %k2,%10;" + "mov %7,%12;" + "ror $0x5,%10;" + "movdqa %%xmm7,%%xmm1;" + "xor %3,%11;" + "xor %8,%12;" + "paddd %%xmm6,%%xmm0;" + "xor %k2,%10;" + "and %k2,%12;" + "ror $0xb,%11;" + "palignr $0x4,%%xmm6,%%xmm1;" + "xor %3,%11;" + "ror $0x6,%10;" + "xor %8,%12;" + "movdqa %%xmm1,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add %16,%12;" + "movdqa %%xmm1,%%xmm3;" + "mov %3,%10;" + "add %12,%9;" + "mov %3,%12;" + "pslld $0x19,%%xmm1;" + "or %5,%10;" + "add %9,%6;" + "and %5,%12;" + "psrld $0x7,%%xmm2;" + "and %4,%10;" + "add %11,%9;" + "por %%xmm2,%%xmm1;" + "or %12,%10;" + "add %10,%9;" + "movdqa %%xmm3,%%xmm2;" + "mov %6,%10;" + "mov %9,%11;" + "movdqa %%xmm3,%%xmm8;" + "ror $0xe,%10;" + "xor %6,%10;" + "mov %k2,%12;" + "ror $0x9,%11;" + "pslld $0xe,%%xmm3;" + "xor %9,%11;" + "ror $0x5,%10;" + "xor %7,%12;" + "psrld $0x12,%%xmm2;" + "ror $0xb,%11;" + "xor %6,%10;" + "and %6,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm1;" + "xor %9,%11;" + "xor %7,%12;" + "psrld $0x3,%%xmm8;" + "add %10,%12;" + "add 4+%16,%12;" + "ror $0x2,%11;" + "pxor %%xmm2,%%xmm1;" + "mov %9,%10;" + "add %12,%8;" + "mov %9,%12;" + "pxor %%xmm8,%%xmm1;" + "or %4,%10;" + "add %8,%5;" + "and %4,%12;" + "pshufd $0xfa,%%xmm5,%%xmm2;" + "and %3,%10;" + "add %11,%8;" + "paddd 
%%xmm1,%%xmm0;" + "or %12,%10;" + "add %10,%8;" + "movdqa %%xmm2,%%xmm3;" + "mov %5,%10;" + "mov %8,%11;" + "ror $0xe,%10;" + "movdqa %%xmm2,%%xmm8;" + "xor %5,%10;" + "ror $0x9,%11;" + "mov %6,%12;" + "xor %8,%11;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %k2,%12;" + "psrlq $0x13,%%xmm3;" + "xor %5,%10;" + "and %5,%12;" + "psrld $0xa,%%xmm8;" + "ror $0xb,%11;" + "xor %8,%11;" + "xor %k2,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm2;" + "add %10,%12;" + "ror $0x2,%11;" + "add 8+%16,%12;" + "pxor %%xmm2,%%xmm8;" + "mov %8,%10;" + "add %12,%7;" + "mov %8,%12;" + "pshufb %%xmm10,%%xmm8;" + "or %3,%10;" + "add %7,%4;" + "and %3,%12;" + "paddd %%xmm8,%%xmm0;" + "and %9,%10;" + "add %11,%7;" + "pshufd $0x50,%%xmm0,%%xmm2;" + "or %12,%10;" + "add %10,%7;" + "movdqa %%xmm2,%%xmm3;" + "mov %4,%10;" + "ror $0xe,%10;" + "mov %7,%11;" + "movdqa %%xmm2,%%xmm6;" + "ror $0x9,%11;" + "xor %4,%10;" + "mov %5,%12;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %7,%11;" + "xor %6,%12;" + "psrlq $0x13,%%xmm3;" + "xor %4,%10;" + "and %4,%12;" + "ror $0xb,%11;" + "psrld $0xa,%%xmm6;" + "xor %7,%11;" + "ror $0x6,%10;" + "xor %6,%12;" + "pxor %%xmm3,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add 12+%16,%12;" + "pxor %%xmm2,%%xmm6;" + "mov %7,%10;" + "add %12,%k2;" + "mov %7,%12;" + "pshufb %%xmm11,%%xmm6;" + "or %9,%10;" + "add %k2,%3;" + "and %9,%12;" + "paddd %%xmm0,%%xmm6;" + "and %8,%10;" + "add %11,%k2;" + "or %12,%10;" + "add %10,%k2;" + "movdqa 0x30(%13),%%xmm9;" + "paddd %%xmm7,%%xmm9;" + "movdqa %%xmm9,%16;" + "add $0x40,%13;" + "movdqa %%xmm6,%%xmm0;" + "mov %3,%10;" + "ror $0xe,%10;" + "mov %k2,%11;" + "palignr $0x4,%%xmm5,%%xmm0;" + "ror $0x9,%11;" + "xor %3,%10;" + "mov %4,%12;" + "ror $0x5,%10;" + "movdqa %%xmm4,%%xmm1;" + "xor %k2,%11;" + "xor %5,%12;" + "paddd %%xmm7,%%xmm0;" + "xor %3,%10;" + "and %3,%12;" + "ror $0xb,%11;" + "palignr $0x4,%%xmm7,%%xmm1;" + "xor %k2,%11;" + "ror $0x6,%10;" + "xor %5,%12;" + "movdqa %%xmm1,%%xmm2;" + "ror $0x2,%11;" + "add 
%10,%12;" + "add %16,%12;" + "movdqa %%xmm1,%%xmm3;" + "mov %k2,%10;" + "add %12,%6;" + "mov %k2,%12;" + "pslld $0x19,%%xmm1;" + "or %8,%10;" + "add %6,%9;" + "and %8,%12;" + "psrld $0x7,%%xmm2;" + "and %7,%10;" + "add %11,%6;" + "por %%xmm2,%%xmm1;" + "or %12,%10;" + "add %10,%6;" + "movdqa %%xmm3,%%xmm2;" + "mov %9,%10;" + "mov %6,%11;" + "movdqa %%xmm3,%%xmm8;" + "ror $0xe,%10;" + "xor %9,%10;" + "mov %3,%12;" + "ror $0x9,%11;" + "pslld $0xe,%%xmm3;" + "xor %6,%11;" + "ror $0x5,%10;" + "xor %4,%12;" + "psrld $0x12,%%xmm2;" + "ror $0xb,%11;" + "xor %9,%10;" + "and %9,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm1;" + "xor %6,%11;" + "xor %4,%12;" + "psrld $0x3,%%xmm8;" + "add %10,%12;" + "add 4+%16,%12;" + "ror $0x2,%11;" + "pxor %%xmm2,%%xmm1;" + "mov %6,%10;" + "add %12,%5;" + "mov %6,%12;" + "pxor %%xmm8,%%xmm1;" + "or %7,%10;" + "add %5,%8;" + "and %7,%12;" + "pshufd $0xfa,%%xmm6,%%xmm2;" + "and %k2,%10;" + "add %11,%5;" + "paddd %%xmm1,%%xmm0;" + "or %12,%10;" + "add %10,%5;" + "movdqa %%xmm2,%%xmm3;" + "mov %8,%10;" + "mov %5,%11;" + "ror $0xe,%10;" + "movdqa %%xmm2,%%xmm8;" + "xor %8,%10;" + "ror $0x9,%11;" + "mov %9,%12;" + "xor %5,%11;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %3,%12;" + "psrlq $0x13,%%xmm3;" + "xor %8,%10;" + "and %8,%12;" + "psrld $0xa,%%xmm8;" + "ror $0xb,%11;" + "xor %5,%11;" + "xor %3,%12;" + "ror $0x6,%10;" + "pxor %%xmm3,%%xmm2;" + "add %10,%12;" + "ror $0x2,%11;" + "add 8+%16,%12;" + "pxor %%xmm2,%%xmm8;" + "mov %5,%10;" + "add %12,%4;" + "mov %5,%12;" + "pshufb %%xmm10,%%xmm8;" + "or %k2,%10;" + "add %4,%7;" + "and %k2,%12;" + "paddd %%xmm8,%%xmm0;" + "and %6,%10;" + "add %11,%4;" + "pshufd $0x50,%%xmm0,%%xmm2;" + "or %12,%10;" + "add %10,%4;" + "movdqa %%xmm2,%%xmm3;" + "mov %7,%10;" + "ror $0xe,%10;" + "mov %4,%11;" + "movdqa %%xmm2,%%xmm7;" + "ror $0x9,%11;" + "xor %7,%10;" + "mov %8,%12;" + "ror $0x5,%10;" + "psrlq $0x11,%%xmm2;" + "xor %4,%11;" + "xor %9,%12;" + "psrlq $0x13,%%xmm3;" + "xor %7,%10;" + "and %7,%12;" + 
"ror $0xb,%11;" + "psrld $0xa,%%xmm7;" + "xor %4,%11;" + "ror $0x6,%10;" + "xor %9,%12;" + "pxor %%xmm3,%%xmm2;" + "ror $0x2,%11;" + "add %10,%12;" + "add 12+%16,%12;" + "pxor %%xmm2,%%xmm7;" + "mov %4,%10;" + "add %12,%3;" + "mov %4,%12;" + "pshufb %%xmm11,%%xmm7;" + "or %6,%10;" + "add %3,%k2;" + "and %6,%12;" + "paddd %%xmm0,%%xmm7;" + "and %5,%10;" + "add %11,%3;" + "or %12,%10;" + "add %10,%3;" + "sub $0x1,%1;" + "jne Lloop1_%=;" + "mov $0x2,%1;" + + "Lloop2_%=:" + "paddd 0x0(%13),%%xmm4;" + "movdqa %%xmm4,%16;" + "mov %k2,%10;" + "ror $0xe,%10;" + "mov %3,%11;" + "xor %k2,%10;" + "ror $0x9,%11;" + "mov %7,%12;" + "xor %3,%11;" + "ror $0x5,%10;" + "xor %8,%12;" + "xor %k2,%10;" + "ror $0xb,%11;" + "and %k2,%12;" + "xor %3,%11;" + "ror $0x6,%10;" + "xor %8,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add %16,%12;" + "mov %3,%10;" + "add %12,%9;" + "mov %3,%12;" + "or %5,%10;" + "add %9,%6;" + "and %5,%12;" + "and %4,%10;" + "add %11,%9;" + "or %12,%10;" + "add %10,%9;" + "mov %6,%10;" + "ror $0xe,%10;" + "mov %9,%11;" + "xor %6,%10;" + "ror $0x9,%11;" + "mov %k2,%12;" + "xor %9,%11;" + "ror $0x5,%10;" + "xor %7,%12;" + "xor %6,%10;" + "ror $0xb,%11;" + "and %6,%12;" + "xor %9,%11;" + "ror $0x6,%10;" + "xor %7,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add 4+%16,%12;" + "mov %9,%10;" + "add %12,%8;" + "mov %9,%12;" + "or %4,%10;" + "add %8,%5;" + "and %4,%12;" + "and %3,%10;" + "add %11,%8;" + "or %12,%10;" + "add %10,%8;" + "mov %5,%10;" + "ror $0xe,%10;" + "mov %8,%11;" + "xor %5,%10;" + "ror $0x9,%11;" + "mov %6,%12;" + "xor %8,%11;" + "ror $0x5,%10;" + "xor %k2,%12;" + "xor %5,%10;" + "ror $0xb,%11;" + "and %5,%12;" + "xor %8,%11;" + "ror $0x6,%10;" + "xor %k2,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add 8+%16,%12;" + "mov %8,%10;" + "add %12,%7;" + "mov %8,%12;" + "or %3,%10;" + "add %7,%4;" + "and %3,%12;" + "and %9,%10;" + "add %11,%7;" + "or %12,%10;" + "add %10,%7;" + "mov %4,%10;" + "ror $0xe,%10;" + "mov %7,%11;" + "xor %4,%10;" + "ror $0x9,%11;" + 
"mov %5,%12;" + "xor %7,%11;" + "ror $0x5,%10;" + "xor %6,%12;" + "xor %4,%10;" + "ror $0xb,%11;" + "and %4,%12;" + "xor %7,%11;" + "ror $0x6,%10;" + "xor %6,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add 12+%16,%12;" + "mov %7,%10;" + "add %12,%k2;" + "mov %7,%12;" + "or %9,%10;" + "add %k2,%3;" + "and %9,%12;" + "and %8,%10;" + "add %11,%k2;" + "or %12,%10;" + "add %10,%k2;" + "paddd 0x10(%13),%%xmm5;" + "movdqa %%xmm5,%16;" + "add $0x20,%13;" + "mov %3,%10;" + "ror $0xe,%10;" + "mov %k2,%11;" + "xor %3,%10;" + "ror $0x9,%11;" + "mov %4,%12;" + "xor %k2,%11;" + "ror $0x5,%10;" + "xor %5,%12;" + "xor %3,%10;" + "ror $0xb,%11;" + "and %3,%12;" + "xor %k2,%11;" + "ror $0x6,%10;" + "xor %5,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add %16,%12;" + "mov %k2,%10;" + "add %12,%6;" + "mov %k2,%12;" + "or %8,%10;" + "add %6,%9;" + "and %8,%12;" + "and %7,%10;" + "add %11,%6;" + "or %12,%10;" + "add %10,%6;" + "mov %9,%10;" + "ror $0xe,%10;" + "mov %6,%11;" + "xor %9,%10;" + "ror $0x9,%11;" + "mov %3,%12;" + "xor %6,%11;" + "ror $0x5,%10;" + "xor %4,%12;" + "xor %9,%10;" + "ror $0xb,%11;" + "and %9,%12;" + "xor %6,%11;" + "ror $0x6,%10;" + "xor %4,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add 4+%16,%12;" + "mov %6,%10;" + "add %12,%5;" + "mov %6,%12;" + "or %7,%10;" + "add %5,%8;" + "and %7,%12;" + "and %k2,%10;" + "add %11,%5;" + "or %12,%10;" + "add %10,%5;" + "mov %8,%10;" + "ror $0xe,%10;" + "mov %5,%11;" + "xor %8,%10;" + "ror $0x9,%11;" + "mov %9,%12;" + "xor %5,%11;" + "ror $0x5,%10;" + "xor %3,%12;" + "xor %8,%10;" + "ror $0xb,%11;" + "and %8,%12;" + "xor %5,%11;" + "ror $0x6,%10;" + "xor %3,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add 8+%16,%12;" + "mov %5,%10;" + "add %12,%4;" + "mov %5,%12;" + "or %k2,%10;" + "add %4,%7;" + "and %k2,%12;" + "and %6,%10;" + "add %11,%4;" + "or %12,%10;" + "add %10,%4;" + "mov %7,%10;" + "ror $0xe,%10;" + "mov %4,%11;" + "xor %7,%10;" + "ror $0x9,%11;" + "mov %8,%12;" + "xor %4,%11;" + "ror $0x5,%10;" + "xor %9,%12;" + "xor 
%7,%10;" + "ror $0xb,%11;" + "and %7,%12;" + "xor %4,%11;" + "ror $0x6,%10;" + "xor %9,%12;" + "add %10,%12;" + "ror $0x2,%11;" + "add 12+%16,%12;" + "mov %4,%10;" + "add %12,%3;" + "mov %4,%12;" + "or %6,%10;" + "add %3,%k2;" + "and %6,%12;" + "and %5,%10;" + "add %11,%3;" + "or %12,%10;" + "add %10,%3;" + "movdqa %%xmm6,%%xmm4;" + "movdqa %%xmm7,%%xmm5;" + "sub $0x1,%1;" + "jne Lloop2_%=;" + "add (%0),%3;" + "mov %3,(%0);" + "add 0x4(%0),%4;" + "mov %4,0x4(%0);" + "add 0x8(%0),%5;" + "mov %5,0x8(%0);" + "add 0xc(%0),%6;" + "mov %6,0xc(%0);" + "add 0x10(%0),%k2;" + "mov %k2,0x10(%0);" + "add 0x14(%0),%7;" + "mov %7,0x14(%0);" + "add 0x18(%0),%8;" + "mov %8,0x18(%0);" + "add 0x1c(%0),%9;" + "mov %9,0x1c(%0);" + "mov %15,%1;" + "add $0x40,%1;" + "cmp %14,%1;" + "jne Lloop0_%=;" + + "Ldone_hash_%=:" + + : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer) + : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00) + : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12" + ); +} + +#endif diff --git a/src/internal.c b/src/internal.c index d0be42c85..0b0ad91ef 100644 --- a/src/internal.c +++ b/src/internal.c @@ -36,7 +36,7 @@ int wally_secp_randomize(const unsigned char *bytes, size_t bytes_len) if (!bytes || bytes_len != WALLY_SECP_RANDOMISE_LEN) return WALLY_EINVAL; - if (!(ctx = (secp256k1_context *)secp_ctx())) + if (!(ctx = secp_ctx())) return WALLY_ENOMEM; if (!secp256k1_context_randomize(ctx, bytes)) @@ -267,6 +267,21 @@ void wally_clear_6(void *p, size_t len, void *p2, size_t len2, _ops.bzero_fn(p6, len6); } +static bool wally_init_done = false; + +int wally_init(uint32_t flags) +{ + if (flags) + return WALLY_EINVAL; + + if (!wally_init_done) { + sha256_optimize(); + wally_init_done = true; + } + + return WALLY_OK; +} + int wally_cleanup(uint32_t 
flags) { if (flags) diff --git a/src/test/test_hash.py b/src/test/test_hash.py index 9f511bf30..9f8a91a77 100755 --- a/src/test/test_hash.py +++ b/src/test/test_hash.py @@ -79,7 +79,7 @@ def do_hash(self, fn, hex_in, aligned=True): return utf8(result) - def test_sha_vectors(self): + def _do_test_sha_vectors(self): for in_msg, values in sha2_cases.items(): msg = h(utf8(in_msg)) for i, fn in enumerate([wally_sha256, wally_sha512, wally_sha256d]): @@ -90,6 +90,12 @@ def test_sha_vectors(self): self.assertEqual(result, expected) + def test_sha_vectors(self): + self. _do_test_sha_vectors() + wally_init(0) # Enable optimised SHA256 and re-test + self. _do_test_sha_vectors() + + def test_hash160_vectors(self): for msg, expected in hash160_cases: for aligned in [True, False]: diff --git a/src/test/util.py b/src/test/util.py index 7678fb893..ef09a1499 100755 --- a/src/test/util.py +++ b/src/test/util.py @@ -54,6 +54,8 @@ class c_ulong_p_class(object): c_uint_p = POINTER(c_uint) for f in ( + ('wally_init', c_int, [c_uint]), + ('wally_cleanup', c_int, [c_uint]), ('wordlist_init', c_void_p, [c_char_p]), ('wordlist_lookup_word', c_ulong, [c_void_p, c_char_p]), ('wordlist_lookup_index', c_char_p, [c_void_p, c_ulong]), @@ -138,8 +140,8 @@ def int_fn_wrapper(fn, *args): return [p.value, (ret, p.value)][fn.restype is not None] name, restype, argtypes = f - is_str_fn = type(argtypes[-1]) is c_char_p_p_class - is_int_fn = type(argtypes[-1]) is c_ulong_p_class + is_str_fn = len(argtypes) and type(argtypes[-1]) is c_char_p_p_class + is_int_fn = len(argtypes) and type(argtypes[-1]) is c_ulong_p_class if is_str_fn: argtypes[-1] = POINTER(c_char_p) elif is_int_fn: