diff --git a/.gitignore b/.gitignore index c8a66d28808..4c3c98b72f0 100644 --- a/.gitignore +++ b/.gitignore @@ -24,38 +24,6 @@ bld/ ![Cc]ore/[Ll]og/ tests/documentation/verify/ tests/documentation/all.odin-doc -tests/internal/test_map -tests/internal/test_pow -tests/internal/test_rtti -tests/core/test_base64 -tests/core/test_cbor -tests/core/test_core_compress -tests/core/test_core_container -tests/core/test_core_filepath -tests/core/test_core_fmt -tests/core/test_core_i18n -tests/core/test_core_image -tests/core/test_core_libc -tests/core/test_core_match -tests/core/test_core_math -tests/core/test_core_net -tests/core/test_core_os_exit -tests/core/test_core_reflect -tests/core/test_core_strings -tests/core/test_core_time -tests/core/test_crypto -tests/core/test_hash -tests/core/test_hex -tests/core/test_hxa -tests/core/test_json -tests/core/test_linalg_glsl_math -tests/core/test_noise -tests/core/test_varint -tests/core/test_xml -tests/core/test_core_slice -tests/core/test_core_thread -tests/core/test_core_runtime -tests/vendor/vendor_botan # Visual Studio 2015 cache/options directory .vs/ # Visual Studio Code options directory @@ -63,6 +31,7 @@ tests/vendor/vendor_botan # Uncomment if you have tasks that create the project's static files in wwwroot #wwwroot/ demo +benchmark # MSTest test Results [Tt]est[Rr]esult*/ diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin index 208949fd8bc..7cbf092acb1 100644 --- a/core/bytes/bytes.odin +++ b/core/bytes/bytes.odin @@ -1167,3 +1167,28 @@ fields_proc :: proc(s: []byte, f: proc(rune) -> bool, allocator := context.alloc return subslices[:] } + +// alias returns true iff a and b have a non-zero length, and any part of +// a overlaps with b. +alias :: proc "contextless" (a, b: []byte) -> bool { + a_len, b_len := len(a), len(b) + if a_len == 0 || b_len == 0 { + return false + } + + a_start, b_start := uintptr(raw_data(a)), uintptr(raw_data(b)) + a_end, b_end := a_start + uintptr(a_len-1), b_start + uintptr(b_len-1) + + return a_start <= b_end && b_start <= a_end +} + +// alias_inexactly returns true iff a and b have a non-zero length, +// the base pointer of a and b are NOT equal, and any part of a overlaps +// with b (ie: `alias(a, b)` with an exception that returns false for +// `a == b`, `b = a[:len(a)-69]` and similar conditions). +alias_inexactly :: proc "contextless" (a, b: []byte) -> bool { + if raw_data(a) == raw_data(b) { + return false + } + return alias(a, b) +} diff --git a/core/crypto/_aes/ct64/api.odin b/core/crypto/_aes/ct64/api.odin index ae624971cd4..f57a630b124 100644 --- a/core/crypto/_aes/ct64/api.odin +++ b/core/crypto/_aes/ct64/api.odin @@ -7,9 +7,8 @@ STRIDE :: 4 // Context is a keyed AES (ECB) instance. Context :: struct { - _sk_exp: [120]u64, - _num_rounds: int, - _is_initialized: bool, + _sk_exp: [120]u64, + _num_rounds: int, } // init initializes a context for AES with the provided key. @@ -18,13 +17,10 @@ init :: proc(ctx: ^Context, key: []byte) { ctx._num_rounds = keysched(skey[:], key) skey_expand(ctx._sk_exp[:], skey[:], ctx._num_rounds) - ctx._is_initialized = true } // encrypt_block sets `dst` to `AES-ECB-Encrypt(src)`. encrypt_block :: proc(ctx: ^Context, dst, src: []byte) { - assert(ctx._is_initialized) - q: [8]u64 load_blockx1(&q, src) _encrypt(&q, ctx._sk_exp[:], ctx._num_rounds) @@ -33,8 +29,6 @@ encrypt_block :: proc(ctx: ^Context, dst, src: []byte) { // encrypt_block sets `dst` to `AES-ECB-Decrypt(src)`. 
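For readers skimming the new `core:bytes` helpers above, a minimal sketch of how they classify overlap (not part of the patch; the buffer and slice names are placeholders, and `core:bytes` is assumed to be imported as `bytes`):

    buf: [32]byte
    a := buf[:16]
    b := buf[8:24]
    _ = bytes.alias(a, b)               // true: the ranges overlap
    _ = bytes.alias_inexactly(a, b)     // true: overlap, different base pointers
    _ = bytes.alias_inexactly(a, a[:8]) // false: same base pointer, so the aliasing is "exact"
    _ = bytes.alias(a, buf[16:])        // false: fully disjoint ranges

The cipher changes later in this diff only reject the inexact case, since in-place operation (`dst == src`) remains supported.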
decrypt_block :: proc(ctx: ^Context, dst, src: []byte) { - assert(ctx._is_initialized) - q: [8]u64 load_blockx1(&q, src) _decrypt(&q, ctx._sk_exp[:], ctx._num_rounds) @@ -43,8 +37,6 @@ decrypt_block :: proc(ctx: ^Context, dst, src: []byte) { // encrypt_blocks sets `dst` to `AES-ECB-Encrypt(src[0], .. src[n])`. encrypt_blocks :: proc(ctx: ^Context, dst, src: [][]byte) { - assert(ctx._is_initialized) - q: [8]u64 = --- src, dst := src, dst @@ -67,8 +59,6 @@ encrypt_blocks :: proc(ctx: ^Context, dst, src: [][]byte) { // decrypt_blocks sets dst to `AES-ECB-Decrypt(src[0], .. src[n])`. decrypt_blocks :: proc(ctx: ^Context, dst, src: [][]byte) { - assert(ctx._is_initialized) - q: [8]u64 = --- src, dst := src, dst diff --git a/core/crypto/_aes/hw_intel/api.odin b/core/crypto/_aes/hw_intel/api.odin new file mode 100644 index 00000000000..5cb5a68bb57 --- /dev/null +++ b/core/crypto/_aes/hw_intel/api.odin @@ -0,0 +1,43 @@ +//+build amd64 +package aes_hw_intel + +import "core:sys/info" + +// is_supported returns true iff hardware accelerated AES +// is supported. +is_supported :: proc "contextless" () -> bool { + features, ok := info.cpu_features.? + if !ok { + return false + } + + // Note: Everything with AES-NI and PCLMULQDQ has support for + // the required SSE extensions. + req_features :: info.CPU_Features{ + .sse2, + .ssse3, + .sse41, + .aes, + .pclmulqdq, + } + return features >= req_features +} + +// Context is a keyed AES (ECB) instance. +Context :: struct { + // Note: The ideal thing to do is for the expanded round keys to be + // arrays of `__m128i`, however that implies alignment (or using AVX). + // + // All the people using e-waste processors that don't support an + // instruction set that has been around for over 10 years are why + // we can't have nice things. + _sk_exp_enc: [15][16]byte, + _sk_exp_dec: [15][16]byte, + _num_rounds: int, +} + +// init initializes a context for AES with the provided key. +init :: proc(ctx: ^Context, key: []byte) { + keysched(ctx, key) +} + diff --git a/core/crypto/_aes/hw_intel/ghash.odin b/core/crypto/_aes/hw_intel/ghash.odin new file mode 100644 index 00000000000..9a520852336 --- /dev/null +++ b/core/crypto/_aes/hw_intel/ghash.odin @@ -0,0 +1,281 @@ +// Copyright (c) 2017 Thomas Pornin +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR +// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
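Before the implementation, a rough sketch of how GCM is expected to drive the `ghash` procedure defined in this file (buffer names and the 16-byte sizes are illustrative; the actual callers appear later in this diff). `dst` doubles as the running digest, which is why it is both an input and an output:

    s: [16]byte                      // running digest, starts all-zero
    h: [16]byte                      // hash key, the AES-ECB encryption of the zero block
    ghash(s[:], h[:], aad)           // absorb the AAD (short inputs are zero-padded internally)
    ghash(s[:], h[:], ciphertext)    // absorb the ciphertext
    ghash(s[:], h[:], len_block[:])  // absorb the 128-bit length block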
+ +//+build amd64 +package aes_hw_intel + +import "base:intrinsics" +import "core:crypto/_aes" +import "core:simd" +import "core:simd/x86" + +@(private = "file") +GHASH_STRIDE_HW :: 4 +@(private = "file") +GHASH_STRIDE_BYTES_HW :: GHASH_STRIDE_HW * _aes.GHASH_BLOCK_SIZE + +// GHASH is defined over elements of GF(2^128) with "full little-endian" +// representation: leftmost byte is least significant, and, within each +// byte, leftmost _bit_ is least significant. The natural ordering in +// x86 is "mixed little-endian": bytes are ordered from least to most +// significant, but bits within a byte are in most-to-least significant +// order. Going to full little-endian representation would require +// reversing bits within each byte, which is doable but expensive. +// +// Instead, we go to full big-endian representation, by swapping bytes +// around, which is done with a single _mm_shuffle_epi8() opcode (it +// comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We +// can use a full big-endian representation because in a carryless +// multiplication, we have a nice bit reversal property: +// +// rev_128(x) * rev_128(y) = rev_255(x * y) +// +// So by using full big-endian, we still get the right result, except +// that it is right-shifted by 1 bit. The left-shift is relatively +// inexpensive, and it can be mutualised. +// +// Since SSE2 opcodes do not have facilities for shifting full 128-bit +// values with bit precision, we have to break down values into 64-bit +// chunks. We number chunks from 0 to 3 in left to right order. + +@(private = "file") +byteswap_index := transmute(x86.__m128i)simd.i8x16{ + // Note: simd.i8x16 is reverse order from x86._mm_set_epi8. + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, +} + +@(private = "file", require_results, enable_target_feature = "sse2,ssse3") +byteswap :: #force_inline proc "contextless" (x: x86.__m128i) -> x86.__m128i { + return x86._mm_shuffle_epi8(x, byteswap_index) +} + +// From a 128-bit value kw, compute kx as the XOR of the two 64-bit +// halves of kw (into the right half of kx; left half is unspecified), +// and return kx. +@(private = "file", require_results, enable_target_feature = "sse2") +bk :: #force_inline proc "contextless" (kw: x86.__m128i) -> x86.__m128i { + return x86._mm_xor_si128(kw, x86._mm_shuffle_epi32(kw, 0x0e)) +} + +// Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and +// the XOR of the two values (kx), and return (kw, kx). +@(private = "file", enable_target_feature = "sse2") +pbk :: #force_inline proc "contextless" (k0, k1: x86.__m128i) -> (x86.__m128i, x86.__m128i) { + kw := x86._mm_unpacklo_epi64(k1, k0) + kx := x86._mm_xor_si128(k0, k1) + return kw, kx +} + +// Left-shift by 1 bit a 256-bit value (in four 64-bit words). +@(private = "file", require_results, enable_target_feature = "sse2") +sl_256 :: #force_inline proc "contextless" (x0, x1, x2, x3: x86.__m128i) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) { + x0, x1, x2, x3 := x0, x1, x2, x3 + + x0 = x86._mm_or_si128(x86._mm_slli_epi64(x0, 1), x86._mm_srli_epi64(x1, 63)) + x1 = x86._mm_or_si128(x86._mm_slli_epi64(x1, 1), x86._mm_srli_epi64(x2, 63)) + x2 = x86._mm_or_si128(x86._mm_slli_epi64(x2, 1), x86._mm_srli_epi64(x3, 63)) + x3 = x86._mm_slli_epi64(x3, 1) + + return x0, x1, x2, x3 +} + +// Perform reduction in GF(2^128).
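A short worked note on where the shift counts in the reduction below come from (standard GHASH arithmetic, not something introduced by this patch). The field is GF(2^128) reduced modulo

    x^128 + x^7 + x^2 + x + 1

so folding the upper 128 bits of a 256-bit product back down uses the x, x^2 and x^7 terms: right-shifts by 1, 2 and 7 within a 64-bit chunk, plus left-shifts by 63, 62 and 57 (that is, 64 - 1, 64 - 2 and 64 - 7) to carry the bits that cross the chunk boundary, with the unshifted XOR covering the constant term.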
+@(private = "file", require_results, enable_target_feature = "sse2") +reduce_f128 :: #force_inline proc "contextless" (x0, x1, x2, x3: x86.__m128i) -> (x86.__m128i, x86.__m128i) { + x0, x1, x2 := x0, x1, x2 + + x1 = x86._mm_xor_si128( + x1, + x86._mm_xor_si128( + x86._mm_xor_si128( + x3, + x86._mm_srli_epi64(x3, 1)), + x86._mm_xor_si128( + x86._mm_srli_epi64(x3, 2), + x86._mm_srli_epi64(x3, 7)))) + x2 = x86._mm_xor_si128( + x86._mm_xor_si128( + x2, + x86._mm_slli_epi64(x3, 63)), + x86._mm_xor_si128( + x86._mm_slli_epi64(x3, 62), + x86._mm_slli_epi64(x3, 57))) + x0 = x86._mm_xor_si128( + x0, + x86._mm_xor_si128( + x86._mm_xor_si128( + x2, + x86._mm_srli_epi64(x2, 1)), + x86._mm_xor_si128( + x86._mm_srli_epi64(x2, 2), + x86._mm_srli_epi64(x2, 7)))) + x1 = x86._mm_xor_si128( + x86._mm_xor_si128( + x1, + x86._mm_slli_epi64(x2, 63)), + x86._mm_xor_si128( + x86._mm_slli_epi64(x2, 62), + x86._mm_slli_epi64(x2, 57))) + + return x0, x1 +} + +// Square value kw in GF(2^128) into (dw,dx). +@(private = "file", require_results, enable_target_feature = "sse2,pclmul") +square_f128 :: #force_inline proc "contextless" (kw: x86.__m128i) -> (x86.__m128i, x86.__m128i) { + z1 := x86._mm_clmulepi64_si128(kw, kw, 0x11) + z3 := x86._mm_clmulepi64_si128(kw, kw, 0x00) + z0 := x86._mm_shuffle_epi32(z1, 0x0E) + z2 := x86._mm_shuffle_epi32(z3, 0x0E) + z0, z1, z2, z3 = sl_256(z0, z1, z2, z3) + z0, z1 = reduce_f128(z0, z1, z2, z3) + return pbk(z0, z1) +} + +// ghash calculates the GHASH of data, with the key `key`, and input `dst` +// and `data`, and stores the resulting digest in `dst`. +// +// Note: `dst` is both an input and an output, to support easy implementation +// of GCM. +@(enable_target_feature = "sse2,ssse3,pclmul") +ghash :: proc "contextless" (dst, key, data: []byte) #no_bounds_check { + if len(dst) != _aes.GHASH_BLOCK_SIZE || len(key) != _aes.GHASH_BLOCK_SIZE { + intrinsics.trap() + } + + // Note: BearSSL opts to copy the remainder into a zero-filled + // 64-byte buffer. We do something slightly more simple. + + // Load key and dst (h and y). 
+ yw := intrinsics.unaligned_load((^x86.__m128i)(raw_data(dst))) + h1w := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) + yw = byteswap(yw) + h1w = byteswap(h1w) + h1x := bk(h1w) + + // Process 4 blocks at a time + buf := data + l := len(buf) + if l >= GHASH_STRIDE_BYTES_HW { + // Compute h2 = h^2 + h2w, h2x := square_f128(h1w) + + // Compute h3 = h^3 = h*(h^2) + t1 := x86._mm_clmulepi64_si128(h1w, h2w, 0x11) + t3 := x86._mm_clmulepi64_si128(h1w, h2w, 0x00) + t2 := x86._mm_xor_si128( + x86._mm_clmulepi64_si128(h1x, h2x, 0x00), + x86._mm_xor_si128(t1, t3)) + t0 := x86._mm_shuffle_epi32(t1, 0x0E) + t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E)) + t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E)) + t0, t1, t2, t3 = sl_256(t0, t1, t2, t3) + t0, t1 = reduce_f128(t0, t1, t2, t3) + h3w, h3x := pbk(t0, t1) + + // Compute h4 = h^4 = (h^2)^2 + h4w, h4x := square_f128(h2w) + + for l >= GHASH_STRIDE_BYTES_HW { + aw0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf))) + aw1 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[16:]))) + aw2 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[32:]))) + aw3 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[48:]))) + aw0 = byteswap(aw0) + aw1 = byteswap(aw1) + aw2 = byteswap(aw2) + aw3 = byteswap(aw3) + buf, l = buf[GHASH_STRIDE_BYTES_HW:], l - GHASH_STRIDE_BYTES_HW + + aw0 = x86._mm_xor_si128(aw0, yw) + ax1 := bk(aw1) + ax2 := bk(aw2) + ax3 := bk(aw3) + ax0 := bk(aw0) + + t1 = x86._mm_xor_si128( + x86._mm_xor_si128( + x86._mm_clmulepi64_si128(aw0, h4w, 0x11), + x86._mm_clmulepi64_si128(aw1, h3w, 0x11)), + x86._mm_xor_si128( + x86._mm_clmulepi64_si128(aw2, h2w, 0x11), + x86._mm_clmulepi64_si128(aw3, h1w, 0x11))) + t3 = x86._mm_xor_si128( + x86._mm_xor_si128( + x86._mm_clmulepi64_si128(aw0, h4w, 0x00), + x86._mm_clmulepi64_si128(aw1, h3w, 0x00)), + x86._mm_xor_si128( + x86._mm_clmulepi64_si128(aw2, h2w, 0x00), + x86._mm_clmulepi64_si128(aw3, h1w, 0x00))) + t2 = x86._mm_xor_si128( + x86._mm_xor_si128( + x86._mm_clmulepi64_si128(ax0, h4x, 0x00), + x86._mm_clmulepi64_si128(ax1, h3x, 0x00)), + x86._mm_xor_si128( + x86._mm_clmulepi64_si128(ax2, h2x, 0x00), + x86._mm_clmulepi64_si128(ax3, h1x, 0x00))) + t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3)) + t0 = x86._mm_shuffle_epi32(t1, 0x0E) + t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E)) + t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E)) + t0, t1, t2, t3 = sl_256(t0, t1, t2, t3) + t0, t1 = reduce_f128(t0, t1, t2, t3) + yw = x86._mm_unpacklo_epi64(t1, t0) + } + } + + // Process 1 block at a time + src: []byte + for l > 0 { + if l >= _aes.GHASH_BLOCK_SIZE { + src = buf + buf = buf[_aes.GHASH_BLOCK_SIZE:] + l -= _aes.GHASH_BLOCK_SIZE + } else { + tmp: [_aes.GHASH_BLOCK_SIZE]byte + copy(tmp[:], buf) + src = tmp[:] + l = 0 + } + + aw := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))) + aw = byteswap(aw) + + aw = x86._mm_xor_si128(aw, yw) + ax := bk(aw) + + t1 := x86._mm_clmulepi64_si128(aw, h1w, 0x11) + t3 := x86._mm_clmulepi64_si128(aw, h1w, 0x00) + t2 := x86._mm_clmulepi64_si128(ax, h1x, 0x00) + t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3)) + t0 := x86._mm_shuffle_epi32(t1, 0x0E) + t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E)) + t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E)) + t0, t1, t2, t3 = sl_256(t0, t1, t2, t3) + t0, t1 = reduce_f128(t0, t1, t2, t3) + yw = x86._mm_unpacklo_epi64(t1, t0) + } + + // Write back the hash (dst, aka y) + yw = byteswap(yw) + 
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), yw) +} diff --git a/core/crypto/_aes/hw_intel/hw_intel_keysched.odin b/core/crypto/_aes/hw_intel/hw_intel_keysched.odin new file mode 100644 index 00000000000..911dffbd548 --- /dev/null +++ b/core/crypto/_aes/hw_intel/hw_intel_keysched.odin @@ -0,0 +1,178 @@ +// Copyright (c) 2017 Thomas Pornin +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR +// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +//+build amd64 +package aes_hw_intel + +import "base:intrinsics" +import "core:crypto/_aes" +import "core:mem" +import "core:simd/x86" + +// Intel AES-NI based implementation. Inspiration taken from BearSSL. +// +// Note: This assumes that the SROA optimization pass is enabled for this to be +// anything resembling performant; otherwise, LLVM will not elide a massive +// number of redundant loads/stores it generates for every intrinsic call.
+ +@(private = "file", require_results, enable_target_feature = "sse2") +expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i { + k1, k2 := k1, k2 + + k2 = x86._mm_shuffle_epi32(k2, 0xff) + k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) + k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) + k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) + return x86._mm_xor_si128(k1, k2) +} + +@(private = "file", require_results, enable_target_feature = "sse,sse2") +expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) { + k1, k2, k3 := k1_^, k2_^, k3 + + k3 = x86._mm_shuffle_epi32(k3, 0x55) + k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) + k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) + k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) + k1 = x86._mm_xor_si128(k1, k3) + + tmp := k2 + k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04)) + k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff)) + + k1_, k2_ := k1_, k2_ + k1_^, k2_^ = k1, k2 + + r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44)) + r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e)) + + return r1, r2 +} + +@(private = "file", require_results, enable_target_feature = "sse2") +expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i { + k1, k2, k3 := k1_^, k2_^, k3 + + k3 = x86._mm_shuffle_epi32(k3, 0x55) + k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) + k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) + k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) + k1 = x86._mm_xor_si128(k1, k3) + + k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04)) + k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff)) + + k1_, k2_ := k1_, k2_ + k1_^, k2_^ = k1, k2 + + return k1 +} + +@(private = "file", require_results, enable_target_feature = "sse2") +expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i { + k1, k2 := k1, k2 + + k2 = x86._mm_shuffle_epi32(k2, 0xaa) + k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) + k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) + k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) + return x86._mm_xor_si128(k1, k2) +} + +@(private = "file", enable_target_feature = "aes") +derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) { + intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds]) + for i in 1 ..< num_rounds { + tmp := x86._mm_aesimc_si128(sks[i]) + intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp) + } + intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0]) +} + +@(private, enable_target_feature = "sse,sse2,aes") +keysched :: proc(ctx: ^Context, key: []byte) { + sks: [15]x86.__m128i = --- + + // Compute the encryption keys. 
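For reference, the standard AES parameters behind the array sizes used in this file:

    // AES-128: ROUNDS_128 = 10 rounds -> 11 round keys
    // AES-192: ROUNDS_192 = 12 rounds -> 13 round keys
    // AES-256: ROUNDS_256 = 14 rounds -> 15 round keys
    //
    // Hence `sks: [15]x86.__m128i`, the `[15][16]byte` members of `Context`,
    // and loops of the form `for i in 0 ..= num_rounds`.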
+ num_rounds, key_len := 0, len(key) + switch key_len { + case _aes.KEY_SIZE_128: + sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) + sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01)) + sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02)) + sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04)) + sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08)) + sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10)) + sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20)) + sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40)) + sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80)) + sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b)) + sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36)) + num_rounds = _aes.ROUNDS_128 + case _aes.KEY_SIZE_192: + k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) + k1 := x86.__m128i{ + intrinsics.unaligned_load((^i64)(raw_data(key[16:]))), + 0, + } + sks[0] = k0 + sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01)) + sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02)) + sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04)) + sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08)) + sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10)) + sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20)) + sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40)) + sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80)) + num_rounds = _aes.ROUNDS_192 + case _aes.KEY_SIZE_256: + sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) + sks[1] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:]))) + sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01)) + sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01)) + sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02)) + sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02)) + sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04)) + sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04)) + sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08)) + sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08)) + sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10)) + sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10)) + sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20)) + sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20)) + sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40)) + num_rounds = _aes.ROUNDS_256 + case: + panic("crypto/aes: invalid AES key size") + } + for i in 0 ..= num_rounds { + intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i]) + } + + // Compute the decryption keys. GCM and CTR do not need this, however + // ECB, CBC, OCB3, etc do. 
+ derive_dec_keys(ctx, &sks, num_rounds) + + ctx._num_rounds = num_rounds + + mem.zero_explicit(&sks, size_of(sks)) +} diff --git a/core/crypto/aes/aes.odin b/core/crypto/aes/aes.odin index e895c5fe074..ef305fd2149 100644 --- a/core/crypto/aes/aes.odin +++ b/core/crypto/aes/aes.odin @@ -6,7 +6,6 @@ See: - https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf - https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf */ - package aes import "core:crypto/_aes" diff --git a/core/crypto/aes/aes_ctr.odin b/core/crypto/aes/aes_ctr.odin index 1821a7bdf55..1c5fe31e8ea 100644 --- a/core/crypto/aes/aes_ctr.odin +++ b/core/crypto/aes/aes_ctr.odin @@ -1,5 +1,6 @@ package aes +import "core:bytes" import "core:crypto/_aes/ct64" import "core:encoding/endian" import "core:math/bits" @@ -37,14 +38,15 @@ init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := Implementation.Hard xor_bytes_ctr :: proc(ctx: ^Context_CTR, dst, src: []byte) { assert(ctx._is_initialized) - // TODO: Enforcing that dst and src alias exactly or not at all - // is a good idea, though odd aliasing should be extremely uncommon. - src, dst := src, dst if dst_len := len(dst); dst_len < len(src) { src = src[:dst_len] } + if bytes.alias_inexactly(dst, src) { + panic("crypto/aes: dst and src alias inexactly") + } + for remaining := len(src); remaining > 0; { // Process multiple blocks at once if ctx._off == BLOCK_SIZE { @@ -123,8 +125,8 @@ reset_ctr :: proc "contextless" (ctx: ^Context_CTR) { ctx._is_initialized = false } -@(private) -ctr_blocks :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) { +@(private = "file") +ctr_blocks :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check { // Use the optimized hardware implementation if available. if _, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw { ctr_blocks_hw(ctx, dst, src, nr_blocks) @@ -183,17 +185,17 @@ xor_blocks :: #force_inline proc "contextless" (dst, src: []byte, blocks: [][]by // performance of this implementation matters to where that // optimization would be worth it, use chacha20poly1305, or a // CPU that isn't e-waste. 
- if src != nil { - #no_bounds_check { - for i in 0 ..< len(blocks) { - off := i * BLOCK_SIZE - for j in 0 ..< BLOCK_SIZE { - blocks[i][j] ~= src[off + j] + #no_bounds_check { + if src != nil { + for i in 0 ..< len(blocks) { + off := i * BLOCK_SIZE + for j in 0 ..< BLOCK_SIZE { + blocks[i][j] ~= src[off + j] + } } - } } - } - for i in 0 ..< len(blocks) { - copy(dst[i * BLOCK_SIZE:], blocks[i]) + for i in 0 ..< len(blocks) { + copy(dst[i * BLOCK_SIZE:], blocks[i]) + } } } diff --git a/core/crypto/aes/aes_ctr_hw_intel.odin b/core/crypto/aes/aes_ctr_hw_intel.odin new file mode 100644 index 00000000000..1c9e815ad99 --- /dev/null +++ b/core/crypto/aes/aes_ctr_hw_intel.odin @@ -0,0 +1,151 @@ +//+build amd64 +package aes + +import "base:intrinsics" +import "core:crypto/_aes" +import "core:math/bits" +import "core:mem" +import "core:simd/x86" + +@(private) +CTR_STRIDE_HW :: 4 +@(private) +CTR_STRIDE_BYTES_HW :: CTR_STRIDE_HW * BLOCK_SIZE + +@(private, enable_target_feature = "sse2,aes") +ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check { + hw_ctx := ctx._impl.(Context_Impl_Hardware) + + sks: [15]x86.__m128i = --- + for i in 0 ..= hw_ctx._num_rounds { + sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&hw_ctx._sk_exp_enc[i])) + } + + hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (x86.__m128i, u64, u64) { + ret := x86.__m128i{ + i64(intrinsics.byte_swap(hi)), + i64(intrinsics.byte_swap(lo)), + } + + hi, lo := hi, lo + carry: u64 + + lo, carry = bits.add_u64(lo, 1, 0) + hi, _ = bits.add_u64(hi, 0, carry) + return ret, hi, lo + } + + // The latency of AESENC depends on mfg and microarchitecture: + // - 7 -> up to Broadwell + // - 4 -> AMD and Skylake - Cascade Lake + // - 3 -> Ice Lake and newer + // + // This implementation does 4 blocks at once, since performance + // should be "adequate" across most CPUs. + + src, dst := src, dst + nr_blocks := nr_blocks + ctr_hi, ctr_lo := ctx._ctr_hi, ctx._ctr_lo + + blks: [CTR_STRIDE_HW]x86.__m128i = --- + for nr_blocks >= CTR_STRIDE_HW { + #unroll for i in 0..< CTR_STRIDE_HW { + blks[i], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo) + } + + #unroll for i in 0 ..< CTR_STRIDE_HW { + blks[i] = x86._mm_xor_si128(blks[i], sks[0]) + } + #unroll for i in 1 ..= 9 { + #unroll for j in 0 ..< CTR_STRIDE_HW { + blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + } + } + switch hw_ctx._num_rounds { + case _aes.ROUNDS_128: + #unroll for i in 0 ..< CTR_STRIDE_HW { + blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10]) + } + case _aes.ROUNDS_192: + #unroll for i in 10 ..= 11 { + #unroll for j in 0 ..< CTR_STRIDE_HW { + blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + } + } + #unroll for i in 0 ..< CTR_STRIDE_HW { + blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12]) + } + case _aes.ROUNDS_256: + #unroll for i in 10 ..= 13 { + #unroll for j in 0 ..< CTR_STRIDE_HW { + blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + } + } + #unroll for i in 0 ..< CTR_STRIDE_HW { + blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14]) + } + } + + xor_blocks_hw(dst, src, blks[:]) + + if src != nil { + src = src[CTR_STRIDE_BYTES_HW:] + } + dst = dst[CTR_STRIDE_BYTES_HW:] + nr_blocks -= CTR_STRIDE_HW + } + + // Handle the remainder. 
+ for nr_blocks > 0 { + blks[0], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo) + + blks[0] = x86._mm_xor_si128(blks[0], sks[0]) + #unroll for i in 1 ..= 9 { + blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + } + switch hw_ctx._num_rounds { + case _aes.ROUNDS_128: + blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10]) + case _aes.ROUNDS_192: + #unroll for i in 10 ..= 11 { + blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + } + blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12]) + case _aes.ROUNDS_256: + #unroll for i in 10 ..= 13 { + blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + } + blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14]) + } + + xor_blocks_hw(dst, src, blks[:1]) + + if src != nil { + src = src[BLOCK_SIZE:] + } + dst = dst[BLOCK_SIZE:] + nr_blocks -= 1 + } + + // Write back the counter. + ctx._ctr_hi, ctx._ctr_lo = ctr_hi, ctr_lo + + mem.zero_explicit(&blks, size_of(blks)) + mem.zero_explicit(&sks, size_of(sks)) +} + +@(private, enable_target_feature = "sse2") +xor_blocks_hw :: proc(dst, src: []byte, blocks: []x86.__m128i) { + #no_bounds_check { + if src != nil { + for i in 0 ..< len(blocks) { + off := i * BLOCK_SIZE + tmp := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[off:]))) + blocks[i] = x86._mm_xor_si128(blocks[i], tmp) + } + } + for i in 0 ..< len(blocks) { + intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i]) + } + } +} diff --git a/core/crypto/aes/aes_ecb_hw_intel.odin b/core/crypto/aes/aes_ecb_hw_intel.odin new file mode 100644 index 00000000000..b2ff36a0c82 --- /dev/null +++ b/core/crypto/aes/aes_ecb_hw_intel.odin @@ -0,0 +1,58 @@ +//+build amd64 +package aes + +import "base:intrinsics" +import "core:crypto/_aes" +import "core:simd/x86" + +@(private, enable_target_feature = "sse2,aes") +encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) { + blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))) + + blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[0]))) + #unroll for i in 1 ..= 9 { + blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))) + } + switch ctx._num_rounds { + case _aes.ROUNDS_128: + blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[10]))) + case _aes.ROUNDS_192: + #unroll for i in 10 ..= 11 { + blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))) + } + blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[12]))) + case _aes.ROUNDS_256: + #unroll for i in 10 ..= 13 { + blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))) + } + blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[14]))) + } + + intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk) +} + +@(private, enable_target_feature = "sse2,aes") +decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) { + blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))) + + blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[0]))) + #unroll for i in 1 ..= 9 { + blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i]))) + } + switch ctx._num_rounds { + case _aes.ROUNDS_128: + blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[10]))) + case _aes.ROUNDS_192: + #unroll for i in 10 ..= 11 { + blk = 
x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i]))) + } + blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[12]))) + case _aes.ROUNDS_256: + #unroll for i in 10 ..= 13 { + blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i]))) + } + blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[14]))) + } + + intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk) +} diff --git a/core/crypto/aes/aes_gcm.odin b/core/crypto/aes/aes_gcm.odin index 66ef48db202..25e0cc35b71 100644 --- a/core/crypto/aes/aes_gcm.odin +++ b/core/crypto/aes/aes_gcm.odin @@ -1,13 +1,16 @@ package aes +import "core:bytes" import "core:crypto" import "core:crypto/_aes" import "core:crypto/_aes/ct64" import "core:encoding/endian" import "core:mem" -// GCM_NONCE_SIZE is the size of the GCM nonce in bytes. +// GCM_NONCE_SIZE is the default size of the GCM nonce in bytes. GCM_NONCE_SIZE :: 12 +// GCM_NONCE_SIZE_MAX is the maximum size of the GCM nonce in bytes. +GCM_NONCE_SIZE_MAX :: 0x2000000000000000 // floor((2^64 - 1) / 8) bits // GCM_TAG_SIZE is the size of a GCM tag in bytes. GCM_TAG_SIZE :: _aes.GHASH_TAG_SIZE @@ -39,6 +42,9 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) { if len(dst) != len(plaintext) { panic("crypto/aes: invalid destination ciphertext size") } + if bytes.alias_inexactly(dst, plaintext) { + panic("crypto/aes: dst and plaintext alias inexactly") + } if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw { gcm_seal_hw(&impl, dst, tag, nonce, aad, plaintext) @@ -47,17 +53,19 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) { h: [_aes.GHASH_KEY_SIZE]byte j0: [_aes.GHASH_BLOCK_SIZE]byte + j0_enc: [_aes.GHASH_BLOCK_SIZE]byte s: [_aes.GHASH_TAG_SIZE]byte - init_ghash_ct64(ctx, &h, &j0, nonce) + init_ghash_ct64(ctx, &h, &j0, &j0_enc, nonce) // Note: Our GHASH implementation handles appending padding. 
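Since GCM now accepts nonce lengths other than 96 bits, a hedged usage sketch (the `init_gcm` initializer is assumed by analogy with `init_ctr` shown elsewhere in this diff; `key`, `nonce`, `aad`, and `plaintext` are placeholder `[]byte` values):

    ctx: aes.Context_GCM
    aes.init_gcm(&ctx, key)                  // 16-, 24-, or 32-byte key
    tag: [aes.GCM_TAG_SIZE]byte
    ct := make([]byte, len(plaintext))
    aes.seal_gcm(&ctx, ct, tag[:], nonce, aad, plaintext)
    pt := make([]byte, len(ct))
    ok := aes.open_gcm(&ctx, pt, nonce, aad, ct, tag[:])

Any nonce length in [1, GCM_NONCE_SIZE_MAX) bytes is accepted, though the 96-bit `GCM_NONCE_SIZE` path remains the cheapest, since other lengths derive J0 with extra GHASH calls.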
ct64.ghash(s[:], h[:], aad) - gctr_ct64(ctx, dst, &s, plaintext, &h, nonce, true) - final_ghash_ct64(&s, &h, &j0, len(aad), len(plaintext)) + gctr_ct64(ctx, dst, &s, plaintext, &h, &j0, true) + final_ghash_ct64(&s, &h, &j0_enc, len(aad), len(plaintext)) copy(tag, s[:]) mem.zero_explicit(&h, len(h)) mem.zero_explicit(&j0, len(j0)) + mem.zero_explicit(&j0_enc, len(j0_enc)) } // open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext, @@ -73,6 +81,9 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) -> if len(dst) != len(ciphertext) { panic("crypto/aes: invalid destination plaintext size") } + if bytes.alias_inexactly(dst, ciphertext) { + panic("crypto/aes: dst and ciphertext alias inexactly") + } if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw { return gcm_open_hw(&impl, dst, nonce, aad, ciphertext, tag) @@ -80,12 +91,13 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) -> h: [_aes.GHASH_KEY_SIZE]byte j0: [_aes.GHASH_BLOCK_SIZE]byte + j0_enc: [_aes.GHASH_BLOCK_SIZE]byte s: [_aes.GHASH_TAG_SIZE]byte - init_ghash_ct64(ctx, &h, &j0, nonce) + init_ghash_ct64(ctx, &h, &j0, &j0_enc, nonce) ct64.ghash(s[:], h[:], aad) - gctr_ct64(ctx, dst, &s, ciphertext, &h, nonce, false) - final_ghash_ct64(&s, &h, &j0, len(aad), len(ciphertext)) + gctr_ct64(ctx, dst, &s, ciphertext, &h, &j0, false) + final_ghash_ct64(&s, &h, &j0_enc, len(aad), len(ciphertext)) ok := crypto.compare_constant_time(s[:], tag) == 1 if !ok { @@ -94,6 +106,7 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) -> mem.zero_explicit(&h, len(h)) mem.zero_explicit(&j0, len(j0)) + mem.zero_explicit(&j0_enc, len(j0_enc)) mem.zero_explicit(&s, len(s)) return ok @@ -106,19 +119,14 @@ reset_gcm :: proc "contextless" (ctx: ^Context_GCM) { ctx._is_initialized = false } -@(private) +@(private = "file") gcm_validate_common_slice_sizes :: proc(tag, nonce, aad, text: []byte) { if len(tag) != GCM_TAG_SIZE { panic("crypto/aes: invalid GCM tag size") } - // The specification supports nonces in the range [1, 2^64) bits - // however per NIST SP 800-38D 5.2.1.1: - // - // > For IVs, it is recommended that implementations restrict support - // > to the length of 96 bits, to promote interoperability, efficiency, - // > and simplicity of design. - if len(nonce) != GCM_NONCE_SIZE { + // The specification supports nonces in the range [1, 2^64) bits. + if l := len(nonce); l == 0 || u64(l) >= GCM_NONCE_SIZE_MAX { panic("crypto/aes: invalid GCM nonce size") } @@ -135,6 +143,7 @@ init_ghash_ct64 :: proc( ctx: ^Context_GCM, h: ^[_aes.GHASH_KEY_SIZE]byte, j0: ^[_aes.GHASH_BLOCK_SIZE]byte, + j0_enc: ^[_aes.GHASH_BLOCK_SIZE]byte, nonce: []byte, ) { impl := &ctx._impl.(ct64.Context) @@ -142,12 +151,25 @@ init_ghash_ct64 :: proc( // 1. Let H = CIPH(k, 0^128) ct64.encrypt_block(impl, h[:], h[:]) + // Define a block, J0, as follows: + if l := len(nonce); l == GCM_NONCE_SIZE { + // if len(IV) = 96, then let J0 = IV || 0^31 || 1 + copy(j0[:], nonce) + j0[_aes.GHASH_BLOCK_SIZE - 1] = 1 + } else { + // If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV), + // and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64). + ct64.ghash(j0[:], h[:], nonce) + + tmp: [_aes.GHASH_BLOCK_SIZE]byte + endian.unchecked_put_u64be(tmp[8:], u64(l) * 8) + ct64.ghash(j0[:], h[:], tmp[:]) + } + // ECB encrypt j0, so that we can just XOR with the tag. In theory // this could be processed along with the final GCTR block, to // potentially save a call to AES-ECB, but... 
just use AES-NI. - copy(j0[:], nonce) - j0[_aes.GHASH_BLOCK_SIZE - 1] = 1 - ct64.encrypt_block(impl, j0[:], j0[:]) + ct64.encrypt_block(impl, j0_enc[:], j0[:]) } @(private = "file") @@ -175,33 +197,27 @@ gctr_ct64 :: proc( s: ^[_aes.GHASH_BLOCK_SIZE]byte, src: []byte, h: ^[_aes.GHASH_KEY_SIZE]byte, - nonce: []byte, + nonce: ^[_aes.GHASH_BLOCK_SIZE]byte, is_seal: bool, -) { +) #no_bounds_check { ct64_inc_ctr32 := #force_inline proc "contextless" (dst: []byte, ctr: u32) -> u32 { endian.unchecked_put_u32be(dst[12:], ctr) return ctr + 1 } - // 2. Define a block J_0 as follows: - // if len(IV) = 96, then let J0 = IV || 0^31 || 1 - // - // Note: We only support 96 bit IVs. + // Setup the counter blocks. tmp, tmp2: [ct64.STRIDE][BLOCK_SIZE]byte = ---, --- ctrs, blks: [ct64.STRIDE][]byte = ---, --- - ctr: u32 = 2 + ctr := endian.unchecked_get_u32be(nonce[GCM_NONCE_SIZE:]) + 1 for i in 0 ..< ct64.STRIDE { // Setup scratch space for the keystream. blks[i] = tmp2[i][:] // Pre-copy the IV to all the counter blocks. ctrs[i] = tmp[i][:] - copy(ctrs[i], nonce) + copy(ctrs[i], nonce[:GCM_NONCE_SIZE]) } - // We stitch the GCTR and GHASH operations together, so that only - // one pass over the ciphertext is required. - impl := &ctx._impl.(ct64.Context) src, dst := src, dst diff --git a/core/crypto/aes/aes_gcm_hw_intel.odin b/core/crypto/aes/aes_gcm_hw_intel.odin new file mode 100644 index 00000000000..7d32d4d968e --- /dev/null +++ b/core/crypto/aes/aes_gcm_hw_intel.odin @@ -0,0 +1,243 @@ +//+build amd64 +package aes + +import "base:intrinsics" +import "core:crypto" +import "core:crypto/_aes" +import "core:crypto/_aes/hw_intel" +import "core:encoding/endian" +import "core:mem" +import "core:simd/x86" + +@(private) +gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, nonce, aad, plaintext: []byte) { + h: [_aes.GHASH_KEY_SIZE]byte + j0: [_aes.GHASH_BLOCK_SIZE]byte + j0_enc: [_aes.GHASH_BLOCK_SIZE]byte + s: [_aes.GHASH_TAG_SIZE]byte + init_ghash_hw(ctx, &h, &j0, &j0_enc, nonce) + + // Note: Our GHASH implementation handles appending padding. + hw_intel.ghash(s[:], h[:], aad) + gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true) + final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext)) + copy(tag, s[:]) + + mem.zero_explicit(&h, len(h)) + mem.zero_explicit(&j0, len(j0)) + mem.zero_explicit(&j0_enc, len(j0_enc)) +} + +@(private) +gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, nonce, aad, ciphertext, tag: []byte) -> bool { + h: [_aes.GHASH_KEY_SIZE]byte + j0: [_aes.GHASH_BLOCK_SIZE]byte + j0_enc: [_aes.GHASH_BLOCK_SIZE]byte + s: [_aes.GHASH_TAG_SIZE]byte + init_ghash_hw(ctx, &h, &j0, &j0_enc, nonce) + + hw_intel.ghash(s[:], h[:], aad) + gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false) + final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext)) + + ok := crypto.compare_constant_time(s[:], tag) == 1 + if !ok { + mem.zero_explicit(raw_data(dst), len(dst)) + } + + mem.zero_explicit(&h, len(h)) + mem.zero_explicit(&j0, len(j0)) + mem.zero_explicit(&j0_enc, len(j0_enc)) + mem.zero_explicit(&s, len(s)) + + return ok +} + +@(private = "file") +init_ghash_hw :: proc( + ctx: ^Context_Impl_Hardware, + h: ^[_aes.GHASH_KEY_SIZE]byte, + j0: ^[_aes.GHASH_BLOCK_SIZE]byte, + j0_enc: ^[_aes.GHASH_BLOCK_SIZE]byte, + nonce: []byte, +) { + // 1. 
Let H = CIPH(k, 0^128) + encrypt_block_hw(ctx, h[:], h[:]) + + // Define a block, J0, as follows: + if l := len(nonce); l == GCM_NONCE_SIZE { + // if len(IV) = 96, then let J0 = IV || 0^31 || 1 + copy(j0[:], nonce) + j0[_aes.GHASH_BLOCK_SIZE - 1] = 1 + } else { + // If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV), + // and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64). + hw_intel.ghash(j0[:], h[:], nonce) + + tmp: [_aes.GHASH_BLOCK_SIZE]byte + endian.unchecked_put_u64be(tmp[8:], u64(l) * 8) + hw_intel.ghash(j0[:], h[:], tmp[:]) + } + + // ECB encrypt j0, so that we can just XOR with the tag. + encrypt_block_hw(ctx, j0_enc[:], j0[:]) +} + +@(private = "file", enable_target_feature = "sse2") +final_ghash_hw :: proc( + s: ^[_aes.GHASH_BLOCK_SIZE]byte, + h: ^[_aes.GHASH_KEY_SIZE]byte, + j0: ^[_aes.GHASH_BLOCK_SIZE]byte, + a_len: int, + t_len: int, +) { + blk: [_aes.GHASH_BLOCK_SIZE]byte + endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8) + endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8) + + hw_intel.ghash(s[:], h[:], blk[:]) + j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0)) + s_vec := intrinsics.unaligned_load((^x86.__m128i)(s)) + s_vec = x86._mm_xor_si128(s_vec, j0_vec) + intrinsics.unaligned_store((^x86.__m128i)(s), s_vec) +} + +@(private = "file", enable_target_feature = "sse2,sse4.1,aes") +gctr_hw :: proc( + ctx: ^Context_Impl_Hardware, + dst: []byte, + s: ^[_aes.GHASH_BLOCK_SIZE]byte, + src: []byte, + h: ^[_aes.GHASH_KEY_SIZE]byte, + nonce: ^[_aes.GHASH_BLOCK_SIZE]byte, + is_seal: bool, +) #no_bounds_check { + sks: [15]x86.__m128i = --- + for i in 0 ..= ctx._num_rounds { + sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])) + } + + // Setup the counter block + ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(nonce)) + ctr := endian.unchecked_get_u32be(nonce[GCM_NONCE_SIZE:]) + 1 + + src, dst := src, dst + + // Note: Instead of doing GHASH and CTR separately, it is more + // performant to interleave (stitch) the two operations together. + // This results in an unreadable mess, so we opt for simplicity + // as performance is adequate. + + blks: [CTR_STRIDE_HW]x86.__m128i = --- + nr_blocks := len(src) / BLOCK_SIZE + for nr_blocks >= CTR_STRIDE_HW { + if !is_seal { + hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW]) + } + + #unroll for i in 0 ..< CTR_STRIDE_HW { + blks[i], ctr = hw_inc_ctr32(&ctr_blk, ctr) + } + + #unroll for i in 0 ..< CTR_STRIDE_HW { + blks[i] = x86._mm_xor_si128(blks[i], sks[0]) + } + #unroll for i in 1 ..= 9 { + #unroll for j in 0 ..< CTR_STRIDE_HW { + blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + } + } + switch ctx._num_rounds { + case _aes.ROUNDS_128: + #unroll for i in 0 ..< CTR_STRIDE_HW { + blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10]) + } + case _aes.ROUNDS_192: + #unroll for i in 10 ..= 11 { + #unroll for j in 0 ..< CTR_STRIDE_HW { + blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + } + } + #unroll for i in 0 ..< CTR_STRIDE_HW { + blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12]) + } + case _aes.ROUNDS_256: + #unroll for i in 10 ..= 13 { + #unroll for j in 0 ..< CTR_STRIDE_HW { + blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + } + } + #unroll for i in 0 ..< CTR_STRIDE_HW { + blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14]) + } + } + + xor_blocks_hw(dst, src, blks[:]) + + if is_seal { + hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW]) + } + + src = src[CTR_STRIDE_BYTES_HW:] + dst = dst[CTR_STRIDE_BYTES_HW:] + nr_blocks -= CTR_STRIDE_HW + } + + // Handle the remainder. 
+ for n := len(src); n > 0; { + l := min(n, BLOCK_SIZE) + if !is_seal { + hw_intel.ghash(s[:], h[:], src[:l]) + } + + blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr) + + blks[0] = x86._mm_xor_si128(blks[0], sks[0]) + #unroll for i in 1 ..= 9 { + blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + } + switch ctx._num_rounds { + case _aes.ROUNDS_128: + blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10]) + case _aes.ROUNDS_192: + #unroll for i in 10 ..= 11 { + blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + } + blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12]) + case _aes.ROUNDS_256: + #unroll for i in 10 ..= 13 { + blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + } + blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14]) + } + + if l == BLOCK_SIZE { + xor_blocks_hw(dst, src, blks[:1]) + } else { + blk: [BLOCK_SIZE]byte + copy(blk[:], src) + xor_blocks_hw(blk[:], blk[:], blks[:1]) + copy(dst, blk[:l]) + } + if is_seal { + hw_intel.ghash(s[:], h[:], dst[:l]) + } + + dst = dst[l:] + src = src[l:] + n -= l + } + + mem.zero_explicit(&blks, size_of(blks)) + mem.zero_explicit(&sks, size_of(sks)) +} + +// BUG: Sticking this in gctr_hw (like the other implementations) crashes +// the compiler. +// +// src/check_expr.cpp(7892): Assertion Failure: `c->curr_proc_decl->entity` +@(private = "file", enable_target_feature = "sse4.1") +hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) { + ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3) + return ret, ctr + 1 +} diff --git a/core/crypto/aes/aes_impl_hw_gen.odin b/core/crypto/aes/aes_impl_hw_gen.odin index 94815f61cd2..5361c6ef01f 100644 --- a/core/crypto/aes/aes_impl_hw_gen.odin +++ b/core/crypto/aes/aes_impl_hw_gen.odin @@ -1,3 +1,4 @@ +//+build !amd64 package aes @(private = "file") diff --git a/core/crypto/aes/aes_impl_hw_intel.odin b/core/crypto/aes/aes_impl_hw_intel.odin new file mode 100644 index 00000000000..39ea2dc8d0f --- /dev/null +++ b/core/crypto/aes/aes_impl_hw_intel.odin @@ -0,0 +1,18 @@ +//+build amd64 +package aes + +import "core:crypto/_aes/hw_intel" + +// is_hardware_accelerated returns true iff hardware accelerated AES +// is supported. +is_hardware_accelerated :: proc "contextless" () -> bool { + return hw_intel.is_supported() +} + +@(private) +Context_Impl_Hardware :: hw_intel.Context + +@(private, enable_target_feature = "sse2,aes") +init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) { + hw_intel.init(ctx, key) +} diff --git a/core/crypto/chacha20/chacha20.odin b/core/crypto/chacha20/chacha20.odin index 7f0950d037e..73d3e1ea2f6 100644 --- a/core/crypto/chacha20/chacha20.odin +++ b/core/crypto/chacha20/chacha20.odin @@ -7,6 +7,7 @@ See: */ package chacha20 +import "core:bytes" import "core:encoding/endian" import "core:math/bits" import "core:mem" @@ -121,14 +122,15 @@ seek :: proc(ctx: ^Context, block_nr: u64) { xor_bytes :: proc(ctx: ^Context, dst, src: []byte) { assert(ctx._is_initialized) - // TODO: Enforcing that dst and src alias exactly or not at all - // is a good idea, though odd aliasing should be extremely uncommon. 
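Tying the new pieces together, a sketch of explicitly selecting an implementation with the `is_hardware_accelerated` helper added above (`key`, `iv`, `dst`, and `src` are placeholder `[]byte` values of the appropriate sizes):

    impl := aes.Implementation.Portable
    if aes.is_hardware_accelerated() {
        impl = .Hardware
    }
    ctx: aes.Context_CTR
    aes.init_ctr(&ctx, key, iv, impl)
    aes.xor_bytes_ctr(&ctx, dst, src)   // dst == src (fully in-place) is fine; partial overlap now panics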
- src, dst := src, dst if dst_len := len(dst); dst_len < len(src) { src = src[:dst_len] } + if bytes.alias_inexactly(dst, src) { + panic("crypto/chacha20: dst and src alias inexactly") + } + for remaining := len(src); remaining > 0; { // Process multiple blocks at once if ctx._off == _BLOCK_SIZE { diff --git a/core/crypto/crypto.odin b/core/crypto/crypto.odin index f83d20dd7e1..323cc45d605 100644 --- a/core/crypto/crypto.odin +++ b/core/crypto/crypto.odin @@ -60,7 +60,11 @@ rand_bytes :: proc (dst: []byte) { _rand_bytes(dst) } - +// random_generator returns a `runtime.Random_Generator` backed by the +// system entropy source. +// +// Support for the system entropy source can be checked with the +// `HAS_RAND_BYTES` boolean constant. random_generator :: proc() -> runtime.Random_Generator { return { procedure = proc(data: rawptr, mode: runtime.Random_Generator_Mode, p: []byte) { diff --git a/core/simd/x86/aes.odin b/core/simd/x86/aes.odin index 3a32de0d6d8..a2cd2e4d3a9 100644 --- a/core/simd/x86/aes.odin +++ b/core/simd/x86/aes.odin @@ -2,33 +2,33 @@ package simd_x86 @(require_results, enable_target_feature = "aes") -_mm_aesdec :: #force_inline proc "c" (a, b: __m128i) -> __m128i { +_mm_aesdec_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return aesdec(a, b) } @(require_results, enable_target_feature = "aes") -_mm_aesdeclast :: #force_inline proc "c" (a, b: __m128i) -> __m128i { +_mm_aesdeclast_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return aesdeclast(a, b) } @(require_results, enable_target_feature = "aes") -_mm_aesenc :: #force_inline proc "c" (a, b: __m128i) -> __m128i { +_mm_aesenc_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return aesenc(a, b) } @(require_results, enable_target_feature = "aes") -_mm_aesenclast :: #force_inline proc "c" (a, b: __m128i) -> __m128i { +_mm_aesenclast_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return aesenclast(a, b) } @(require_results, enable_target_feature = "aes") -_mm_aesimc :: #force_inline proc "c" (a: __m128i) -> __m128i { +_mm_aesimc_si128 :: #force_inline proc "c" (a: __m128i) -> __m128i { return aesimc(a) } @(require_results, enable_target_feature = "aes") -_mm_aeskeygenassist :: #force_inline proc "c" (a: __m128i, $IMM8: u8) -> __m128i { - return aeskeygenassist(a, u8(IMM8)) +_mm_aeskeygenassist_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u8) -> __m128i { + return aeskeygenassist(a, IMM8) } @@ -45,5 +45,5 @@ foreign _ { @(link_name = "llvm.x86.aesni.aesimc") aesimc :: proc(a: __m128i) -> __m128i --- @(link_name = "llvm.x86.aesni.aeskeygenassist") - aeskeygenassist :: proc(a: __m128i, imm8: u8) -> __m128i --- + aeskeygenassist :: proc(a: __m128i, #const imm8: u8) -> __m128i --- } diff --git a/core/simd/x86/sse2.odin b/core/simd/x86/sse2.odin index 52286cbb8b2..4263590317d 100644 --- a/core/simd/x86/sse2.odin +++ b/core/simd/x86/sse2.odin @@ -144,19 +144,26 @@ _mm_subs_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { _mm_slli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { shift :: IMM8 & 0xff + // This needs to emit behavior identical to PSLLDQ which is as follows: + // + // TEMP := COUNT + // IF (TEMP > 15) THEN TEMP := 16; FI + // DEST := DEST << (TEMP * 8) + // DEST[MAXVL-1:128] (Unmodified) + return transmute(__m128i)simd.shuffle( - transmute(i8x16)a, i8x16(0), - 0 when shift > 15 else (16 - shift + 0), - 1 when shift > 15 else (16 - shift + 1), - 2 when shift > 15 else (16 - shift + 2), - 3 when shift > 15 else (16 - shift + 
3), - 4 when shift > 15 else (16 - shift + 4), - 5 when shift > 15 else (16 - shift + 5), - 6 when shift > 15 else (16 - shift + 6), - 7 when shift > 15 else (16 - shift + 7), - 8 when shift > 15 else (16 - shift + 8), - 9 when shift > 15 else (16 - shift + 9), + transmute(i8x16)a, + 0 when shift > 15 else (16 - shift + 0), + 1 when shift > 15 else (16 - shift + 1), + 2 when shift > 15 else (16 - shift + 2), + 3 when shift > 15 else (16 - shift + 3), + 4 when shift > 15 else (16 - shift + 4), + 5 when shift > 15 else (16 - shift + 5), + 6 when shift > 15 else (16 - shift + 6), + 7 when shift > 15 else (16 - shift + 7), + 8 when shift > 15 else (16 - shift + 8), + 9 when shift > 15 else (16 - shift + 9), 10 when shift > 15 else (16 - shift + 10), 11 when shift > 15 else (16 - shift + 11), 12 when shift > 15 else (16 - shift + 12), @@ -435,7 +442,7 @@ _mm_store_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) { } @(enable_target_feature="sse2") _mm_storeu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) { - storeudq(mem_addr, a) + intrinsics.unaligned_store(mem_addr, a) } @(enable_target_feature="sse2") _mm_storel_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) { @@ -1178,8 +1185,6 @@ foreign _ { cvttsd2si :: proc(a: __m128d) -> i32 --- @(link_name="llvm.x86.sse2.cvttps2dq") cvttps2dq :: proc(a: __m128) -> i32x4 --- - @(link_name="llvm.x86.sse2.storeu.dq") - storeudq :: proc(mem_addr: rawptr, a: __m128i) --- @(link_name="llvm.x86.sse2.storeu.pd") storeupd :: proc(mem_addr: rawptr, a: __m128d) --- diff --git a/tests/benchmark/crypto/benchmark_crypto.odin b/tests/benchmark/crypto/benchmark_crypto.odin index e90216ad666..b2ac4bca3a7 100644 --- a/tests/benchmark/crypto/benchmark_crypto.odin +++ b/tests/benchmark/crypto/benchmark_crypto.odin @@ -28,6 +28,32 @@ benchmark_crypto :: proc(t: ^testing.T) { strings.builder_destroy(&str) } + { + name := "AES256-CTR 64 bytes" + options := &time.Benchmark_Options { + rounds = 1_000, + bytes = 64, + setup = _setup_sized_buf, + bench = _benchmark_aes256_ctr, + teardown = _teardown_sized_buf, + } + + err := time.benchmark(options, context.allocator) + testing.expect(t, err == nil, name) + benchmark_print(&str, name, options) + + name = "AES256-CTR 1024 bytes" + options.bytes = 1024 + err = time.benchmark(options, context.allocator) + testing.expect(t, err == nil, name) + benchmark_print(&str, name, options) + + name = "AES256-CTR 65536 bytes" + options.bytes = 65536 + err = time.benchmark(options, context.allocator) + testing.expect(t, err == nil, name) + benchmark_print(&str, name, options) + } { name := "ChaCha20 64 bytes" options := &time.Benchmark_Options { @@ -323,6 +349,36 @@ _benchmark_chacha20poly1305 :: proc( return nil } +@(private) +_benchmark_aes256_ctr :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + buf := options.input + key := [aes.KEY_SIZE_256]byte { + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + } + nonce := [aes.CTR_IV_SIZE]byte { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + } + + ctx: aes.Context_CTR = --- + aes.init_ctr(&ctx, key[:], nonce[:]) + + for _ in 0 ..= options.rounds { + aes.xor_bytes_ctr(&ctx, buf, buf) + } + options.count = options.rounds + options.processed = options.rounds * options.bytes + 
return nil +} + _benchmark_aes256_gcm :: proc( options: ^time.Benchmark_Options, allocator := context.allocator, diff --git a/tests/core/crypto/test_core_crypto_aes.odin b/tests/core/crypto/test_core_crypto_aes.odin index 4d4c06bdc9d..c2fa2835c5a 100644 --- a/tests/core/crypto/test_core_crypto_aes.odin +++ b/tests/core/crypto/test_core_crypto_aes.odin @@ -12,8 +12,6 @@ import "core:crypto/sha2" test_aes :: proc(t: ^testing.T) { runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD() - log.info("Testing AES") - impls := make([dynamic]aes.Implementation, 0, 2) defer delete(impls) append(&impls, aes.Implementation.Portable) @@ -29,7 +27,7 @@ test_aes :: proc(t: ^testing.T) { } test_aes_ecb :: proc(t: ^testing.T, impl: aes.Implementation) { - log.infof("Testing AES-ECB/%v", impl) + log.debugf("Testing AES-ECB/%v", impl) test_vectors := []struct { key: string, @@ -136,7 +134,7 @@ test_aes_ecb :: proc(t: ^testing.T, impl: aes.Implementation) { } test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) { - log.infof("Testing AES-CTR/%v", impl) + log.debugf("Testing AES-CTR/%v", impl) test_vectors := []struct { key: string, @@ -200,7 +198,7 @@ test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) { ctx: aes.Context_CTR key: [aes.KEY_SIZE_256]byte nonce: [aes.CTR_IV_SIZE]byte - aes.init_ctr(&ctx, key[:], nonce[:]) + aes.init_ctr(&ctx, key[:], nonce[:], impl) h_ctx: sha2.Context_512 sha2.init_512_256(&h_ctx) @@ -226,7 +224,7 @@ test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) { } test_aes_gcm :: proc(t: ^testing.T, impl: aes.Implementation) { - log.infof("Testing AES-GCM/%v", impl) + log.debugf("Testing AES-GCM/%v", impl) // NIST did a reorg of their site, so the source of the test vectors // is only available from an archive. The commented out tests are @@ -431,7 +429,7 @@ test_aes_gcm :: proc(t: ^testing.T, impl: aes.Implementation) { testing.expectf( t, ok && dst_str == v.plaintext, - "AES-GCM/%v: Expected: (%s, true) for open(%s, %s, %s, %s, %s), but got (%s, %s) instead", + "AES-GCM/%v: Expected: (%s, true) for open(%s, %s, %s, %s, %s), but got (%s, %v) instead", impl, v.plaintext, v.key, diff --git a/tests/core/crypto/test_core_crypto_ecc25519.odin b/tests/core/crypto/test_core_crypto_ecc25519.odin index baf4a1a38a3..fec4fa38e20 100644 --- a/tests/core/crypto/test_core_crypto_ecc25519.odin +++ b/tests/core/crypto/test_core_crypto_ecc25519.odin @@ -58,9 +58,9 @@ test_sqrt_ratio_m1 :: proc(t: ^testing.T) { v_bytes, _ := hex.decode(transmute([]byte)(v.v), context.temp_allocator) r_bytes, _ := hex.decode(transmute([]byte)(v.r), context.temp_allocator) - u_ := transmute(^[32]byte)(raw_data(u_bytes)) - v_ := transmute(^[32]byte)(raw_data(v_bytes)) - r_ := transmute(^[32]byte)(raw_data(r_bytes)) + u_ := (^[32]byte)(raw_data(u_bytes)) + v_ := (^[32]byte)(raw_data(v_bytes)) + r_ := (^[32]byte)(raw_data(r_bytes)) u, vee, r: field.Tight_Field_Element field.fe_from_bytes(&u, u_) diff --git a/tests/core/crypto/test_core_crypto_kdf.odin b/tests/core/crypto/test_core_crypto_kdf.odin index 247529e652e..c15dc220620 100644 --- a/tests/core/crypto/test_core_crypto_kdf.odin +++ b/tests/core/crypto/test_core_crypto_kdf.odin @@ -161,7 +161,7 @@ test_pbkdf2 :: proc(t: ^testing.T) { testing.expectf( t, dst_str == v.dk, - "HMAC-%s: Expected: %s for input of (%s, %s, %d), but got %s instead", + "PBKDF2-%s: Expected: %s for input of (%s, %s, %d), but got %s instead", algo_name, v.dk, v.password,