1
- /* auto-generated on 2025-01-27 20:34:35 -0500. Do not edit! */
1
+ /* auto-generated on 2025-02-14 16:11:36 -0500. Do not edit! */
2
2
/* including simdjson.h: */
3
3
/* begin file simdjson.h */
4
4
#ifndef SIMDJSON_H
@@ -2437,7 +2437,7 @@ namespace std {
2437
2437
#define SIMDJSON_SIMDJSON_VERSION_H
2438
2438
2439
2439
/** The version of simdjson being used (major.minor.revision) */
2440
- #define SIMDJSON_VERSION "3.12.0 "
2440
+ #define SIMDJSON_VERSION "3.12.2 "
2441
2441
2442
2442
namespace simdjson {
2443
2443
enum {
@@ -2452,7 +2452,7 @@ enum {
2452
2452
/**
2453
2453
* The revision (major.minor.REVISION) of simdjson being used.
2454
2454
*/
2455
- SIMDJSON_VERSION_REVISION = 0
2455
+ SIMDJSON_VERSION_REVISION = 2
2456
2456
};
2457
2457
} // namespace simdjson
2458
2458
@@ -17948,14 +17948,18 @@ namespace simd {
17948
17948
17949
17949
// Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
17950
17950
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
17951
- // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
17951
+ // Only the first 64 - count_ones(mask) bytes of the result are significant but 64 bytes
17952
17952
// get written.
17953
17953
// Design consideration: it seems like a function with the
17954
17954
// signature simd8<L> compress(uint64_t mask) would be
17955
17955
// sensible, but the AVX ISA makes this kind of approach difficult.
17956
17956
template<typename L>
17957
17957
simdjson_inline void compress(uint64_t mask, L * output) const {
17958
- _mm512_mask_compressstoreu_epi8 (output,~mask,*this);
17958
+ // we deliberately avoid _mm512_mask_compressstoreu_epi8 for portability
17959
+ // (AMD Zen4 has terrible performance with it, it is effectively broken)
17960
+ // _mm512_mask_compressstoreu_epi8 (output,~mask,*this);
17961
+ __m512i compressed = _mm512_maskz_compress_epi8(~mask, *this);
17962
+ _mm512_storeu_si512(output, compressed); // could use a mask
17959
17963
}
17960
17964
17961
17965
template<typename L>
@@ -65401,14 +65405,18 @@ namespace simd {
65401
65405
65402
65406
// Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
65403
65407
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
65404
- // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
65408
+ // Only the first 64 - count_ones(mask) bytes of the result are significant but 64 bytes
65405
65409
// get written.
65406
65410
// Design consideration: it seems like a function with the
65407
65411
// signature simd8<L> compress(uint64_t mask) would be
65408
65412
// sensible, but the AVX ISA makes this kind of approach difficult.
65409
65413
template<typename L>
65410
65414
simdjson_inline void compress(uint64_t mask, L * output) const {
65411
- _mm512_mask_compressstoreu_epi8 (output,~mask,*this);
65415
+ // we deliberately avoid _mm512_mask_compressstoreu_epi8 for portability
65416
+ // (AMD Zen4 has terrible performance with it, it is effectively broken)
65417
+ // _mm512_mask_compressstoreu_epi8 (output,~mask,*this);
65418
+ __m512i compressed = _mm512_maskz_compress_epi8(~mask, *this);
65419
+ _mm512_storeu_si512(output, compressed); // could use a mask
65412
65420
}
65413
65421
65414
65422
template<typename L>
0 commit comments