From 942814d53d115807cf457285220bc4d2484b03dc Mon Sep 17 00:00:00 2001
From: zombee0
Date: Tue, 10 Sep 2024 12:56:31 +0800
Subject: [PATCH] [Enhancement]simd for bit unpacking from arrow

Signed-off-by: zombee0
---
 be/src/common/config.h                |   2 +
 be/src/util/bit_packing.h             |   1 -
 be/src/util/bit_packing_apapter.h     | 183 ++++++++++++++++++++++++++
 be/src/util/bit_stream_utils.inline.h |   4 +-
 4 files changed, 188 insertions(+), 2 deletions(-)
 create mode 100644 be/src/util/bit_packing_apapter.h

diff --git a/be/src/common/config.h b/be/src/common/config.h
index 97e7fe81f92b5a..126d0e83abe0b4 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1444,4 +1444,6 @@ CONF_mBool(enable_lake_compaction_use_partial_segments, "false");
 // chunk size used by lake compaction
 CONF_mInt32(lake_compaction_chunk_size, "4096");
 
+CONF_mBool(enable_bit_unpack_simd, "true");
+
 } // namespace starrocks::config
diff --git a/be/src/util/bit_packing.h b/be/src/util/bit_packing.h
index 7d95c56f834225..397b8222eae397 100644
--- a/be/src/util/bit_packing.h
+++ b/be/src/util/bit_packing.h
@@ -62,7 +62,6 @@ class BitPacking {
     static const uint8_t* UnpackUpTo31Values(const uint8_t* __restrict__ in, int64_t in_bytes, int num_values,
                                              OutType* __restrict__ out);
 
-private:
     /// Compute the number of values with the given bit width that can be unpacked from
     /// an input buffer of 'in_bytes' into an output buffer with space for 'num_values'.
     static int64_t NumValuesToUnpack(int bit_width, int64_t in_bytes, int64_t num_values);
diff --git a/be/src/util/bit_packing_apapter.h b/be/src/util/bit_packing_apapter.h
new file mode 100644
index 00000000000000..fcc0daa2aef324
--- /dev/null
+++ b/be/src/util/bit_packing_apapter.h
@@ -0,0 +1,183 @@
+// Copyright 2021-present StarRocks, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// NOTE(review): these <...> header names are inferred from the arrow::internal symbols used below -- confirm.
+#ifdef __ARM_NEON
+#include <arrow/util/bpacking.h>
+#include <arrow/util/bpacking_neon.h>
+#endif
+#ifdef __AVX2__
+#include <arrow/util/bpacking.h>
+#include <arrow/util/bpacking_avx2.h>
+#endif
+
+#include <boost/preprocessor/repetition/repeat_from_to.hpp>
+#include <climits>
+#include <cstdint>
+#include <utility>
+
+#include "common/config.h"
+#include "common/logging.h"
+#include "util/bit_packing.h"
+
+namespace starrocks {
+
+/// Front end for BitPacking::UnpackValues() that decodes full 32-value batches with
+/// Arrow's AVX2/NEON kernels when one is compiled in and config::enable_bit_unpack_simd
+/// is set, and uses the BitPacking routines for everything else.
+class BitPackingAdapter {
+public:
+    /// Unpacks values of 'bit_width' bits from the 'in_bytes' bytes at 'in' into 'out',
+    /// which has room for 'num_values' values. With AVX2/NEON this dispatches the runtime
+    /// 'bit_width' (0..64) to the compile-time overload below; otherwise it forwards to
+    /// BitPacking::UnpackValues().
+    /// Returns the advanced input pointer and the number of values unpacked. A 'bit_width'
+    /// outside 0..64 trips a DCHECK and yields {nullptr, -1}.
+    template <typename OutType>
+    static std::pair<const uint8_t*, int64_t> UnpackValues(int bit_width, const uint8_t* __restrict__ in,
+                                                           int64_t in_bytes, int64_t num_values,
+                                                           OutType* __restrict__ out) {
+#if defined(__AVX2__) || defined(__ARM_NEON)
+
+#pragma push_macro("UNPACK_ADAPTER_VALUES_CASE")
+#define UNPACK_ADAPTER_VALUES_CASE(ignore1, i, ignore2) \
+    case i:                                             \
+        return UnpackValues<OutType, i>(in, in_bytes, num_values, out);
+
+        switch (bit_width) {
+            // Expand cases from 0 to 64.
+            BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_ADAPTER_VALUES_CASE, ignore);
+        default:
+            DCHECK(false);
+            return std::make_pair(nullptr, -1);
+        }
+#pragma pop_macro("UNPACK_ADAPTER_VALUES_CASE")
+
+#else
+
+        return BitPacking::UnpackValues(bit_width, in, in_bytes, num_values, out);
+
+#endif
+    }
+
+    /// Same contract as above with the bit width fixed at compile time. The number of values
+    /// decoded is BitPacking::NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values): full batches
+    /// of 32 go through UnpackValues_32_SIMD() or BitPacking::Unpack32Values(), the remaining
+    /// (< 32) values through BitPacking::UnpackUpTo31Values().
+    template <typename OutType, int BIT_WIDTH>
+    static std::pair<const uint8_t*, int64_t> UnpackValues(const uint8_t* __restrict__ in, int64_t in_bytes,
+                                                           int64_t num_values, OutType* __restrict__ out) {
+        constexpr int BATCH_SIZE = 32;
+        const int64_t values_to_read = BitPacking::NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
+        const int64_t batches_to_read = values_to_read / BATCH_SIZE;
+        const int64_t remainder_values = values_to_read % BATCH_SIZE;
+        const uint8_t* in_pos = in;
+        OutType* out_pos = out;
+
+        // First unpack as many full batches as possible.
+        if (config::enable_bit_unpack_simd && batches_to_read > 0) {
+            const int64_t batched_values = batches_to_read * BATCH_SIZE;
+            in_pos = UnpackValues_32_SIMD<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos, batched_values);
+            // Advance the output cursor and shrink the byte budget to match the input cursor;
+            // otherwise the partial batch below would be written over out[0..] and bounded by
+            // a stale 'in_bytes'.
+            out_pos += batched_values;
+            in_bytes -= (batched_values * BIT_WIDTH) / CHAR_BIT;
+        } else {
+            for (int64_t i = 0; i < batches_to_read; ++i) {
+                in_pos = BitPacking::Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
+                out_pos += BATCH_SIZE;
+                in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
+            }
+        }
+
+        // Then unpack the final partial batch.
+        if (remainder_values > 0) {
+            in_pos = BitPacking::UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, remainder_values, out_pos);
+        }
+        return std::make_pair(in_pos, values_to_read);
+    }
+
+    /// Unpacks 'num_values' values of BIT_WIDTH bits from 'in' into 'out' and returns the
+    /// advanced input pointer. 'num_values' must be a multiple of 32 and fit in an int (the
+    /// Arrow kernels take an int count). Builds without AVX2/NEON decode with
+    /// BitPacking::Unpack32Values() instead, so this header compiles on every target.
+    template <typename OutType, int BIT_WIDTH>
+    static const uint8_t* UnpackValues_32_SIMD(const uint8_t* __restrict__ in, int64_t in_bytes,
+                                               OutType* __restrict__ out, int64_t num_values) {
+        constexpr int BATCH_SIZE = 32;
+        DCHECK(num_values % BATCH_SIZE == 0);
+#if defined(__AVX2__) || defined(__ARM_NEON)
+        DCHECK_LE(num_values, INT_MAX);
+        const int batch_size = static_cast<int>(num_values);
+        constexpr int byte_width = 8;
+        if constexpr (sizeof(OutType) == 4) {
+            const int num_unpacked = SimdUnpack32(reinterpret_cast<const uint32_t*>(in),
+                                                  reinterpret_cast<uint32_t*>(out), batch_size, BIT_WIDTH);
+            DCHECK(num_unpacked == batch_size);
+            in += static_cast<int64_t>(num_unpacked) * BIT_WIDTH / byte_width;
+        } else if constexpr (sizeof(OutType) == 8 && BIT_WIDTH > 32) {
+            // Use unpack64 only if BIT_WIDTH is larger than 32
+            // TODO (ARROW-13677): improve the performance of internal::unpack64
+            // and remove the restriction of BIT_WIDTH
+            const int num_unpacked =
+                    arrow::internal::unpack64(in, reinterpret_cast<uint64_t*>(out), batch_size, BIT_WIDTH);
+            DCHECK(num_unpacked == batch_size);
+            in += static_cast<int64_t>(num_unpacked) * BIT_WIDTH / byte_width;
+        } else {
+            // TODO: revisit this limit if necessary
+            DCHECK_LE(BIT_WIDTH, 32);
+            // The staging buffer holds exactly one batch, so decode batch by batch instead of
+            // handing the kernel the whole 'batch_size' (which would overrun the buffer).
+            uint32_t unpack_buffer[BATCH_SIZE];
+            const int num_batches = batch_size / BATCH_SIZE;
+            for (int b = 0; b < num_batches; ++b) {
+                const int num_unpacked =
+                        SimdUnpack32(reinterpret_cast<const uint32_t*>(in), unpack_buffer, BATCH_SIZE, BIT_WIDTH);
+                DCHECK(num_unpacked == BATCH_SIZE);
+                for (int k = 0; k < num_unpacked; ++k) {
+                    out[k] = static_cast<OutType>(unpack_buffer[k]);
+                }
+                out += num_unpacked;
+                in += num_unpacked * BIT_WIDTH / byte_width;
+            }
+        }
+#else
+        // No SIMD kernel is compiled in: decode one batch at a time with the scalar kernel.
+        for (int64_t i = 0; i < num_values / BATCH_SIZE; ++i) {
+            in = BitPacking::Unpack32Values<OutType, BIT_WIDTH>(in, in_bytes, out);
+            out += BATCH_SIZE;
+            in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
+        }
+#endif
+        return in;
+    }
+
+private:
+#if defined(__AVX2__)
+    /// Forwards to Arrow's AVX2 32-bit unpack kernel (AVX2 wins if both macros are defined).
+    static int SimdUnpack32(const uint32_t* in, uint32_t* out, int batch_size, int bit_width) {
+        return arrow::internal::unpack32_avx2(in, out, batch_size, bit_width);
+    }
+#elif defined(__ARM_NEON)
+    /// Forwards to Arrow's NEON 32-bit unpack kernel.
+    static int SimdUnpack32(const uint32_t* in, uint32_t* out, int batch_size, int bit_width) {
+        return arrow::internal::unpack32_neon(in, out, batch_size, bit_width);
+    }
+#endif
+};
+
+} // namespace starrocks
diff --git a/be/src/util/bit_stream_utils.inline.h b/be/src/util/bit_stream_utils.inline.h
index 96b8af0d7d6f1d..109d3a57785cdb 100644
--- a/be/src/util/bit_stream_utils.inline.h
+++ b/be/src/util/bit_stream_utils.inline.h
@@ -38,6 +38,7 @@
 #include "glog/logging.h"
 #include "util/alignment.h"
 #include "util/bit_packing.inline.h"
+#include "util/bit_packing_apapter.h"
 #include "util/bit_stream_utils.h"
 
 using starrocks::BitUtil;
@@ -270,7 +271,8 @@ inline bool BatchedBitReader::skip_bytes(int num_bytes) {
 template <typename T>
 inline int BatchedBitReader::unpack_batch(int bit_width, int num_values, T* v) {
     int64_t num_read;
-    std::tie(_buffer_pos, num_read) = BitPacking::UnpackValues(bit_width, _buffer_pos, _bytes_left(), num_values, v);
+    std::tie(_buffer_pos, num_read) =
+            BitPackingAdapter::UnpackValues(bit_width, _buffer_pos, _bytes_left(), num_values, v);
     return static_cast<int>(num_read);
 }
 