Skip to content

Commit

Permalink
[Enhancement]simd for bit unpacking from arrow
Browse files Browse the repository at this point in the history
Signed-off-by: zombee0 <[email protected]>
  • Loading branch information
zombee0 committed Sep 14, 2024
1 parent 90ce1c2 commit 942814d
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 2 deletions.
2 changes: 2 additions & 0 deletions be/src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -1444,4 +1444,6 @@ CONF_mBool(enable_lake_compaction_use_partial_segments, "false");
// chunk size used by lake compaction
CONF_mInt32(lake_compaction_chunk_size, "4096");

// Whether BitPackingAdapter decodes full 32-value batches with the Arrow SIMD unpack
// kernels (only relevant on AVX2 / ARM NEON builds). When false, those batches are
// decoded one at a time through BitPacking::Unpack32Values instead.
CONF_mBool(enable_bit_unpack_simd, "true");

} // namespace starrocks::config
1 change: 0 additions & 1 deletion be/src/util/bit_packing.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ class BitPacking {
static const uint8_t* UnpackUpTo31Values(const uint8_t* __restrict__ in, int64_t in_bytes, int num_values,
OutType* __restrict__ out);

/// Compute the number of values with the given bit width that can be unpacked from
/// an input buffer of 'in_bytes' into an output buffer with space for 'num_values'.
/// Deliberately not private: BitPackingAdapter calls this to size its batches.
static int64_t NumValuesToUnpack(int bit_width, int64_t in_bytes, int64_t num_values);
Expand Down
140 changes: 140 additions & 0 deletions be/src/util/bit_packing_apapter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <climits>
#include <cstdint>
#include <utility>

#ifdef __ARM_NEON
#include <arrow/util/bpacking.h>
#include <arrow/util/bpacking_neon.h>
#endif
#ifdef __AVX2__
#include <arrow/util/bpacking.h>
#include <arrow/util/bpacking_avx2.h>
#endif

#include "common/config.h"
#include "common/logging.h"
#include "util/bit_packing.h"

namespace starrocks {

/// Front end for bit-unpacking. On AVX2 / ARM NEON builds, full 32-value batches can be
/// decoded with the Arrow SIMD unpack kernels; on every other build the call is forwarded
/// unchanged to the scalar BitPacking implementation.
class BitPackingAdapter {
public:
    /// Unpacks up to 'num_values' values of 'bit_width' bits each from 'in' into 'out'.
    ///
    /// @param bit_width  width in bits of each packed value; widths 0..64 are dispatched.
    /// @param in         packed input buffer.
    /// @param in_bytes   number of readable bytes at 'in'.
    /// @param num_values capacity of 'out', in values.
    /// @param out        destination array.
    /// @return on SIMD builds, (pointer just past the consumed input, number of values
    ///         written). A 'bit_width' outside 0..64 trips a DCHECK and yields (nullptr, -1).
    ///         On other builds, whatever BitPacking::UnpackValues returns.
    template <typename OutType>
    static std::pair<const uint8_t*, int64_t> UnpackValues(int bit_width, const uint8_t* __restrict__ in,
                                                           int64_t in_bytes, int64_t num_values,
                                                           OutType* __restrict__ out) {
#if defined(__AVX2__) || defined(__ARM_NEON)

#pragma push_macro("UNPACK_ADAPTER_VALUES_CASE")
#define UNPACK_ADAPTER_VALUES_CASE(ignore1, i, ignore2) \
    case i:                                             \
        return UnpackValues<OutType, i>(in, in_bytes, num_values, out);

        switch (bit_width) {
            // Expand cases from 0 to 64.
            BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_ADAPTER_VALUES_CASE, ignore);
        default:
            DCHECK(false);
            return std::make_pair(nullptr, -1);
        }
#pragma pop_macro("UNPACK_ADAPTER_VALUES_CASE")

#else

        return BitPacking::UnpackValues(bit_width, in, in_bytes, num_values, out);

#endif
    }

// Everything below references the Arrow SIMD kernels, so it is only compiled when one of
// them is available. Leaving it unguarded would make this header fail to build on targets
// that are supposed to use the scalar fallback above.
#if defined(__AVX2__) || defined(__ARM_NEON)

    /// Compile-time-width variant of UnpackValues().
    ///
    /// Decodes all full 32-value batches first (through the SIMD kernels when
    /// config::enable_bit_unpack_simd is set, otherwise through BitPacking::Unpack32Values),
    /// then the trailing partial batch through BitPacking::UnpackUpTo31Values.
    ///
    /// @return (pointer just past the consumed input, number of values written).
    template <typename OutType, int BIT_WIDTH>
    static std::pair<const uint8_t*, int64_t> UnpackValues(const uint8_t* __restrict__ in, int64_t in_bytes,
                                                           int64_t num_values, OutType* __restrict__ out) {
        constexpr int BATCH_SIZE = 32;
        const int64_t values_to_read = BitPacking::NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
        const int64_t batches_to_read = values_to_read / BATCH_SIZE;
        const int64_t remainder_values = values_to_read % BATCH_SIZE;
        const uint8_t* in_pos = in;
        OutType* out_pos = out;

        // First unpack as many full batches as possible.
        if (config::enable_bit_unpack_simd && batches_to_read > 0) {
            const int64_t simd_values = batches_to_read * BATCH_SIZE;
            const uint8_t* simd_end =
                    BitPackingAdapter::UnpackValues_32_SIMD<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos, simd_values);
            // Keep the cursors in step with what the SIMD path consumed and produced, so the
            // partial batch below is appended after the full batches instead of being
            // written over out[0..].
            in_bytes -= simd_end - in_pos;
            in_pos = simd_end;
            out_pos += simd_values;
        } else {
            for (int64_t i = 0; i < batches_to_read; ++i) {
                in_pos = BitPacking::Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
                out_pos += BATCH_SIZE;
                in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
            }
        }

        // Then unpack the final partial batch.
        if (remainder_values > 0) {
            in_pos = BitPacking::UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, remainder_values, out_pos);
        }
        return std::make_pair(in_pos, values_to_read);
    }

    /// Decodes 'num_values' values of BIT_WIDTH bits from 'in' into 'out' with the Arrow
    /// SIMD kernels and returns the input pointer advanced past the bytes consumed.
    ///
    /// The only caller in this class passes a positive multiple of 32 for 'num_values'.
    /// 'in_bytes' is accepted for signature symmetry and is not consulted here.
    ///
    /// Three strategies, chosen at compile time:
    ///  - 4-byte OutType: unpack straight into 'out' with the 32-bit kernel.
    ///  - 8-byte OutType with BIT_WIDTH > 32: unpack straight into 'out' with unpack64.
    ///  - anything else: unpack into a bounded uint32_t scratch buffer and convert
    ///    element by element.
    template <typename OutType, int BIT_WIDTH>
    static const uint8_t* UnpackValues_32_SIMD(const uint8_t* __restrict__ in, int64_t in_bytes,
                                               OutType* __restrict__ out, int64_t num_values) {
        // The Arrow kernels take an int count.
        DCHECK_LE(num_values, static_cast<int64_t>(INT_MAX));
        const int batch_size = static_cast<int>(num_values);

        if constexpr (sizeof(OutType) == 4) {
            const int num_unpacked = Unpack32Simd(reinterpret_cast<const uint32_t*>(in),
                                                  reinterpret_cast<uint32_t*>(out), batch_size, BIT_WIDTH);
            DCHECK(num_unpacked == batch_size);
            in += static_cast<int64_t>(num_unpacked) * BIT_WIDTH / CHAR_BIT;
        } else if constexpr (sizeof(OutType) == 8 && BIT_WIDTH > 32) {
            // Use unpack64 only if BIT_WIDTH is larger than 32
            // TODO (ARROW-13677): improve the performance of internal::unpack64
            // and remove the restriction of BIT_WIDTH
            const int num_unpacked =
                    arrow::internal::unpack64(in, reinterpret_cast<uint64_t*>(out), batch_size, BIT_WIDTH);
            DCHECK(num_unpacked == batch_size);
            in += static_cast<int64_t>(num_unpacked) * BIT_WIDTH / CHAR_BIT;
        } else {
            // TODO: revisit this limit if necessary
            DCHECK_LE(BIT_WIDTH, 32);
            // Scratch space for the 32-bit kernel. It must be a multiple of 32 so every
            // chunk handed to the kernel is a whole number of batches.
            constexpr int kScratchValues = 1024;
            uint32_t unpack_buffer[kScratchValues];

            // Never ask the kernel for more values than the scratch buffer can hold;
            // walk the request in scratch-sized chunks instead.
            int done = 0;
            while (done < batch_size) {
                const int chunk = std::min(kScratchValues, batch_size - done);
                const int num_unpacked =
                        Unpack32Simd(reinterpret_cast<const uint32_t*>(in), unpack_buffer, chunk, BIT_WIDTH);
                DCHECK(num_unpacked == chunk);
                if (num_unpacked <= 0) {
                    // No progress: bail out instead of spinning forever.
                    break;
                }
                for (int k = 0; k < num_unpacked; ++k) {
                    out[done + k] = static_cast<OutType>(unpack_buffer[k]);
                }
                in += static_cast<int64_t>(num_unpacked) * BIT_WIDTH / CHAR_BIT;
                done += num_unpacked;
            }
        }
        return in;
    }

private:
    /// Calls the Arrow 32-bit unpack kernel for the instruction set selected at compile
    /// time and returns the kernel's result (the count of values it unpacked).
    static int Unpack32Simd(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
#if defined(__AVX2__)
        return arrow::internal::unpack32_avx2(in, out, batch_size, num_bits);
#else
        return arrow::internal::unpack32_neon(in, out, batch_size, num_bits);
#endif
    }

#endif // defined(__AVX2__) || defined(__ARM_NEON)
};

} // namespace starrocks
4 changes: 3 additions & 1 deletion be/src/util/bit_stream_utils.inline.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "glog/logging.h"
#include "util/alignment.h"
#include "util/bit_packing.inline.h"
#include "util/bit_packing_apapter.h"
#include "util/bit_stream_utils.h"

using starrocks::BitUtil;
Expand Down Expand Up @@ -270,7 +271,8 @@ inline bool BatchedBitReader::skip_bytes(int num_bytes) {
/// Unpacks up to 'num_values' values of 'bit_width' bits each from the current buffer
/// position into 'v', moves the buffer position to the pointer returned by the unpacker,
/// and returns the number of values the unpacker reported.
template <typename T>
inline int BatchedBitReader::unpack_batch(int bit_width, int num_values, T* v) {
    int64_t num_read = 0;
    // Decode exactly once, through the adapter. A second call via BitPacking::UnpackValues
    // would advance _buffer_pos twice for a single batch.
    std::tie(_buffer_pos, num_read) =
            BitPackingAdapter::UnpackValues(bit_width, _buffer_pos, _bytes_left(), num_values, v);
    return static_cast<int>(num_read);
}

Expand Down

0 comments on commit 942814d

Please sign in to comment.