-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Enhancement]enhance bit unpacking with bmi2 instruction set
Signed-off-by: zombee0 <[email protected]>
- Loading branch information
Showing
11 changed files
with
1,049 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
// Copyright 2021-present StarRocks, Inc. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#pragma once | ||
|
||
#include <algorithm>
#include <cstdint>
#include <limits>
#include <utility>

#ifdef __ARM_NEON
#include <arrow/util/bpacking.h>
#include <arrow/util/bpacking_neon.h>
#endif
#ifdef __AVX2__
#include <arrow/util/bpacking.h>
#include <arrow/util/bpacking_avx2.h>
#endif

#include "common/config.h"
#include "common/logging.h"
#include "util/bit_packing.h"
#include "util/bit_packing_simd.h"
|
||
namespace starrocks { | ||
|
||
class BitPackingAdapter { | ||
public: | ||
template <typename OutType> | ||
static std::pair<const uint8_t*, int64_t> UnpackValues(int bit_width, const uint8_t* __restrict__ in, | ||
int64_t in_bytes, int64_t num_values, | ||
OutType* __restrict__ out) { | ||
if (config::enable_bit_unpack_simd) { | ||
// First unpack as many full batches as possible. | ||
const int64_t values_to_read = BitPacking::NumValuesToUnpack(bit_width, in_bytes, num_values); | ||
constexpr int BATCH_SIZE = 8; | ||
// make sure don't access memory out of bound. | ||
const int64_t batches_to_read = values_to_read * bit_width / 8 / 8 * 8 * 8 / bit_width / BATCH_SIZE; | ||
if (batches_to_read > 0) { | ||
starrocks::util::unpack(in, in_bytes, out, batches_to_read * BATCH_SIZE, bit_width); | ||
in_bytes -= batches_to_read * bit_width; | ||
in += batches_to_read * bit_width; | ||
out += batches_to_read * BATCH_SIZE; | ||
} | ||
|
||
const int64_t remainder_values = values_to_read - batches_to_read * BATCH_SIZE; | ||
// Then unpack the final partial batch. | ||
if (remainder_values > 0) { | ||
in = BitPacking::UnpackValues(bit_width, in, in_bytes, remainder_values, out).first; | ||
} | ||
return std::make_pair(in, values_to_read); | ||
} else { | ||
return BitPacking::UnpackValues(bit_width, in, in_bytes, num_values, out); | ||
} | ||
} | ||
|
||
template <typename OutType> | ||
static const uint8_t* UnpackValues_32_ARROW(const uint8_t* __restrict__ in, int64_t in_bytes, | ||
OutType* __restrict__ out, int64_t num_values, int bit_width) { | ||
#pragma push_macro("UNPACK_ARROW_VALUES_CASE") | ||
#define UNPACK_ARROW_VALUES_CASE(ignore1, i, ignore2) \ | ||
case i: \ | ||
return UnpackValues_32_ARROW<OutType, i>(in, in_bytes, out, num_values); | ||
|
||
switch (bit_width) { | ||
// Expand cases from 0 to 64. | ||
BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_ARROW_VALUES_CASE, ignore); | ||
default: | ||
DCHECK(false); | ||
return nullptr; | ||
} | ||
#pragma pop_macro("UNPACK_ARROW_VALUES_CASE") | ||
} | ||
|
||
template <typename OutType, int BIT_WIDTH> | ||
static const uint8_t* UnpackValues_32_ARROW(const uint8_t* __restrict__ in, int64_t in_bytes, | ||
OutType* __restrict__ out, int64_t num_values) { | ||
int batch_size = num_values; | ||
const int byte_width = 8; | ||
if constexpr (sizeof(OutType) == 4) { | ||
#if defined(__AVX2__) | ||
int num_unpacked = arrow::internal::unpack32_avx2(reinterpret_cast<const uint32_t*>(in), | ||
reinterpret_cast<uint32_t*>(out), batch_size, BIT_WIDTH); | ||
#elif defined(__ARM_NEON) | ||
int num_unpacked = arrow::internal::unpack32_neon(reinterpret_cast<const uint32_t*>(in), | ||
reinterpret_cast<uint32_t*>(out), batch_size, BIT_WIDTH); | ||
#else | ||
#error "Not supported instruction set" | ||
#endif | ||
|
||
DCHECK(num_unpacked == batch_size); | ||
in += num_unpacked * BIT_WIDTH / byte_width; | ||
} else if constexpr (sizeof(OutType) == 8 && BIT_WIDTH > 32) { | ||
// Use unpack64 only if BIT_WIDTH is larger than 32 | ||
// TODO (ARROW-13677): improve the performance of internal::unpack64 | ||
// and remove the restriction of BIT_WIDTH | ||
int num_unpacked = arrow::internal::unpack64(in, reinterpret_cast<uint64_t*>(out), batch_size, BIT_WIDTH); | ||
DCHECK(num_unpacked == batch_size); | ||
in += num_unpacked * BIT_WIDTH / byte_width; | ||
} else { | ||
// TODO: revisit this limit if necessary | ||
DCHECK_LE(BIT_WIDTH, 32); | ||
const int buffer_size = 1024; | ||
uint32_t unpack_buffer[buffer_size]; | ||
|
||
int64_t decoded = 0; | ||
while (decoded < batch_size) { | ||
auto size = batch_size - decoded > buffer_size ? buffer_size : batch_size - decoded; | ||
#if defined(__AVX2__) | ||
int num_unpacked = arrow::internal::unpack32_avx2(reinterpret_cast<const uint32_t*>(in), unpack_buffer, | ||
size, BIT_WIDTH); | ||
#elif defined(__ARM_NEON) | ||
int num_unpacked = arrow::internal::unpack32_neon(reinterpret_cast<const uint32_t*>(in), unpack_buffer, | ||
size, BIT_WIDTH); | ||
#else | ||
#error "Not supported instruction set" | ||
#endif | ||
DCHECK(num_unpacked == size); | ||
for (int k = 0; k < size; ++k) { | ||
out[decoded + k] = static_cast<OutType>(unpack_buffer[k]); | ||
} | ||
in += num_unpacked * BIT_WIDTH / byte_width; | ||
decoded += size; | ||
} | ||
} | ||
return in; | ||
} | ||
}; | ||
|
||
} // namespace starrocks |
Oops, something went wrong.