Skip to content

Commit

Permalink
[Enhancement]enhance bit unpacking with bmi2 instruction set
Browse files Browse the repository at this point in the history
Signed-off-by: zombee0 <[email protected]>
  • Loading branch information
zombee0 committed Sep 19, 2024
1 parent 8a34503 commit 9c9348f
Show file tree
Hide file tree
Showing 11 changed files with 1,049 additions and 21 deletions.
7 changes: 6 additions & 1 deletion be/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ option(USE_STAROS "Use StarOS to manager tablet info" OFF)

option(USE_SSE4_2 "Build with SSE4.2 instruction" ON)

option(USE_BMI_2 "Build with BMI2 instruction" ON)

option(USE_AVX2 "Build with AVX2 instruction" ON)

option(USE_AVX512 "Build with AVX512f/AVX512BW instruction" OFF)
Expand Down Expand Up @@ -660,6 +662,9 @@ if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}"
# the compiler will define __AVX2__
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2")
endif()
if (${USE_BMI_2})
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mbmi2")
endif()
if (${USE_AVX512})
# the compiler will define __AVX512F__ __AVX512BW__
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx512f -mavx512bw")
Expand Down Expand Up @@ -1044,7 +1049,7 @@ FUNCTION(ADD_BE_BENCH BENCH_NAME)

find_package(benchmark REQUIRED)
ADD_EXECUTABLE(${BENCH_FILE_NAME} ${BENCH_NAME}.cpp)
TARGET_LINK_LIBRARIES(${BENCH_FILE_NAME} ${TEST_LINK_LIBS} benchmark benchmark_main)
TARGET_LINK_LIBRARIES(${BENCH_FILE_NAME} benchmark benchmark_main ${TEST_LINK_LIBS})

SET_TARGET_PROPERTIES(${BENCH_FILE_NAME} PROPERTIES COMPILE_FLAGS "-fno-access-control")
SET_TARGET_PROPERTIES(${BENCH_FILE_NAME} PROPERTIES COMPILE_FLAGS ${CXX_FLAGS_RELEASE})
Expand Down
31 changes: 16 additions & 15 deletions be/src/bench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ADD_BE_BENCH(${SRC_DIR}/bench/chunks_sorter_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/runtime_filter_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/csv_reader_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/shuffle_chunk_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/block_cache_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/roaring_bitmap_mem_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/parquet_dict_decode_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/get_dict_codes_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/persistent_index_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/orc_column_reader_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/hash_functions_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/binary_column_copy_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/hyperscan_vec_bench)

ADD_BE_BENCH(${SRC_DIR}/bench/mem_equal_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/chunks_sorter_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/runtime_filter_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/csv_reader_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/shuffle_chunk_bench)
##ADD_BE_BENCH(${SRC_DIR}/bench/block_cache_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/roaring_bitmap_mem_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/parquet_dict_decode_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/get_dict_codes_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/persistent_index_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/orc_column_reader_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/hash_functions_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/binary_column_copy_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/hyperscan_vec_bench)
#
#ADD_BE_BENCH(${SRC_DIR}/bench/mem_equal_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/bit_unpack_bench)
418 changes: 418 additions & 0 deletions be/src/bench/bit_unpack_bench.cpp

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions be/src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -1450,4 +1450,6 @@ CONF_mBool(enable_lake_compaction_use_partial_segments, "false");
// chunk size used by lake compaction
CONF_mInt32(lake_compaction_chunk_size, "4096");

CONF_mBool(enable_bit_unpack_simd, "true");

} // namespace starrocks::config
2 changes: 1 addition & 1 deletion be/src/formats/orc/orc_chunk_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
#include "orc_schema_builder.h"
#include "simd/simd.h"
#include "types/logical_type.h"
#include "util/stack_util.cpp"
#include "util/stack_util.h"
#include "util/timezone_utils.h"

namespace starrocks {
Expand Down
1 change: 0 additions & 1 deletion be/src/util/bit_packing.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ class BitPacking {
static const uint8_t* UnpackUpTo31Values(const uint8_t* __restrict__ in, int64_t in_bytes, int num_values,
OutType* __restrict__ out);

private:
/// Compute the number of values with the given bit width that can be unpacked from
/// an input buffer of 'in_bytes' into an output buffer with space for 'num_values'.
static int64_t NumValuesToUnpack(int bit_width, int64_t in_bytes, int64_t num_values);
Expand Down
136 changes: 136 additions & 0 deletions be/src/util/bit_packing_apapter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cstdint>
#include <utility>

#ifdef __ARM_NEON
#include <arrow/util/bpacking.h>
#include <arrow/util/bpacking_neon.h>
#endif
#ifdef __AVX2__
#include <arrow/util/bpacking.h>
#include <arrow/util/bpacking_avx2.h>
#endif

#include "common/config.h"
#include "common/logging.h"
#include "util/bit_packing.h"
#include "util/bit_packing_simd.h"

namespace starrocks {

/// Front end for bit-unpacking that picks between the SIMD kernels
/// (starrocks::util::unpack, Arrow's unpack32/unpack64) and the scalar
/// BitPacking implementation.
class BitPackingAdapter {
public:
    /// Unpacks bit-packed values of 'bit_width' bits each from 'in' into 'out'.
    ///
    /// When config::enable_bit_unpack_simd is set and 'bit_width' is positive, the
    /// bulk of the input is decoded with starrocks::util::unpack in batches of 8
    /// values and the remaining tail is decoded with BitPacking::UnpackValues.
    /// Otherwise the whole request is forwarded to BitPacking::UnpackValues.
    ///
    /// @param bit_width  width in bits of every packed value
    /// @param in         start of the packed input
    /// @param in_bytes   number of readable bytes at 'in'
    /// @param num_values capacity of 'out', in values
    /// @param out        destination buffer
    /// @return (input pointer after decoding, number of values unpacked); on the SIMD
    ///         path the count is BitPacking::NumValuesToUnpack(bit_width, in_bytes, num_values).
    template <typename OutType>
    static std::pair<const uint8_t*, int64_t> UnpackValues(int bit_width, const uint8_t* __restrict__ in,
                                                           int64_t in_bytes, int64_t num_values,
                                                           OutType* __restrict__ out) {
        // The batch computation below divides by 'bit_width', so a zero width must not
        // reach it (integer division by zero is undefined behaviour). Zero-width input
        // is left to the scalar implementation.
        // NOTE(review): assumes BitPacking::UnpackValues accepts bit_width == 0 — confirm.
        if (!config::enable_bit_unpack_simd || bit_width <= 0) {
            return BitPacking::UnpackValues(bit_width, in, in_bytes, num_values, out);
        }

        const int64_t values_to_read = BitPacking::NumValuesToUnpack(bit_width, in_bytes, num_values);
        constexpr int BATCH_SIZE = 8;

        // First unpack as many full batches as possible. Only the part of the input that
        // forms whole 64-bit words is given to the SIMD kernel, to make sure it does not
        // access memory out of bounds.
        const int64_t whole_words = values_to_read * bit_width / 8 / 8;
        const int64_t batches_to_read = whole_words * 8 * 8 / bit_width / BATCH_SIZE;
        if (batches_to_read > 0) {
            starrocks::util::unpack(in, in_bytes, out, batches_to_read * BATCH_SIZE, bit_width);
            // A batch of 8 values occupies 8 * bit_width bits, i.e. exactly 'bit_width' bytes.
            const int64_t consumed_bytes = batches_to_read * bit_width;
            in_bytes -= consumed_bytes;
            in += consumed_bytes;
            out += batches_to_read * BATCH_SIZE;
        }

        // Then unpack the final partial batch with the scalar implementation.
        const int64_t remainder_values = values_to_read - batches_to_read * BATCH_SIZE;
        if (remainder_values > 0) {
            in = BitPacking::UnpackValues(bit_width, in, in_bytes, remainder_values, out).first;
        }
        return std::make_pair(in, values_to_read);
    }

    /// Runtime-width entry point: dispatches 'bit_width' (0..64) to the matching
    /// compile-time instantiation of UnpackValues_32_ARROW<OutType, BIT_WIDTH>.
    ///
    /// @return the input pointer after decoding, or nullptr when 'bit_width' is outside
    ///         0..64 (which also trips a DCHECK).
    template <typename OutType>
    static const uint8_t* UnpackValues_32_ARROW(const uint8_t* __restrict__ in, int64_t in_bytes,
                                                OutType* __restrict__ out, int64_t num_values, int bit_width) {
#pragma push_macro("UNPACK_ARROW_VALUES_CASE")
#define UNPACK_ARROW_VALUES_CASE(ignore1, i, ignore2) \
    case i:                                           \
        return UnpackValues_32_ARROW<OutType, i>(in, in_bytes, out, num_values);

        switch (bit_width) {
            // Expand cases from 0 to 64.
            BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_ARROW_VALUES_CASE, ignore);
        default:
            DCHECK(false);
            return nullptr;
        }
#pragma pop_macro("UNPACK_ARROW_VALUES_CASE")
    }

    /// Unpacks 'num_values' values of BIT_WIDTH bits each using Arrow's kernels.
    ///
    /// - 4-byte OutType: decoded directly into 'out' with unpack32_{avx2,neon}.
    /// - 8-byte OutType and BIT_WIDTH > 32: decoded directly with unpack64.
    /// - everything else: decoded through a 1024-entry uint32_t scratch buffer and
    ///   converted element by element to OutType.
    ///
    /// 'in_bytes' is not consulted here.
    /// NOTE(review): the DCHECKs require the Arrow kernel to unpack every requested
    /// value; callers presumably have to pass a count the kernel can fully consume
    /// (the function name suggests a multiple of 32) — confirm against callers.
    ///
    /// @return the input pointer advanced by the number of bytes decoded.
    template <typename OutType, int BIT_WIDTH>
    static const uint8_t* UnpackValues_32_ARROW(const uint8_t* __restrict__ in, int64_t in_bytes,
                                                OutType* __restrict__ out, int64_t num_values) {
        // The Arrow kernels take an 'int' count.
        const int batch_size = static_cast<int>(num_values);
        const int byte_width = 8;
        if constexpr (sizeof(OutType) == 4) {
#if defined(__AVX2__)
            int num_unpacked = arrow::internal::unpack32_avx2(reinterpret_cast<const uint32_t*>(in),
                                                              reinterpret_cast<uint32_t*>(out), batch_size, BIT_WIDTH);
#elif defined(__ARM_NEON)
            int num_unpacked = arrow::internal::unpack32_neon(reinterpret_cast<const uint32_t*>(in),
                                                              reinterpret_cast<uint32_t*>(out), batch_size, BIT_WIDTH);
#else
#error "Not supported instruction set"
#endif

            DCHECK(num_unpacked == batch_size);
            // Widen before multiplying so large counts cannot overflow 'int'.
            in += static_cast<int64_t>(num_unpacked) * BIT_WIDTH / byte_width;
        } else if constexpr (sizeof(OutType) == 8 && BIT_WIDTH > 32) {
            // Use unpack64 only if BIT_WIDTH is larger than 32
            // TODO (ARROW-13677): improve the performance of internal::unpack64
            // and remove the restriction of BIT_WIDTH
            int num_unpacked = arrow::internal::unpack64(in, reinterpret_cast<uint64_t*>(out), batch_size, BIT_WIDTH);
            DCHECK(num_unpacked == batch_size);
            in += static_cast<int64_t>(num_unpacked) * BIT_WIDTH / byte_width;
        } else {
            // TODO: revisit this limit if necessary
            DCHECK_LE(BIT_WIDTH, 32);
            const int buffer_size = 1024;
            uint32_t unpack_buffer[buffer_size];

            // Decode in chunks of at most 'buffer_size' values, converting each chunk
            // from uint32_t to OutType as it is copied out.
            int64_t decoded = 0;
            while (decoded < batch_size) {
                const int64_t remaining = batch_size - decoded;
                const int64_t size = remaining > buffer_size ? buffer_size : remaining;
#if defined(__AVX2__)
                int num_unpacked = arrow::internal::unpack32_avx2(reinterpret_cast<const uint32_t*>(in), unpack_buffer,
                                                                  static_cast<int>(size), BIT_WIDTH);
#elif defined(__ARM_NEON)
                int num_unpacked = arrow::internal::unpack32_neon(reinterpret_cast<const uint32_t*>(in), unpack_buffer,
                                                                  static_cast<int>(size), BIT_WIDTH);
#else
#error "Not supported instruction set"
#endif
                DCHECK(num_unpacked == size);
                for (int64_t k = 0; k < size; ++k) {
                    out[decoded + k] = static_cast<OutType>(unpack_buffer[k]);
                }
                in += static_cast<int64_t>(num_unpacked) * BIT_WIDTH / byte_width;
                decoded += size;
            }
        }
        return in;
    }
};

} // namespace starrocks
Loading

0 comments on commit 9c9348f

Please sign in to comment.