Skip to content

Commit

Permalink
[Enhancement]enhance bit unpacking with bmi2 instruction set
Browse files Browse the repository at this point in the history
Signed-off-by: zombee0 <[email protected]>
  • Loading branch information
zombee0 committed Sep 19, 2024
1 parent 8a34503 commit 84d0a70
Show file tree
Hide file tree
Showing 14 changed files with 1,177 additions and 21 deletions.
7 changes: 6 additions & 1 deletion be/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ option(USE_STAROS "Use StarOS to manager tablet info" OFF)

option(USE_SSE4_2 "Build with SSE4.2 instruction" ON)

option(USE_BMI_2 "Build with BMI2 instruction" ON)

option(USE_AVX2 "Build with AVX2 instruction" ON)

option(USE_AVX512 "Build with AVX512f/AVX512BW instruction" OFF)
Expand Down Expand Up @@ -660,6 +662,9 @@ if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}"
# the compiler will define __AVX2__
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2")
endif()
if (${USE_BMI_2})
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mbmi2")
endif()
if (${USE_AVX512})
# the compiler will define __AVX512F__ __AVX512BW__
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx512f -mavx512bw")
Expand Down Expand Up @@ -1044,7 +1049,7 @@ FUNCTION(ADD_BE_BENCH BENCH_NAME)

find_package(benchmark REQUIRED)
ADD_EXECUTABLE(${BENCH_FILE_NAME} ${BENCH_NAME}.cpp)
TARGET_LINK_LIBRARIES(${BENCH_FILE_NAME} ${TEST_LINK_LIBS} benchmark benchmark_main)
TARGET_LINK_LIBRARIES(${BENCH_FILE_NAME} benchmark benchmark_main ${TEST_LINK_LIBS})

SET_TARGET_PROPERTIES(${BENCH_FILE_NAME} PROPERTIES COMPILE_FLAGS "-fno-access-control")
SET_TARGET_PROPERTIES(${BENCH_FILE_NAME} PROPERTIES COMPILE_FLAGS ${CXX_FLAGS_RELEASE})
Expand Down
31 changes: 16 additions & 15 deletions be/src/bench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ADD_BE_BENCH(${SRC_DIR}/bench/chunks_sorter_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/runtime_filter_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/csv_reader_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/shuffle_chunk_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/block_cache_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/roaring_bitmap_mem_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/parquet_dict_decode_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/get_dict_codes_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/persistent_index_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/orc_column_reader_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/hash_functions_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/binary_column_copy_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/hyperscan_vec_bench)

ADD_BE_BENCH(${SRC_DIR}/bench/mem_equal_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/chunks_sorter_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/runtime_filter_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/csv_reader_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/shuffle_chunk_bench)
##ADD_BE_BENCH(${SRC_DIR}/bench/block_cache_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/roaring_bitmap_mem_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/parquet_dict_decode_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/get_dict_codes_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/persistent_index_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/orc_column_reader_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/hash_functions_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/binary_column_copy_bench)
#ADD_BE_BENCH(${SRC_DIR}/bench/hyperscan_vec_bench)
#
#ADD_BE_BENCH(${SRC_DIR}/bench/mem_equal_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/bit_unpack_bench)
86 changes: 86 additions & 0 deletions be/src/bench/bit_copy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>
#include <cstdint>
#include <cstring>

// TODO: BitCopy is currently used only by tests and benchmarks to prepare
// raw data, so its performance has not been measured or tuned.
// Helpers for reading, writing and copying bit fields that start at an
// arbitrary bit offset inside a buffer.
//
// Bit 'i' of a buffer is bit (i % 8) of byte (i / 8). Multi-byte words are
// moved with their native byte order, so the contiguous-bit arithmetic below
// is only correct on a little-endian host.
//
// All word accesses go through std::memcpy: the byte address derived from a
// bit offset is generally not aligned for T, and dereferencing a casted
// pointer there would be undefined behaviour.
struct BitCopy {
    // Returns at least 'numBits' bits of data starting at bit 'bitOffset'
    // from 'source'. T must be at least 'numBits' wide. If 'numBits' bits
    // from 'bitOffset' do not fit in a single T, loads the next byte to get
    // the extra bits.
    //
    // Always reads sizeof(T) bytes starting at byte 'bitOffset / 8' (plus one
    // more byte when the field straddles the word), so the caller must make
    // sure those bytes are readable. Bits above 'numBits' in the result are
    // unspecified source data, not zero.
    template <typename T>
    static inline T loadBits(const uint64_t* source, uint64_t bitOffset, uint8_t numBits) {
        constexpr uint64_t kBitSize = 8 * sizeof(T);
        const uint8_t* bytes = reinterpret_cast<const uint8_t*>(source) + bitOffset / 8;

        // Alignment-safe load of the word that contains the first bit.
        T word;
        std::memcpy(&word, bytes, sizeof(T));

        const uint64_t bit = bitOffset & 7;
        if (bit == 0) {
            return word;
        }
        if (numBits + bit <= kBitSize) {
            return static_cast<T>(word >> bit);
        }

        // The field spills into the byte right after the word: its low bits
        // become the high bits of the result.
        const uint8_t lastByte = bytes[sizeof(T)];
        const uint64_t lastBits = static_cast<uint64_t>(lastByte) << (kBitSize - bit);
        return static_cast<T>((word >> bit) | lastBits);
    }

    // Stores the 'numBits' low bits of 'word' into bits starting at the
    // 'offset'th bit from 'target'. T must be at least 'numBits' wide
    // (and 'numBits' must not exceed 64). If the bit field that is stored
    // overflows a word of T, writes the trailing bits in the low bits of the
    // next byte. Preserves all bits below and above the written bits.
    //
    // Performs a read-modify-write of sizeof(T) bytes starting at byte
    // 'offset / 8' (plus the following byte on overflow); those bytes must be
    // writable.
    template <typename T>
    static inline void storeBits(uint64_t* target, uint64_t offset, uint64_t word, uint8_t numBits) {
        constexpr uint64_t kBitSize = 8 * sizeof(T);
        uint8_t* bytes = reinterpret_cast<uint8_t*>(target) + offset / 8;
        const uint64_t bitOffset = offset & 7;

        // uint64_t literals (not 1UL) keep the shift well-defined where
        // 'unsigned long' is only 32 bits wide.
        const uint64_t fieldMask = (numBits == 64) ? ~uint64_t{0} : ((uint64_t{1} << numBits) - 1);
        const uint64_t mask = fieldMask << bitOffset;

        // Alignment-safe read-modify-write of the first word.
        T current;
        std::memcpy(&current, bytes, sizeof(T));
        current = static_cast<T>((current & ~mask) | (mask & (word << bitOffset)));
        std::memcpy(bytes, &current, sizeof(T));

        if (numBits + bitOffset > kBitSize) {
            // Remaining high bits of the field go to the low bits of the
            // byte that follows the word.
            uint8_t* lastByteAddress = bytes + sizeof(T);
            const uint64_t lastByteBits = bitOffset + numBits - kBitSize;
            const uint8_t lastByteMask = static_cast<uint8_t>((1u << lastByteBits) - 1);
            *lastByteAddress = static_cast<uint8_t>((*lastByteAddress & ~lastByteMask) |
                                                    (lastByteMask & (word >> (kBitSize - bitOffset))));
        }
    }

    // Copies 'numBits' bits from bit 'sourceOffset' of 'source' to bit
    // 'targetOffset' of 'target'. Bits of 'target' outside the destination
    // range are preserved. Copies in 64-bit chunks first, then at most one
    // 32-bit and one 16-bit chunk, then the remainder in pieces of up to 8
    // bits. A 'numBits' of 0 is a no-op.
    static inline void copyBits(const uint64_t* source, uint64_t sourceOffset, uint64_t* target, uint64_t targetOffset,
                                uint64_t numBits) {
        uint64_t i = 0;
        for (; i + 64 <= numBits; i += 64) {
            const uint64_t word = loadBits<uint64_t>(source, sourceOffset + i, 64);
            storeBits<uint64_t>(target, targetOffset + i, word, 64);
        }
        if (i + 32 <= numBits) {
            const uint32_t word = loadBits<uint32_t>(source, sourceOffset + i, 32);
            storeBits<uint32_t>(target, targetOffset + i, word, 32);
            i += 32;
        }
        if (i + 16 <= numBits) {
            const uint16_t word = loadBits<uint16_t>(source, sourceOffset + i, 16);
            storeBits<uint16_t>(target, targetOffset + i, word, 16);
            i += 16;
        }
        for (; i < numBits; i += 8) {
            // Final piece may be shorter than a byte.
            const auto chunkBits = static_cast<uint8_t>(std::min<uint64_t>(numBits - i, 8));
            const uint8_t word = loadBits<uint8_t>(source, sourceOffset + i, chunkBits);
            storeBits<uint8_t>(target, targetOffset + i, word, chunkBits);
        }
    }
};
Loading

0 comments on commit 84d0a70

Please sign in to comment.