Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement]enhance bit unpacking with bmi2 instruction set #50904

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion be/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ option(USE_STAROS "Use StarOS to manager tablet info" OFF)

option(USE_SSE4_2 "Build with SSE4.2 instruction" ON)

option(USE_BMI_2 "Build with BMI2 instruction" ON)

option(USE_AVX2 "Build with AVX2 instruction" ON)

option(USE_AVX512 "Build with AVX512f/AVX512BW instruction" OFF)
Expand Down Expand Up @@ -660,6 +662,9 @@ if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}"
# the compiler will define __AVX2__
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2")
endif()
if (${USE_BMI_2})
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mbmi2")
endif()
if (${USE_AVX512})
# the compiler will define __AVX512F__ __AVX512BW__
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx512f -mavx512bw")
Expand Down Expand Up @@ -1044,7 +1049,7 @@ FUNCTION(ADD_BE_BENCH BENCH_NAME)

find_package(benchmark REQUIRED)
ADD_EXECUTABLE(${BENCH_FILE_NAME} ${BENCH_NAME}.cpp)
TARGET_LINK_LIBRARIES(${BENCH_FILE_NAME} ${TEST_LINK_LIBS} benchmark benchmark_main)
TARGET_LINK_LIBRARIES(${BENCH_FILE_NAME} benchmark benchmark_main ${TEST_LINK_LIBS})

SET_TARGET_PROPERTIES(${BENCH_FILE_NAME} PROPERTIES COMPILE_FLAGS "-fno-access-control")
SET_TARGET_PROPERTIES(${BENCH_FILE_NAME} PROPERTIES COMPILE_FLAGS ${CXX_FLAGS_RELEASE})
Expand Down
2 changes: 1 addition & 1 deletion be/src/bench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,5 @@ ADD_BE_BENCH(${SRC_DIR}/bench/orc_column_reader_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/hash_functions_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/binary_column_copy_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/hyperscan_vec_bench)

ADD_BE_BENCH(${SRC_DIR}/bench/mem_equal_bench)
ADD_BE_BENCH(${SRC_DIR}/bench/bit_unpack_bench)
86 changes: 86 additions & 0 deletions be/src/bench/bit_copy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstdint>

//// TODO, up to now, Bit Copy is only used in test and bench for preparing the
//// raw data, so we don't check the performance.
struct BitCopy {
// Returns at least 'numBits' bits of data starting at bit 'bitOffset'
// from 'source'. T must be at least 'numBits' wide. If 'numBits' bits
// from 'bitIffset' do not in T, loads the next byte to get the extra
// bits.
template <typename T>
static inline T loadBits(const uint64_t* source, uint64_t bitOffset, uint8_t numBits) {
constexpr int32_t kBitSize = 8 * sizeof(T);
auto address = reinterpret_cast<uint64_t>(source) + bitOffset / 8;
T word = *reinterpret_cast<const T*>(address);
auto bit = bitOffset & 7;
if (!bit) {
return word;
}
if (numBits + bit <= kBitSize) {
return word >> bit;
}
uint8_t lastByte = reinterpret_cast<const uint8_t*>(address)[sizeof(T)];
uint64_t lastBits = static_cast<T>(lastByte) << (kBitSize - bit);
return (word >> bit) | lastBits;
}

// Stores the 'numBits' low bits of 'word' into bits starting at the
// 'bitOffset'th bit from target. T must be at least 'numBits'
// wide. If the bit field that is stored overflows a word of T, writes
// the trailing bits in the low bits of the next byte. Preserves all
// bits below and above the written bits.
template <typename T>
static inline void storeBits(uint64_t* target, uint64_t offset, uint64_t word, uint8_t numBits) {
constexpr int32_t kBitSize = 8 * sizeof(T);
T* address = reinterpret_cast<T*>(reinterpret_cast<uint64_t>(target) + (offset / 8));
auto bitOffset = offset & 7;
uint64_t mask = (numBits == 64 ? ~0UL : ((1UL << numBits) - 1)) << bitOffset;
*address = (*address & ~mask) | (mask & (word << bitOffset));
if (numBits + bitOffset > kBitSize) {
uint8_t* lastByteAddress = reinterpret_cast<uint8_t*>(address) + sizeof(T);
uint8_t lastByteBits = bitOffset + numBits - kBitSize;
uint8_t lastByteMask = (1 << lastByteBits) - 1;
*lastByteAddress = (*lastByteAddress & ~lastByteMask) | (lastByteMask & (word >> (kBitSize - bitOffset)));
}
}

// Copies a string of bits between locations in memory given by an
// address and a bit offset for source and destination.
static inline void copyBits(const uint64_t* source, uint64_t sourceOffset, uint64_t* target, uint64_t targetOffset,
uint64_t numBits) {
uint64_t i = 0;
for (; i + 64 <= numBits; i += 64) {
uint64_t word = loadBits<uint64_t>(source, i + sourceOffset, 64);
storeBits<uint64_t>(target, targetOffset + i, word, 64);
}
if (i + 32 <= numBits) {
auto lastWord = loadBits<uint32_t>(source, sourceOffset + i, 32);
storeBits<uint32_t>(target, targetOffset + i, lastWord, 32);
i += 32;
}
if (i + 16 <= numBits) {
auto lastWord = loadBits<uint16_t>(source, sourceOffset + i, 16);
storeBits<uint16_t>(target, targetOffset + i, lastWord, 16);
i += 16;
}
for (; i < numBits; i += 8) {
auto copyBits = std::min<uint64_t>(numBits - i, 8);
auto lastWord = loadBits<uint8_t>(source, sourceOffset + i, copyBits);
storeBits<uint8_t>(target, targetOffset + i, lastWord, copyBits);
}
}
};
Loading
Loading