Skip to content

Commit

Permalink
refactor: use simd bit packing for index
Browse files Browse the repository at this point in the history
  • Loading branch information
variar committed Nov 16, 2024
1 parent dbb79b3 commit 382ad68
Show file tree
Hide file tree
Showing 10 changed files with 212 additions and 784 deletions.
24 changes: 24 additions & 0 deletions 3rdparty/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,29 @@ if(KLOGG_USE_SENTRY)
endif()
endif(KLOGG_USE_SENTRY)

# Fetch the lemire/simdcomp sources through CPM, pinned to an exact commit
# hash so the build does not follow upstream changes.
# DOWNLOAD_ONLY YES: the sources are only downloaded; the library target is
# declared by hand below.
cpmaddpackage(
NAME
simdcomp
GITHUB_REPOSITORY
lemire/simdcomp
GIT_TAG
009c67807670d16f8984c0534aef0e630e5465a4
DOWNLOAD_ONLY
YES
)
# Declare the simdcomp target only when the CPM call above reports that the
# package was added.
if(simdcomp_ADDED)
# Static library built from the explicitly listed upstream C sources.
add_library(simdcomp STATIC
${simdcomp_SOURCE_DIR}/src/avxbitpacking.c
${simdcomp_SOURCE_DIR}/src/simdfor.c
${simdcomp_SOURCE_DIR}/src/simdcomputil.c
${simdcomp_SOURCE_DIR}/src/simdbitpacking.c
${simdcomp_SOURCE_DIR}/src/simdintegratedbitpacking.c
${simdcomp_SOURCE_DIR}/src/simdpackedsearch.c
${simdcomp_SOURCE_DIR}/src/simdpackedselect.c
)
# PUBLIC: targets that link against simdcomp also get its include directory.
target_include_directories(simdcomp PUBLIC ${simdcomp_SOURCE_DIR}/include)
endif()

set(klogg_cpm_targets
xxhash
Catch2
Expand All @@ -407,6 +430,7 @@ set(klogg_cpm_targets
crashpad_compat
crashpad_util
mini_chromium
simdcomp
)
foreach(target ${klogg_cpm_targets})
if(TARGET ${target})
Expand Down
3 changes: 1 addition & 2 deletions src/logdata/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
add_library(
klogg_logdata STATIC
${CMAKE_CURRENT_SOURCE_DIR}/include/abstractlogdata.h
${CMAKE_CURRENT_SOURCE_DIR}/include/blockpool.h
${CMAKE_CURRENT_SOURCE_DIR}/include/compressedlinestorage.h
${CMAKE_CURRENT_SOURCE_DIR}/include/encodingdetector.h
${CMAKE_CURRENT_SOURCE_DIR}/include/linepositionarray.h
Expand All @@ -16,7 +15,6 @@ add_library(
${CMAKE_CURRENT_SOURCE_DIR}/include/filedigest.h
${CMAKE_CURRENT_SOURCE_DIR}/include/readablesize.h
${CMAKE_CURRENT_SOURCE_DIR}/src/abstractlogdata.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/blockpool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/compressedlinestorage.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/encodingdetector.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/logdata.cpp
Expand Down Expand Up @@ -54,6 +52,7 @@ target_link_libraries(
kdtoolbox
robin_hood
simdutf
simdcomp
klogg_mimalloc_wrapper
)

Expand Down
100 changes: 0 additions & 100 deletions src/logdata/include/blockpool.h

This file was deleted.

162 changes: 32 additions & 130 deletions src/logdata/include/compressedlinestorage.h
Original file line number Diff line number Diff line change
@@ -1,22 +1,3 @@
/*
* Copyright (C) 2015 Nicolas Bonnefon and other contributors
*
* This file is part of glogg.
*
* glogg is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* glogg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with glogg. If not, see <http://www.gnu.org/licenses/>.
*/

/*
* Copyright (C) 2016 -- 2019 Anton Filimonov and other contributors
*
Expand All @@ -36,88 +17,36 @@
* along with klogg. If not, see <http://www.gnu.org/licenses/>.
*/

#include <array>
#include <cstddef>
#include <cstdint>
#include <vector>

#include "blockpool.h"
#include "linetypes.h"
#include <type_safe/strong_typedef.hpp>


// This class is a compressed storage backend for LinePositionArray
// It emulates the interface of a vector, but takes advantage of the nature
// of the stored data (increasing end of line addresses) to apply some
// compression in memory, while still providing fast, constant-time look-up.

/* The current algorithm takes advantage of the fact most lines are reasonably
* short, it codes each line on:
* - Line < 127 bytes : 1 byte
* - 127 < line < 16383 : 2 bytes
* - line > 16383 : 6 bytes (or 10 bytes)
* Uncompressed backend stores line on 4 bytes or 8 bytes.
*
* The algorithm is quite simple, the file is first divided in two parts:
* - The lines whose end are located before UINT32_MAX
* - The lines whose end are located after UINT32_MAX
* Those end of lines are stored separately in the table32 and the table64
* respectively.
*
* The EOL list is then divided in blocks of IndexBlockSize (256) lines.
* A block index vector (per table) contains pointers to each block.
*
* Each block is then defined as such:
* Block32 (sizes in byte)
* 00 - Absolute EOF address (4 bytes)
* 04 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative)
* - ( 10xx xxxx
* xxxx xxxx if second line is < 16383 ) (2 bytes, relative)
* - ( 1111 1111
* xxxx xxxx
* xxxx xxxx if second line is > 16383 ) (6 bytes, absolute)
* ...
* (126 more lines)
*
* Block64 (sizes in byte)
* 00 - Absolute EOF address (8 bytes)
* 08 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative)
* - ( 10xx xxxx
* xxxx xxxx if second line is < 16383 ) (2 bytes, relative)
* - ( 1111 1111
* xxxx xxxx
* xxxx xxxx
* xxxx xxxx
* xxxx xxxx if second line is > 16383 ) (10 bytes, absolute)
* ...
* (126 more lines)
*
* Absolute addressing has been adopted for line > 16383 to bound memory usage in case
* of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for
* long-ish (30 KB) lines.
*
* The table32 always starts at 0, the table64 starts at first_long_line_
*/

#ifndef COMPRESSEDLINESTORAGE_H
#define COMPRESSEDLINESTORAGE_H
#ifndef SIMDCOMPRESSEDLINESTORAGE_H
#define SIMDCOMPRESSEDLINESTORAGE_H

class CompressedLinePositionStorage {
public:
// Default constructor
CompressedLinePositionStorage()
: block_index_{ 0 }
, long_block_index_{ 0 }
{
}
public:
CompressedLinePositionStorage();

// Copy constructor would be slow, delete!
CompressedLinePositionStorage( const CompressedLinePositionStorage& orig ) = delete;
CompressedLinePositionStorage& operator=( const CompressedLinePositionStorage& orig ) = delete;
CompressedLinePositionStorage& operator=( const CompressedLinePositionStorage& orig )
= delete;

// Move constructor
CompressedLinePositionStorage( CompressedLinePositionStorage&& orig ) noexcept;
// Move assignment
CompressedLinePositionStorage& operator=( CompressedLinePositionStorage&& orig ) noexcept;
CompressedLinePositionStorage&
operator=( CompressedLinePositionStorage&& orig ) noexcept;

~CompressedLinePositionStorage() = default;

// Append the passed end-of-line to the storage
void append( OffsetInFile pos );
Expand All @@ -129,76 +58,49 @@ class CompressedLinePositionStorage {
// Size of the array
LinesCount size() const
{
return nb_lines_;
return nbLines_;
}

size_t allocatedSize() const;

struct BlockOffset
: type_safe::strong_typedef<BlockOffset, size_t>
, type_safe::strong_typedef_op::increment<BlockOffset>
, type_safe::strong_typedef_op::addition<BlockOffset>
, type_safe::strong_typedef_op::relational_comparison<BlockOffset>
, type_safe::strong_typedef_op::equality_comparison<BlockOffset>
, type_safe::strong_typedef_op::explicit_bool<BlockOffset>
{
using strong_typedef::strong_typedef;
};

// Cache the last position read
// This is to speed up consecutive reads (whole page)
struct Cache {
LineNumber index {std::numeric_limits<LineNumber::UnderlyingType>::max() - 1U};
OffsetInFile position {0};
BlockOffset offset {0};
};

// Element at index
OffsetInFile at( size_t i, Cache* lastPosition = nullptr ) const
OffsetInFile at( size_t i ) const
{
return at( LineNumber( i ), lastPosition );
return at( LineNumber( i ) );
}
OffsetInFile at( LineNumber i, Cache* lastPosition = nullptr ) const;
OffsetInFile at( LineNumber i ) const;

// Add one list to the other
void append_list( const klogg::vector<OffsetInFile>& positions );

// Pop the last element of the storage
void pop_back();

private:
private:
// Utility for move ctor/assign
void move_from( CompressedLinePositionStorage&& orig ) noexcept;

// The two indexes
BlockPool<uint32_t> pool32_;
BlockPool<OffsetInFile::UnderlyingType> pool64_;

// Total number of lines in storage
LinesCount nb_lines_;

// Current position (position of the end of the last line added)
OffsetInFile current_pos_;
void compress_current_block();
void uncompress_last_block();
struct BlockMetadata {
OffsetInFile firstLineOffset{};
uint8_t packetBitWidth{};
size_t packetStorageOffset{};
};

uint32_t block_index_;
uint32_t long_block_index_;
klogg::vector<BlockMetadata> blocks_;
klogg::vector<uint8_t> packedLinesStorage_;

// The index of the first line whose end is stored in a block64
// this is the origin point for all calculations in block64
OptionalLineNumber first_long_line_;
klogg::vector<OffsetInFile> currentLinesBlock_;
klogg::vector<uint32_t> currentLinesBlockShifted_;

// Offset of the next position (not yet written) within the current
// block. null means there is no current block (previous block
// finished or no data)
BlockOffset block_offset_;
// Total number of lines in storage
LinesCount nbLines_;

// For pop_back:
// Current position (position of the end of the last line added)
OffsetInFile lastPos_;

// Previous offset to block element, it is restored when we
// "pop_back" the last element.
// A null here means pop_back need to free the block
// that has just been created.
BlockOffset previous_block_offset_;
bool canUseSimdSelect_ {false};
};

#endif
Loading

0 comments on commit 382ad68

Please sign in to comment.