From 614077fbbc53b6606b4b162ab1e7185270ebe65f Mon Sep 17 00:00:00 2001
From: ZiyanShi
Date: Mon, 5 Sep 2022 04:06:34 -0400
Subject: [PATCH 01/23] temp

Signed-off-by: ZiyanShi
---
 CMakeLists.txt | 5 +-
 engine/allocator.hpp | 13 +
 engine/experimental/hashptr_map.hpp | 1379 ++++++++++++++++++++++
 engine/experimental/vhash.cpp | 130 +++
 engine/experimental/vhash.hpp | 69 ++
 engine/experimental/vhash_group.hpp | 42 +
 engine/experimental/vhash_kv.cpp | 26 +
 engine/experimental/vhash_kv.hpp | 70 ++
 engine/kv_engine.hpp | 17 +
 engine/kv_engine_vhash.cpp | 38 +
 engine/macros.hpp | 16 +
 engine/version/old_records_cleaner.cpp | 43 +
 engine/version/old_records_cleaner.hpp | 14 +
 examples/kvredis/redis | 1 +
 include/kvdk/engine.hpp | 12 +
 include/kvdk/iterator.hpp | 15 +
 16 files changed, 1889 insertions(+), 1 deletion(-)
 create mode 100644 engine/experimental/hashptr_map.hpp
 create mode 100644 engine/experimental/vhash.cpp
 create mode 100644 engine/experimental/vhash.hpp
 create mode 100644 engine/experimental/vhash_group.hpp
 create mode 100644 engine/experimental/vhash_kv.cpp
 create mode 100644 engine/experimental/vhash_kv.hpp
 create mode 100644 engine/kv_engine_vhash.cpp
 create mode 160000 examples/kvredis/redis

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3b5a73ab..464cac44 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,7 +12,7 @@ set(CMAKE_CXX_STANDARD 11)
 
 option(COVERAGE "code coverage" OFF)
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mrdseed -mrdrnd -mclwb -mclflushopt")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mrdseed -mrdrnd -mclwb -mclflushopt -mlzcnt -mbmi -mavx512bw -mavx512vl")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
 
 if (CMAKE_BUILD_TYPE STREQUAL "Release")
@@ -52,6 +52,8 @@ set(SOURCES
 engine/c/kvdk_list.cpp
 engine/c/kvdk_sorted.cpp
 engine/c/kvdk_string.cpp
+ engine/experimental/vhash.cpp
+ engine/experimental/vhash_kv.cpp
 engine/utils/utils.cpp
 engine/utils/sync_point.cpp
 engine/engine.cpp
@@ -61,6 +63,7 @@ set(SOURCES
 engine/kv_engine_list.cpp
 engine/kv_engine_sorted.cpp
 engine/kv_engine_string.cpp
+ engine/kv_engine_vhash.cpp
 engine/logger.cpp
 engine/hash_table.cpp
 engine/sorted_collection/skiplist.cpp
diff --git a/engine/allocator.hpp b/engine/allocator.hpp
index fdef669d..16553663 100644
--- a/engine/allocator.hpp
+++ b/engine/allocator.hpp
@@ -28,4 +28,19 @@ class Allocator {
   virtual SpaceEntry Allocate(uint64_t size) = 0;
   virtual void Free(const SpaceEntry& entry) = 0;
 };
+
+class IVolatileAllocator {
+ public:
+  // No throw, It's up to caller to check for nullptr.
+  virtual void* Allocate(size_t bytes) = 0;
+  virtual void Deallocate(void* addr, size_t bytes) = 0;
+  // Virtual dtor so derived allocators can be deleted via base pointer.
+  virtual ~IVolatileAllocator() = default;
+};
+
+class CharAllocator final : public IVolatileAllocator {
+  void* Allocate(size_t n) final { return ::malloc(n); }
+  void Deallocate(void* addr, size_t) final { ::free(addr); }
+};
+
 } // namespace KVDK_NAMESPACE
\ No newline at end of file
diff --git a/engine/experimental/hashptr_map.hpp b/engine/experimental/hashptr_map.hpp
new file mode 100644
index 00000000..ec74d3cf
--- /dev/null
+++ b/engine/experimental/hashptr_map.hpp
@@ -0,0 +1,1379 @@
+#pragma once
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "../macros.hpp"
+#define dynamic_assert kvdk_assert
+
+namespace KVDK_NAMESPACE
+{
+
+// Internal helpers
+namespace
+{
+// Templates to convert between a pointer and its compact representation
+// If we use a cache-backed pool allocator, or an allocator with reference count,
+// which defines its own pointer class, we need these converters
+// to convert the pointer and std::uint64_t.
+template::pointer> +struct cvt_helper : public std::enable_if< + std::is_convertible().to_pointer(std::declval())), Pointer>::value && + std::is_convertible().to_rep(std::declval())), std::uint64_t>::value, T>::type {}; + +template::pointer, typename = void> +Pointer to_pointer(std::uint64_t rep, Alloc const&) +{ + return reinterpret_cast(rep); +} + +template::pointer, typename = void> +std::uint64_t to_rep(Pointer ptr, Alloc const&) +{ + return reinterpret_cast(ptr); +} + +// template::pointer> +// auto to_pointer(std::uint64_t rep, Alloc const& alloc) -> typename cvt_helper::type +// { +// return alloc.to_pointer(rep); +// } + +// template::pointer> +// auto to_rep(Pointer const& ptr, Alloc const& alloc) -> typename cvt_helper::type +// { +// return alloc.to_rep(ptr); +// } + +std::uint64_t reverse_bits(std::uint64_t u64) +{ + u64 = (u64 & 0xFFFFFFFF00000000) >> 32 | (u64 & 0x00000000FFFFFFFF) << 32; + u64 = (u64 & 0xFFFF0000FFFF0000) >> 16 | (u64 & 0x0000FFFF0000FFFF) << 16; + u64 = (u64 & 0xFF00FF00FF00FF00) >> 8 | (u64 & 0x00FF00FF00FF00FF) << 8; + u64 = (u64 & 0xF0F0F0F0F0F0F0F0) >> 4 | (u64 & 0x0F0F0F0F0F0F0F0F) << 4; + u64 = (u64 & 0xCCCCCCCCCCCCCCCC) >> 2 | (u64 & 0x3333333333333333) << 2; + u64 = (u64 & 0xAAAAAAAAAAAAAAAA) >> 1 | (u64 & 0x5555555555555555) << 1; + return u64; +} + +void mm_pause() +{ + constexpr size_t NPause = 32; + for (size_t i = 0; i < NPause; i++) + { + _mm_pause(); + } +} + +template +struct maybe_add_pointer + : public std::conditional::value, typename std::add_pointer::type, Func> +{ +}; + +} // namespace + +// A hashptr_map consists of multiple buckets. +// Each Bucket is a single linked list of node +// Each node consists of 1 control_field and field_cnt - 1 storage_field. 
+template, typename KeyEqual = std::equal_to, size_t field_cnt = 8, size_t embed_cnt = 4, size_t max_scale = 32, bool PML5 = false, typename Allocator = std::allocator> +class hashptr_map +{ + private: + static_assert(field_cnt == 8 || field_cnt == 16, ""); + static_assert(((embed_cnt - 1) & embed_cnt) == 0, "embed_cnt must be power of 2"); + static_assert(max_scale <= 64, ""); + + using Hash = std::uint64_t; + static_assert(std::is_convertible()(std::declval())), Hash>::value, ""); + + using field_rep = std::uint64_t; + using control_rep = field_rep; + using storage_rep = field_rep; + using hash_rep = std::uint64_t; + using tag_type = typename std::conditional::type; + using mask_type = std::uint16_t; + + class node; + using node_alloc = typename std::allocator_traits::template rebind_alloc; + using node_alloc_traits = typename std::allocator_traits::template rebind_traits; + using node_pointer = typename node_alloc_traits::pointer; + + // constants for bit manipulation in nodes + // A node consists of multiple 64-bit fields, one control field and several storage fields. + // The control field holds the pointer to next node, and bits for lock and rehash. + // A storage field holds a user pointer to some data, a tag extracted from hash value and a bit for rehash. + static_assert(sizeof(field_rep) == sizeof(void*), "64-bit system required"); + static_assert(sizeof(Hash) == 8, "64-bit hash required"); + + // Actual bits used by a pointer in a 4-level or 5-level paging system + static constexpr size_t pointer_bits = PML5 ? 56 : 48; + + // Bits [63:56]/[63:48] are reused by a node to store meta information + static constexpr field_rep meta_mask = ((~0UL >> pointer_bits) << pointer_bits); + + // Bits [55:0]/[47:0] are untouched + static constexpr field_rep ptr_mask = ~meta_mask; + + // Bit 63 is used as a mark for rehash + static constexpr field_rep mark_bit = (0x1UL << 63); + + // For the storage unit, + // bits [62:56]/[62:48] are used to store a tag. 
+ // The tag consists high 7/15 bits inside a hash value + static constexpr storage_rep tag_mask = (meta_mask & ~mark_bit); + + // For the control unit of a node, + // bits [62:56]/[62:48] are used as a read-write lock. + static constexpr control_rep lock_mask = (meta_mask & ~mark_bit); + static constexpr control_rep slock_val = (0x1UL << pointer_bits); + static constexpr control_rep elock_bit = (0x1UL << 62); + + class tagged_pointer + { + private: + storage_rep u64{0}; + + public: + explicit tagged_pointer(nullptr_t) {} + + explicit tagged_pointer(tag_type tag, Pointer p) : u64{reinterpret_cast(p)} + { + dynamic_assert(!(u64 & meta_mask), "Upper bits in Pointer not zeros!"); + u64 |= (static_cast(tag) << pointer_bits); + dynamic_assert(!(u64 & mark_bit), "Invalid tag!"); + } + + explicit tagged_pointer(storage_rep rep) : u64{rep} + { + u64 &= ~mark_bit; + } + + tagged_pointer(tagged_pointer const&) = default; + tagged_pointer(tagged_pointer&&) = default; + + tag_type tag() const + { + return static_cast((u64 & tag_mask) >> pointer_bits); + } + + Pointer pointer() const + { + return reinterpret_cast(u64 & ptr_mask); + } + + storage_rep rep() const + { + return u64; + } + }; + + class alignas(64) node + { + private: + std::atomic meta{}; + std::array, field_cnt - 1> data{}; + + public: + node() : node{false} {} + + explicit node(bool mark) + { + if (!mark) return; + meta_mark_flip(); + for (size_t i = 0; i < data.size(); i++) + entry_mark_flip(i); + } + + // Access Entrys by indexing + void set_entry(size_t index, tagged_pointer tp) + { + dynamic_assert((data.at(index).load() & ~mark_bit) == 0, "Entry already set!"); + + storage_rep rep = tp.rep(); + rep |= data.at(index).load() & mark_bit; + data.at(index).store(rep); + } + + void set_entry(size_t index, tag_type tag, Pointer p) + { + set_entry(index, tagged_pointer{tag, p}); + } + + void set_entry(size_t index, Pointer p) + { + dynamic_assert((data.at(index).load() & ptr_mask) != 0, "Entry not set yet!"); + 
storage_rep rep = tagged_pointer{0, p}.rep(); + rep |= data.at(index).load() & meta_mask; + data.at(index).store(rep); + } + + void erase_entry(size_t index) + { + dynamic_assert((data.at(index).load() & ptr_mask) != 0, "Entry not set yet!"); + storage_rep rep = tagged_pointer{nullptr}.rep(); + rep |= data.at(index).load() & mark_bit; + data.at(index).store(rep); + } + + tagged_pointer load_entry(size_t index) + { + return tagged_pointer{data.at(index).load()}; + } + + void entry_mark_flip(size_t index) + { + data.at(index).fetch_xor(mark_bit); + } + + bool entry_mark(size_t index) + { + return data.at(index).load() & mark_bit; + } + + // Every node can be locked, + // but only the first node in a bucket is locked + // for exclusive/shared access of the bucket + bool try_lock() + { + control_rep old = meta.load(); + if ((old & elock_bit) || !meta.compare_exchange_strong(old, old | elock_bit)) + return false; + return true; + } + + void lock() + { + while (!try_lock()) + { + mm_pause(); + } + } + + void unlock() + { + dynamic_assert((meta.load() & lock_mask) == elock_bit, "Unlock a lock not locked yet!"); + meta.fetch_xor(elock_bit); + } + + bool try_lock_shared() + { + field_rep old = meta.load(); + if ((old & elock_bit) || (old & lock_mask) + slock_val >= mark_bit || !meta.compare_exchange_strong(old, old + slock_val)) + { + return false; + } + return true; + } + + void lock_shared() + { + while (!try_lock_shared()) + { + mm_pause(); + } + } + + void unlock_shared() + { + dynamic_assert((meta.load() & lock_mask) != 0 && !(meta.load() & elock_bit), "Unlock a lock yet locked!"); + meta.fetch_sub(slock_val); + } + + void meta_mark_flip() + { + meta.fetch_xor(mark_bit); + } + + bool meta_mark() + { + return meta.load() & mark_bit; + } + + node_pointer next_node(node_alloc const& alloc) const + { + control_rep u64 = meta.load(); + u64 &= ptr_mask; + return to_pointer(u64, alloc); + } + + node_pointer append_new_node(node_alloc& alloc) + { + node_pointer p_new_node = 
node_alloc_traits::allocate(alloc, 1); + node_alloc_traits::construct(alloc, p_new_node, meta_mark()); + control_rep rep = to_rep(p_new_node, alloc); + + dynamic_assert(next_node(alloc) == nullptr, "Not last node!"); + dynamic_assert(!(rep & meta_mask), ""); + + meta.fetch_or(rep); + return p_new_node; + } + + static void remove_appended_nodes(node_pointer node, node_alloc& alloc) + { + node_pointer next = node->next_node(alloc); + if (next == nullptr) return; + + remove_appended_nodes(next, alloc); + node->meta.fetch_and(meta_mask); + node_alloc_traits::destroy(alloc, next); + node_alloc_traits::deallocate(alloc, next, 1); + } + + // Unlink, destroy and deallocate next node if it is empty + static bool remove_empty_nodes(node_pointer node, node_alloc& alloc) + { + node_pointer next = node->next_node(alloc); + + if (next == nullptr) return true; + if (!remove_empty_nodes(next, alloc)) return false; + + for (size_t i = 0; i < field_cnt - 1; i++) + if (next->load_entry(i).pointer() != nullptr) + return false; + + remove_appended_nodes(node, alloc); + return true; + } + + template::type = true> + mask_type match_tag_impl(tag_type tag) const + { + static_assert(sizeof(node) / sizeof(__m512i) == 1 || sizeof(node) / sizeof(__m512i) == 2, ""); + union { + __m128i m128; + std::array mask{}; + } results{}; + + // 0b1000 0000 + constexpr std::uint64_t high_mask = 0x8080808080808080; + // 0b0100 0000 + constexpr std::uint64_t low_mask = 0x4040404040404040; + __m512i low_tag = _mm512_set1_epi8(static_cast(tag)); + __m512i high_tag = _mm512_set1_epi8(static_cast(tag >> 8)); + __m512i high_tag2 = _mm512_set1_epi8(static_cast(tag >> 8) | (mark_bit >> 56)); + + { + __m512i m512_0 = _mm512_load_si512(&meta); + results.mask[0] = (_mm512_mask_cmpeq_epi8_mask(low_mask, low_tag, m512_0) | + _mm512_mask_cmpeq_epi8_mask(high_mask, high_tag, m512_0) | + _mm512_mask_cmpeq_epi8_mask(high_mask, high_tag2, m512_0)); + } + if (sizeof(node) / sizeof(__m512i) == 2) + { + __m512i m512_1 = 
_mm512_load_si512(&data[sizeof(__m512i) / sizeof(Pointer) - 1]); + results.mask[1] = (_mm512_mask_cmpeq_epi8_mask(low_mask, low_tag, m512_1) | + _mm512_mask_cmpeq_epi8_mask(high_mask, high_tag, m512_1) | + _mm512_mask_cmpeq_epi8_mask(high_mask, high_tag2, m512_1)); + } + // 0b1100 0000 + constexpr char compress_mask = static_cast(0xC0); + mask_type compressed = _mm_cmpeq_epi8_mask(results.m128, _mm_set1_epi8(compress_mask)); + + return (compressed & match_nonempty()); + } + + template::type = true> + mask_type match_tag_impl(tag_type tag) const + { + /// TODO: test this on a machine with real five-level paging + static_assert(sizeof(node) / sizeof(__m512i) == 1 || sizeof(node) / sizeof(__m512i) == 2, ""); + union { + __m128i m128; + std::array mask{}; + } results{}; + + // 0b1000 0000 + constexpr std::uint64_t mask = 0x8080808080808080; + __m512i my_tag = _mm512_set1_epi8(static_cast(tag)); + __m512i my_tag2 = _mm512_set1_epi8(static_cast(tag) | mark_bit); + { + __m512i m512_0 = _mm512_load_si512(&meta); + results.mask[0] = _mm512_mask_cmpeq_epi8_mask(mask, my_tag, m512_0) | + _mm512_mask_cmpeq_epi8_mask(mask, my_tag2, m512_0); + } + if (sizeof(node) / sizeof(__m512i) == 2) + { + __m512i m512_1 = _mm512_load_si512(&data[sizeof(__m512i) / sizeof(Pointer) - 1]); + results.mask[1] = _mm512_mask_cmpeq_epi8_mask(mask, my_tag, m512_1) | + _mm512_mask_cmpeq_epi8_mask(mask, my_tag2, m512_1); + } + // 0b1000 0000 + constexpr char compress_mask = static_cast(0xB0); + mask_type compressed = _mm_cmpeq_epi8_mask(results.m128, _mm_set1_epi8(compress_mask)); + + return (compressed & match_nonempty()); + } + + mask_type match_tag(tag_type tag) const + { + return match_tag_impl(tag); + } + + mask_type match_empty() const + { + static_assert(sizeof(node) / sizeof(__m512i) == 1 || sizeof(node) / sizeof(__m512i) == 2, ""); + std::array empty{}; + + __m512i m512_0 = _mm512_load_si512(&meta); + empty[0] = _mm512_cmpeq_epu64_mask(_mm512_set1_epi64(0L), m512_0) | + 
_mm512_cmpeq_epu64_mask(_mm512_set1_epi64(mark_bit), m512_0); + if (sizeof(node) / sizeof(__m512i) == 2) + { + __m512i m512_1 = _mm512_load_si512(&data[sizeof(__m512i) / sizeof(Pointer) - 1]); + empty[1] = _mm512_cmpeq_epu64_mask(_mm512_set1_epi64(0L), m512_1) | + _mm512_cmpeq_epu64_mask(_mm512_set1_epi64(mark_bit), m512_1); + } + + return ((empty[0] | (empty[1] << 8)) & 0xFFFE); + } + + mask_type match_nonempty() const + { + return (~match_empty() & 0xFFFE); + } + + static size_t consume_mask(mask_type& match_result) + { + size_t tz = __tzcnt_u16(match_result); + match_result ^= (0x0001 << tz); + return tz - 1; + } + + // for debugging + friend std::ostream& operator<<(std::ostream& out, node const& node) + { + out << std::bitset<64>(node.meta.load()) << "\n"; + for (size_t i = 0; i < field_cnt - 1; i++) + { + out << std::bitset(node.data[i].tag()) << "\t" << node.data[i].pointer() << "\n"; + } + return out; + } + + private: + /// TODO: use this helper function to improve readibility + static char match_epi8_in_epi64(__m512i m512, std::uint8_t u8, size_t bias) + { + dynamic_assert(bias <= 7, ""); + union + { + __m128i m128; + std::array u64; + } result; + long long mask = (0x0101010101010101 << bias); + result.u64[0] = _mm512_mask_cmpeq_epi8_mask(mask, m512, _mm512_set1_epi8(u8)); + return _mm_cmpgt_epi8_mask(result.m128, _mm_set1_epi8(0)); + } + }; + + static_assert(sizeof(node) == field_cnt * sizeof(void*), ""); + static_assert(sizeof(node) % 64 == 0, ""); + static_assert(alignof(node) % 64 == 0, ""); + + struct entry_pointer + { + node_pointer np{nullptr}; + size_t idx{field_cnt-1}; + + entry_pointer() = default; + entry_pointer(node_pointer p, size_t i) : np{p}, idx{i} {} + static entry_pointer next(entry_pointer ep, node_alloc const& a) + { + dynamic_assert(ep.np != nullptr && ep.idx < field_cnt - 1, ""); + ++ep.idx; + if (ep.idx < field_cnt - 1) return ep; + ep.np = ep.np->next_node(a); + ep.idx = (ep.np != nullptr) ? 
0 : field_cnt - 1; + return ep; + } + }; + + struct bucket_tuple + { + std::uint64_t old_svar; + size_t b_cnt; + node_pointer bucket1; + node_pointer bucket2; + node_pointer my_bucket; + + std::unique_lock guard1; + std::unique_lock guard2; + + bucket_tuple() = default; + bucket_tuple(std::uint64_t s, size_t cnt, node_pointer b1, node_pointer b2, node_pointer mb, std::unique_lock&& g1 = std::unique_lock{}, std::unique_lock&& g2 = std::unique_lock{}) : + old_svar{s}, + b_cnt{cnt}, + bucket1{b1}, + bucket2{b2}, + my_bucket{mb}, + guard1{std::move(g1)}, + guard2{std::move(g2)} {} + bucket_tuple(bucket_tuple const&) = delete; + bucket_tuple(bucket_tuple&&) = default; + bucket_tuple& operator=(bucket_tuple const&) = delete; + bucket_tuple& operator=(bucket_tuple&&) = default; + }; + + class accessor + { + private: + friend hashptr_map; + friend class iterator; + + // help end relocation + hashptr_map* map; + + // buckets that the key may fall in + bucket_tuple tuple; + + // Location of looked-up key + entry_pointer loc{}; + + public: + explicit accessor(hashptr_map& m) : map{&m} {} + accessor(accessor const&) = delete; + accessor& operator=(accessor const&) = delete; + accessor(accessor&&) = default; + accessor& operator=(accessor&&) = default; + ~accessor() + { + { + std::unique_lock g1{}; + std::unique_lock g2{}; + tuple.guard2.swap(g2); + tuple.guard1.swap(g1); + } + map->help_end_relocation(); + } + + Pointer pointer() + { + return (loc.np == nullptr) ? nullptr : loc.np->load_entry(loc.idx).pointer(); + } + + void set_pointer(Pointer p) + { + if (loc.np == nullptr) + { + // Since relocation may have been done by lookup(), + // we must guarantee Pointer p is inserted to the correct bucket. 
+ Hash h = map->hasher(map->extract(p)); + loc = map->allocate_empty_entry(tuple.my_bucket); + loc.np->set_entry(loc.idx, map->make_tag(h), p); + } + else + { + Pointer old_p = loc.np->load_entry(loc.idx).pointer(); + dynamic_assert(map->equal(map->extract(p), map->extract(old_p)), ""); + loc.np->set_entry(loc.idx, p); + } + } + + void erase() + { + if (loc.np != nullptr) loc.np->erase_entry(loc.idx); + } + + private: + explicit accessor(hashptr_map& m, bucket_tuple&& t, entry_pointer l) : + map{&m}, + tuple{std::move(t)}, + loc{l} {} + }; + + public: + class lockless_t {}; + static constexpr lockless_t lockless{}; + class acquire_lock_t {}; + static constexpr acquire_lock_t acquire_lock{}; + + // We iterate through hashmap by + class iterator + { + public: + iterator() = delete; + iterator(iterator const&) = delete; + iterator& operator=(iterator const&) = delete; + iterator(iterator&&) = default; + iterator& operator=(iterator&&) = default; + + iterator& operator++() + { + acc.loc = entry_pointer::next(acc.loc, acc.map->alloc); + while (true) + { + acc.loc = skip_visited(acc.loc); + // Found unvisited entry + if (acc.loc.np != nullptr) return *this; + if (acc.tuple.my_bucket == acc.tuple.bucket1) + { + acc.tuple.my_bucket = acc.tuple.bucket2; + acc.loc = skip_visited(entry_pointer{acc.tuple.my_bucket, 0}); + if (acc.loc.np != nullptr) return *this; + } + // All entries visited, reset lock and goto next bucket + inc_hash(); + acc.tuple = bucket_tuple{}; + // inc_hash() overflow, end of iteration + if (hash == Hash{}) break; + + acc.tuple = acc.map->load_and_lock_buckets(hash); + acc.tuple.my_bucket = acc.tuple.bucket1; + acc.loc = entry_pointer{acc.tuple.my_bucket, 0}; + } + *this = iterator{*acc.map}; + return *this; + } + + // No copy because iterator holds locks. 
+ iterator operator++(int) = delete; + + // Only two end iterators are equal + // Two iterators cannot be equal as they acquire locks on buckets + bool operator==(iterator const& rhs) const + { + return (acc.loc.np == nullptr) && (rhs.acc.loc.np == nullptr); + } + + bool operator!=(iterator const& rhs) const + { + return !operator==(rhs); + } + + accessor& operator*() + { + return acc; + } + + accessor* operator->() + { + return &acc; + } + + private: + friend hashptr_map; + + accessor acc; + Hash hash; + Hash rev_hash; + Hash rev_lim; + + // End iterator + iterator(hashptr_map& map) : acc{map}, hash{}, rev_hash{} {} + + iterator(hashptr_map& map, Hash h, Hash lim = -1UL) : acc{map}, hash{h}, rev_hash{reverse_bits(h)}, rev_lim{reverse_bits(lim)} + { + while (true) + { + acc.tuple = acc.map->load_and_lock_buckets(hash); + acc.tuple.my_bucket = acc.tuple.bucket1; + acc.loc = skip_visited(entry_pointer{acc.tuple.my_bucket, 0}); + // Found unvisited entry in bucket1 + if (acc.loc.np != nullptr) return; + acc.tuple.my_bucket = acc.tuple.bucket2; + acc.loc = skip_visited(entry_pointer{acc.tuple.my_bucket, 0}); + // Found unvisited entry in bucket2 + if (acc.loc.np != nullptr) return; + + // All entries visited, reset lock and goto next bucket + inc_hash(); + acc.tuple = bucket_tuple{}; + // inc_hash() overflow, end of iteration + if (hash == Hash{}) break; + } + *this = iterator{map}; + } + + // For 2^N buckets, low N bits determines the bucket + // To increment the bucket index, + // we need to increment the (N-1)th highest bit in reversed hash, + // and clear all lower bits. + // Must be called when iterator holds a valid accessor! 
+ void inc_hash() + { + std::uint64_t v = reverse_bits(acc.tuple.b_cnt >> 1); + rev_hash &= ~(v - 1); + rev_hash += v; + hash = reverse_bits(rev_hash); + } + + /// TODO: examine if we may miss some entries + bool visited(entry_pointer pos) + { + dynamic_assert(pos.np != nullptr, ""); + Pointer ptr = pos.np->load_entry(pos.idx).pointer(); + Hash h = acc.map->hasher(acc.map->extract(ptr)); + h = reverse_bits(h); + return (h < rev_hash) || (rev_lim < h); + } + + // Seek first non-empty entry in the bucket from pos. + entry_pointer skip_empty(entry_pointer pos) + { + if (pos.np == nullptr) return entry_pointer{}; + if (pos.np->load_entry(pos.idx).pointer() != nullptr) return pos; + pos = entry_pointer::next(pos, acc.map->alloc); + return skip_empty(pos); + } + + // Seek first non-empty entry not visited yet in the bucket from pos. + entry_pointer skip_visited(entry_pointer pos) + { + pos = skip_empty(pos); + if (pos.np == nullptr) return entry_pointer{}; + if (!visited(pos)) return pos; + return skip_visited(entry_pointer::next(pos, acc.map->alloc)); + } + }; + + hashptr_map(size_t buckets, KeyExtract const& ext, HashFunc const& hash = HashFunc{}, KeyEqual const& eq = KeyEqual{}, Allocator const& a = Allocator{}) : + alloc{a}, hasher{hash}, equal{eq}, extract{ext} + { + node_blocks[0] = &embedded_block[0]; + size_t b_cnt = bucket_count(); + while (b_cnt < buckets) + { + node_pointer new_block = node_alloc_traits::allocate(alloc, b_cnt); + for (size_t i = 0; i < b_cnt; i++) + node_alloc_traits::construct(alloc, new_block + i, false); + node_blocks[block_index(b_cnt)] = new_block; + active_blocks.fetch_add(active_blocks.load() + 1); + b_cnt = bucket_count(); + } + } + + ~hashptr_map() + { + for (size_t i = 1; i < node_blocks.size(); i++) + { + node_pointer old_block = node_blocks[i]; + if (old_block == nullptr) continue; + size_t cnt = (embed_cnt << i) >> 1; + for (size_t i = 0; i < cnt; i++) + { + node_pointer np = old_block + i; + node::remove_appended_nodes(np, 
alloc); + node_alloc_traits::destroy(alloc, np); + } + node_alloc_traits::deallocate(alloc, old_block, cnt); + } + for (size_t i = 0; i < embedded_block.size(); i++) + node::remove_appended_nodes(&embedded_block.at(i), alloc); + } + + Pointer lookup(Key const& key, lockless_t) const + { + Hash h = hasher(key); + tag_type tag = make_tag(h); + while (true) + { + Pointer p = nullptr; + bucket_tuple tuple = load_buckets(h); + if (!is_relocating(tuple.old_svar)) + { + p = search_backward(key, tag, tuple.bucket1); + } + else + { + // Relocation from bucket1 to bucket2 + using std::swap; + if (is_shrk_proc(tuple.old_svar)) swap(tuple.bucket1, tuple.bucket2); + + // Look up the moved-from bucket first, then moved-to bucket. + p = search_backward(key, tag, tuple.bucket1); + p = (p != nullptr) ? p : search_backward(key, tag, tuple.bucket2); + } + + // svar not incremented, lookup result is valid, return. + if (tuple.old_svar == svar.load()) return p; + } + } + + // deref_lim should be larger than 1. + // 1 is merely enough to maintain load factor at current level + // if we keep inserting and try_double_capacity(). + accessor lookup(Key const& key, acquire_lock_t, size_t deref_lim = 4) + { + Hash h = hasher(key); + bucket_tuple tuple = load_and_lock_buckets(h); + + // Help relocation + if (is_relocating(tuple.old_svar)) + { + if (tuple.bucket1->meta_mark() != mark_of(tuple.old_svar)) + deref_lim -= is_exp_proc(tuple.old_svar) + ? relocate_bucket(tuple.bucket2, tuple.bucket1, deref_lim) + : relocate_bucket(tuple.bucket1, tuple.bucket2, deref_lim); + relocate_global(deref_lim, tuple.old_svar); + } + + // Actual lookup, done after relocation to prevent loc from + // being invalidated. 
+ tag_type tag = make_tag(h); + entry_pointer loc{}; + if (!is_relocating(tuple.old_svar)) + { + loc = search_forward(key, tag, tuple.bucket1); + } + else + { + loc = search_forward(key, tag, tuple.bucket1); + if (loc.np == nullptr) + loc = search_forward(key, tag, tuple.bucket2); + } + return accessor{*this, std::move(tuple), loc}; + } + + iterator begin() + { + return iterator{*this, 0}; + } + + iterator end() + { + return iterator{*this}; + } + + // Estimate load factor by checking embedded nodes + double approx_load_factor() const + { + size_t cnt = 0; + for (size_t i = 0; i < embed_cnt; i++) + cnt += entry_count(const_cast(&embedded_block[i])); + return static_cast(cnt) / ((field_cnt - 1) * embed_cnt); + } + + // Triggers rehash by write threads. + // Return false if rehash is under progress. + bool try_double_capacity() + { + std::uint64_t old_svar = svar.load(); + if (old_svar % 8 != 0 || !svar.compare_exchange_strong(old_svar, old_svar + 1)) + return false; + + old_svar = svar.load(); + size_t cnt = bucket_count(); + node_pointer new_block = node_alloc_traits::allocate(alloc, cnt); + for (size_t i = 0; i < cnt; i++) + { + // Newly allocated buckets do not need relocation. + // For convenience, the new bucket has same mark as its conjugated bucket, + // the mark is cleared after scan + node_alloc_traits::construct(alloc, new_block + i, !mark_of(old_svar)); + } + // activate new block + node_blocks[block_index(cnt)] = new_block; + dynamic_assert(svar.load() % 4 == 1, ""); + pvar.store(0UL); + svar.fetch_add(1UL); + return true; + } + + // Triggers rehash by write threads. + // Return false if rehash is under progress. + bool try_halve_capacity() + { + std::uint64_t old_svar = svar.load(); + if (old_svar % 8 != 0 || !svar.compare_exchange_strong(old_svar, old_svar + 5)) + { + return false; + } + if (active_blocks.load() == 0x0001UL) + { + // Cannot shrink embedded block + svar.fetch_add(3U); + return false; + } + // The last block is logically deactivated. 
+ // A bucket pair forms a logical bucket internally. + active_blocks.fetch_sub((active_blocks.load() + 1) >> 1); + dynamic_assert(svar.load() % 4 == 1, ""); + pvar.store(0UL); + svar.fetch_add(1); + return true; + } + + size_t help_relocate() + { + size_t old_svar = svar.load(); + if (!is_relocating(old_svar)) return 0; + + size_t n = relocate_global(-1UL, svar.load()); + help_end_relocation(); + return n; + } + + // Logical count of buckets + // During relocation, a pair of conjugated buckets are considered as + // a logical bucket. + // After shrinkage, the higher half of buckets are removed. + // After expansion, the pair is seperated. + size_t bucket_count() const + { + std::uint64_t active = active_blocks.load(); + return embed_cnt * (active + 1) / 2; + } + + private: + static tag_type make_tag(Hash h) + { + return (h >> (pointer_bits + 1)); + } + + enum class stage : std::uint64_t + { + // No rehashing + stable = 0, + + // Initializing expansion, allocate space + exp_init = 1, + // Processing expansion, relocating entries + exp_proc = 2, + // Finalizing expansion + exp_fin = 3, + + // Block rehashing + blocking = 4, + + // Initializing shrinkage + shrk_init = 5, + // Processing shrinkage, relocating entries + shrk_proc = 6, + // Finalizing shrinkage, modify node_block pointers + shrk_fin = 7 + }; + // Initialization and finalizing stage are ignored by accessors. + // Accessors help relocation during processing stage. + + static stage stage_of(std::uint64_t old_svar) + { + return static_cast(old_svar % 8); + } + + // Mark of svar is flipped each time + // try_halve_capacity() or try_double_capacity() is called. + // Relocation will check buckets and flip their marks accordingly. 
+ static bool mark_of(std::uint64_t old_svar) + { + return ((old_svar + 7) / 8) % 2 == 1; + } + + static bool is_relocating(std::uint64_t old_svar) + { + return (is_exp_proc(old_svar) || is_shrk_proc(old_svar)); + } + + static bool is_exp_proc(std::uint64_t old_svar) + { + return (stage_of(old_svar) == stage::exp_proc); + } + + static bool is_shrk_proc(std::uint64_t old_svar) + { + return (stage_of(old_svar) == stage::shrk_proc); + } + + static size_t block_index(size_t bucket_index) + { + return 64 - __lzcnt64(bucket_index / embed_cnt); + } + + static size_t block_offset(size_t bucket_index) + { + return bucket_index - ((1UL << block_index(bucket_index)) >> 1) * embed_cnt; + } + + node_pointer locate_bucket(size_t idx) const + { + size_t block_idx = block_index(idx); + size_t block_off = block_offset(idx); + node_pointer base = node_blocks[block_idx]; + return base + block_off; + } + + // Load a consistent triplet of + bucket_tuple load_buckets(Hash h) const + { + while (true) + { + std::uint64_t old_svar = svar.load(); + size_t b_cnt = bucket_count(); + node_pointer b1 = locate_bucket(h % b_cnt); + node_pointer b2 = locate_bucket(h % b_cnt + b_cnt); + + /// TODO: Investigate this carefully!!!! + node_pointer b = is_exp_proc(old_svar) ? locate_bucket(h % (b_cnt * 2)) : b1; + // Initializing stages and finalizing stages are dangerous zones! + /// TODO: Investigate extremely carefully and optimize. + if (old_svar != svar.load() || old_svar % 2 != 0) + { + mm_pause(); + continue; + } + if (!is_relocating(old_svar)) + return bucket_tuple{old_svar, b_cnt, b1, nullptr, b}; + else + return bucket_tuple{old_svar, b_cnt, b1, b2, b}; + } + } + + bucket_tuple load_and_lock_buckets(Hash h) + { + while (true) + { + std::uint64_t old_svar = svar.load(); + size_t b_cnt = bucket_count(); + node_pointer b1 = locate_bucket(h % b_cnt); + node_pointer b2 = locate_bucket(h % b_cnt + b_cnt); + + node_pointer mb = is_exp_proc(old_svar) ? 
locate_bucket(h % (b_cnt * 2)) : b1; + if (!is_relocating(old_svar)) + { + /// TODO: mm_pause()? + std::unique_lock g1{*b1, std::try_to_lock}; + if (!g1.owns_lock()) continue; + if (old_svar != svar.load() || old_svar % 2 != 0) continue; + + return bucket_tuple{old_svar, b_cnt, b1, nullptr, mb, std::move(g1)}; + } + else + { + std::unique_lock g1{*b1, std::try_to_lock}; + if (!g1.owns_lock()) continue; + std::unique_lock g2{*b2, std::try_to_lock}; + if (!g2.owns_lock()) continue; + if (old_svar != svar.load() || old_svar % 2 != 0) continue; + + return bucket_tuple{old_svar, b_cnt, b1, b2, mb, std::move(g1), std::move(g2)}; + } + } + } + + entry_pointer allocate_empty_entry(node_pointer np) + { + dynamic_assert(np != nullptr, ""); + for (size_t i = 0; i < field_cnt - 1; i++) + if (np->load_entry(i).pointer() == nullptr) + return entry_pointer{np, i}; + np = (np->next_node(alloc) == nullptr) + ? np->append_new_node(alloc) + : np->next_node(alloc); + return allocate_empty_entry(np); + } + + size_t entry_count(node_pointer node) const + { + if (node == nullptr) return 0; + size_t cnt = 0; + for (size_t i = 0; i < field_cnt - 1; i++) + if (node->load_entry(i).pointer() != nullptr) + ++cnt; + return cnt + entry_count(node->next_node(alloc)); + } + + void print(node_pointer node) + { + std::cout << std::endl; + if (node == nullptr) return; + for (size_t i = 0; i < field_cnt - 1; i++) + { + Pointer p = node->load_entry(i).pointer(); + if (p == nullptr) std::cout << "nullptr\t"; + else std::cout << extract(p) << "\t"; + } + print(node->next_node(alloc)); + } + + static void relocate(node_pointer dst_node, size_t dst_idx, node_pointer src_node, size_t src_idx) + { + tagged_pointer tp = src_node->load_entry(src_idx); + dynamic_assert(tp.pointer() != nullptr, ""); + // Copy then erase, so that lockless readers will not miss the entry + dst_node->set_entry(dst_idx, tp); + src_node->erase_entry(src_idx); + } + + void compress(node_pointer bkt) + { + std::deque empties; + 
node_pointer np = bkt; + while (np != nullptr) + { + for (size_t i = 0; i < field_cnt - 1; i++) + { + tagged_pointer tp = np->load_entry(i); + if (tp.pointer() == nullptr) + { + // Empty entry, reserved for compaction + empties.emplace_back(np, i); + continue; + } + + if (empties.empty()) continue; + + // Relocate + auto dst = empties.front(); + empties.pop_front(); + relocate(dst.np, dst.idx, np, i); + empties.emplace_back(np, i); + } + np = np->next_node(alloc); + } + node::remove_empty_nodes(bkt, alloc); + } + + // Recursively flip marks in a bucket that won't need relocation + // During shrinkage, lower half of buckets does not need relocation, + // but their mark has to be flipped. + void flip_mark(node_pointer node) + { + if (node == nullptr) return; + + flip_mark(node->next_node(alloc)); + + for (size_t i = 0; i < field_cnt - 1; i++) + node->entry_mark_flip(i); + + node->meta_mark_flip(); + return; + } + + // Scan and relocate entries in sub-chain led by src_node in src_bucket + // It's guaranteed that the mark of dst_bucket and src_bucket are flipped together. 
+ size_t relocate_helper(node_pointer dst_bucket, node_pointer src_bucket, node_pointer src_node, size_t deref_lim) + { + if (src_node == nullptr) return 0; + + std::uint64_t old_svar = svar.load(); + dynamic_assert(is_relocating(old_svar), ""); + bool mark = mark_of(old_svar); + dynamic_assert(src_bucket->meta_mark() != mark, ""); + dynamic_assert(dst_bucket->meta_mark() != mark, ""); + + // The node already scanned and relocated + if (src_node->meta_mark() == mark) return 0; + + size_t deref_cnt = relocate_helper(dst_bucket, src_bucket, src_node->next_node(alloc), deref_lim); + deref_lim -= deref_cnt; + dynamic_assert(old_svar == svar.load(), ""); + if (deref_lim == 0) return deref_cnt; + + size_t b_cnt = bucket_count(); + + for (size_t i = 0; i < field_cnt - 1; i++) + { + if (deref_lim == 0) return deref_cnt; + + // Already relocated, skip + if (src_node->entry_mark(i) == mark) continue; + + src_node->entry_mark_flip(i); + tagged_pointer tp = src_node->load_entry(i); + + // Empty entry, skip + if (tp.pointer() == nullptr) continue; + + // extract() is the potential bottleneck + --deref_lim; + ++deref_cnt; + Hash h = hasher(extract(tp.pointer())); + + // No need to relocate, skip + if (h % (b_cnt * 2) < b_cnt) continue; + + // Relocation necessary, find an empty entry + entry_pointer dst = allocate_empty_entry(dst_bucket); + relocate(dst.np, dst.idx, src_node, i); + + dynamic_assert(old_svar == svar.load(), ""); + } + // Whole node scanned + // When src_node is the leading node, + // flip the mark means the whole bucket is relocated. 
+ src_node->meta_mark_flip(); + + // Relocation is done + if (src_bucket == src_node) + { + compress(src_bucket); + compress(dst_bucket); + flip_mark(dst_bucket); + } + + dynamic_assert(old_svar == svar.load(), ""); + return deref_cnt; + } + + size_t relocate_bucket(node_pointer dst_bucket, node_pointer src_bucket, size_t deref_lim) + { + return relocate_helper(dst_bucket, src_bucket, src_bucket, deref_lim); + } + + void help_end_relocation() + { + std::uint64_t old_svar = svar.load(); + if (!is_relocating(old_svar)) return; + + // Not done yet. + size_t b_cnt = bucket_count(); + if (pvar.load() != b_cnt) return; + + // Only one thread is allowed to continue + if (!pvar.compare_exchange_strong(b_cnt, -1UL)) return; + if (is_exp_proc(old_svar)) + { + // End of expansion + svar.fetch_add(1); + active_blocks.fetch_add(active_blocks.load() + 1); + svar.fetch_add(5); + } + else + { + dynamic_assert(is_shrk_proc(old_svar), ""); + // End of shrinkage + svar.fetch_add(1UL); + node_pointer old_block = node_blocks[block_index(b_cnt)]; + node_blocks[block_index(b_cnt)] = nullptr; + svar.fetch_add(1UL); + std::this_thread::sleep_for(std::chrono::milliseconds{1000}); + for (size_t i = 0; i < b_cnt; i++) + { + node_pointer np = old_block + i; + node::remove_appended_nodes(np, alloc); + node_alloc_traits::destroy(alloc, np); + } + node_alloc_traits::deallocate(alloc, old_block, b_cnt); + } + } + + // When bucket associated with accessor is relocated, + // accessor helps relocate entries in other buckets. + // Since locks are acquired inside, we have to pass old_svar from outside + /// TODO: can we load old_svar inside? 
+ size_t relocate_global(size_t deref_lim, std::uint64_t old_svar) + { + // Logical bucket count + size_t b_cnt = bucket_count(); + size_t deref_cnt = 0; + + for (size_t idx = pvar.load(); idx < b_cnt && deref_lim != 0; idx++) + { + // Loading a new bucket pair also counts as a dereference + deref_lim--; + bucket_tuple tuple = load_buckets(idx); + if (old_svar != tuple.old_svar) return deref_cnt; + std::unique_lock guard1{*tuple.bucket1, std::try_to_lock}; + if (!guard1.owns_lock()) + { + idx += __rdtsc() % (b_cnt - idx); + continue; + } + std::unique_lock guard2{*tuple.bucket2, std::try_to_lock}; + if (!guard2.owns_lock()) continue; + + if (tuple.bucket1->meta_mark() != mark_of(old_svar)) + { + // Both buckets locked, + // meta_mark indicates necessity for relocation + // svar not incremented + dynamic_assert(is_relocating(old_svar), ""); + size_t cnt = is_exp_proc(old_svar) + ? relocate_bucket(tuple.bucket2, tuple.bucket1, deref_lim) + : relocate_bucket(tuple.bucket1, tuple.bucket2, deref_lim); + deref_lim -= cnt; + deref_cnt += cnt; + } + + // Relocated bucket pair, refresh relocation progress + // pvar is protected by lock on bucket indexed by idx1 + if (tuple.bucket1->meta_mark() == mark_of(old_svar)) + if (idx == pvar.load()) + pvar.fetch_add(1U); + } + return deref_cnt; + } + + // Recursively traverse a bucket backwards. + // Since compaction only moves entry from back of a bucket to front, + // we avoid missing an entry during rehash. + Pointer search_backward(Key const& key, tag_type tag, node_pointer np) const + { + if (np == nullptr) return nullptr; + + Pointer p = search_backward(key, tag, np->next_node(alloc)); + if (p != nullptr) return p; + + // Next node does not exist or does not contain entry of interest. + // Lookup key in current node. 
+ for (size_t i = 0; i < field_cnt - 1; i++) + { + size_t idx = field_cnt - 2 - i; + tagged_pointer tp = np->load_entry(idx); + + if (tp.pointer() == nullptr || tp.tag() != tag) continue; + if (!equal(extract(tp.pointer()), key)) continue; + return tp.pointer(); + } + return nullptr; + } + + // Traverse a bucket in normal order. + entry_pointer search_forward(Key const& key, tag_type tag, node_pointer node) + { + if (node == nullptr) return entry_pointer{}; + + mask_type match = node->match_tag(tag); + for (size_t idx = node::consume_mask(match); idx < field_cnt - 1; idx = node::consume_mask(match)) + if (equal(extract(node->load_entry(idx).pointer()), key)) + return entry_pointer{node, idx}; + + return search_forward(key, tag, node->next_node(alloc)); + } + + private: + node_alloc alloc; + // Nodes are organized into blocks. + // First embed_cnt nodes are put in embedded_block, + // and when rehashed, new nodes are allocated into node_blocks. + // allocated_nodes[0] points to embedded_block and + // allocated_nodes[k] contains embed_cnt * 2^(k-1) nodes + std::array embedded_block{}; + std::array node_blocks{}; + // Initially only embedded_block is active + std::atomic_uint64_t active_blocks{1U}; + + // state variable indicating the map shrinking or expanding + std::atomic_uint64_t svar{0UL}; + // progress variable indicating relocation progress + // 0 for start, -1 for finish + std::atomic pvar{-1UL}; + + typename maybe_add_pointer::type hasher; + typename maybe_add_pointer::type equal; + typename maybe_add_pointer::type extract; +}; + +} // KVDK_NAMESPACE diff --git a/engine/experimental/vhash.cpp b/engine/experimental/vhash.cpp new file mode 100644 index 00000000..76c86df5 --- /dev/null +++ b/engine/experimental/vhash.cpp @@ -0,0 +1,130 @@ +#include "vhash.hpp" +#include "../alias.hpp" +#include "../macros.hpp" +#include "../version/old_records_cleaner.hpp" + +namespace KVDK_NAMESPACE +{ + +template +Status VHash::Get(StringView key, CopyTo copy, void* dst) 
+KVDK_TRY +{ + VHashKV* kvp = hpmap.lookup(key, hpmap.lockless); + if (kvp == nullptr) return Status::NotFound; + copy(dst, kvp->Value()); + return Status::Ok; +} +KVDK_HANDLE_EXCEPTIONS + +Status VHash::Put(StringView key, StringView value) +KVDK_TRY +{ + VHashKV* new_kv = kvb.NewKV(key, value); + VHashKV* old_kv = nullptr; + { + auto acc = hpmap.lookup(key, hpmap.acquire_lock); + old_kv = acc.pointer(); + acc.set_pointer(new_kv); + } + if (old_kv == nullptr) sz.fetch_add(1LL); + kvb.DeleteKV(old_kv); + return Status::Ok; +} +KVDK_HANDLE_EXCEPTIONS + +Status VHash::Delete(StringView key) +KVDK_TRY +{ + VHashKV* old_kv = nullptr; + { + auto acc = hpmap.lookup(key, hpmap.acquire_lock); + old_kv = acc.pointer(); + acc.erase(); + } + if (old_kv != nullptr) sz.fetch_sub(1LL); + kvb.DeleteKV(old_kv); + return Status::Ok; +} +KVDK_HANDLE_EXCEPTIONS + +Status VHash::DeleteAll() +KVDK_TRY +{ + for(auto iter = hpmap.begin(); iter != hpmap.end(); ++iter) + { + VHashKV* old_kv = iter->pointer(); + iter->erase(); + kvb.DeleteKV(old_kv); + sz.fetch_sub(1LL); + } + kvdk_assert(sz.load() == 0LL, ""); + return Status::Ok; +} +KVDK_HANDLE_EXCEPTIONS + +template +Status VHash::Modify(StringView key, ModifyFunc modify, void* cb_args, Cleanup cleanup) +KVDK_TRY +{ + VHashKV* old_kv = nullptr; + ModifyOperation op; + { + auto acc = hpmap.lookup(key, hpmap.acquire_lock); + old_kv = acc.pointer(); + StringView new_value; + op = modify(old_kv ? 
old_kv->Value() : nullptr, &new_value, cb_args); + switch (op) + { + case ModifyOperation::Write: + { + VHashKV* new_kv = kvb.NewKV(key, new_value); + acc.set_pointer(new_kv); + if (old_kv == nullptr) sz.fetch_add(1LL); + kvb.DeleteKV(old_kv); + break; + } + case ModifyOperation::Delete: + { + kvdk_assert(old_kv != nullptr, "Invalid callback!"); + acc.erase(); + sz.fetch_sub(1LL); + kvb.DeleteKV(old_kv); + break; + } + case ModifyOperation::Noop: + case ModifyOperation::Abort: + { + break; + } + } + cleanup(new_value); + } + return (op != ModifyOperation::Abort) ? Status::Ok : Status::Abort; +} +KVDK_HANDLE_EXCEPTIONS + +void VHashKVBuilder::PurgeKV(VHashKV* kv) +{ + kv->~VHashKV(); + alloc.Deallocate(kv, sizeof(VHashKV) + kv->Key().size() + kv->Value().size()); +} + +VHash* VHashBuilder::NewVHash(StringView name) +{ + return new VHash{name, kvb}; +} + +void VHashBuilder::DeleteVHash(VHash* vhash) +{ + if (vhash == nullptr) return; + cleaner.DelayDelete(vhash); +} + +void VHashBuilder::PurgeVHash(VHash* vhash) +{ + auto iter = vhash-> +} + + +} // namespace KVDK_NAMESPACE diff --git a/engine/experimental/vhash.hpp b/engine/experimental/vhash.hpp new file mode 100644 index 00000000..ccc8dcf4 --- /dev/null +++ b/engine/experimental/vhash.hpp @@ -0,0 +1,69 @@ +#pragma once + +#include +#include + +#include "hashptr_map.hpp" +#include "vhash_kv.hpp" +#include "kvdk/iterator.hpp" + +namespace KVDK_NAMESPACE +{ +/// TODO: Support dynamically choose a allocator when creating VHash +/// Currently VHash allocate KVs by VHashKVBuilder, +/// which can bind to other allocators, +/// but VHashBuilder does not support custom allocator for +/// hashptr_map and name +class VHash +{ +private: + hashptr_map hpmap; + VHashKVBuilder& kvb; + std::atomic_int64_t sz{0LL}; + std::string name; + +public: + VHash(StringView n, VHashKVBuilder& b) : hpmap{4, VHashKV::ExtractKey}, kvb{b}, name{n.data(), n.size()} {} + + StringView Name() const { return name; } + + // CopyTo should have 
signature of + // void(*)(void* dst, StringView src) + template + Status Get(StringView key, CopyTo copy, void* dst); + + Status Put(StringView key, StringView value); + + Status Delete(StringView key); + + // ModifyFunc should have signature of + // ModifyOperation(*)(StringView const* old_val, StringView& new_val, void* args) + // Cleanup should have signature of + // void(*)(StringView new_val) + // for cleaning up memory allocated by ModifyFunc. + template + Status Modify(StringView key, ModifyFunc modify, void* cb_args, Cleanup cleanup); + + // Called by VHashBuilder::PurgeVHash() to Delete all VHashKVs inside it. + Status DeleteAll(); +}; + + +class VHashBuilder +{ +private: + OldRecordsCleaner& cleaner; + VHashKVBuilder& kvb; + +public: + VHashBuilder(OldRecordsCleaner& c, VHashKVBuilder& b) : cleaner{c}, kvb{b} {} + + // Called by VHashGroup to create a VHash. + VHash* NewVHash(StringView name); + // Called by VHashGroup to "delete" a VHash. Delegate the deletion to cleaner. + void DeleteVHash(VHash* vhash); + // Called by cleaner to actually delete the VHash and recycle its memory. 
+ void PurgeVHash(VHash* vhash); +}; + +} // KVDK_NAMESPACE diff --git a/engine/experimental/vhash_group.hpp b/engine/experimental/vhash_group.hpp new file mode 100644 index 00000000..282126ca --- /dev/null +++ b/engine/experimental/vhash_group.hpp @@ -0,0 +1,42 @@ +#include "hashptr_map.hpp" +#include "vhash.hpp" + +#pragma once + +#include + +#include "hashptr_map.hpp" +#include "vhash_kv.hpp" + +namespace KVDK_NAMESPACE +{ + +class VHashGroup +{ +private: + hashptr_map hpmap; + VHashKVBuilder& kvb; + std::atomic_int64_t sz{0LL}; + +public: + VHashGroup(VHashKVBuilder& b) : hpmap{4, VHash::Name}, kvb{b} {} + + // CopyTo should have signature of + // void(*)(void* dst, StringView src) + template + Status Get(StringView key, CopyTo copy, void* dst); + + Status Put(StringView key, StringView value); + + Status Delete(StringView key); + + // ModifyFunc should have signature of + // ModifyOperation(*)(StringView const* old_val, StringView& new_val, void* args) + // Cleanup should have signature of + // void(*)(StringView new_val) + // for cleaning up memory allocated by ModifyFunc. 
+ template + Status Modify(StringView key, ModifyFunc modify, void* cb_args, Cleanup cleanup); +}; + +} // KVDK_NAMESPACE diff --git a/engine/experimental/vhash_kv.cpp b/engine/experimental/vhash_kv.cpp new file mode 100644 index 00000000..ea857238 --- /dev/null +++ b/engine/experimental/vhash_kv.cpp @@ -0,0 +1,26 @@ +#include "vhash_kv.hpp" +#include "../version/old_records_cleaner.hpp" + +namespace KVDK_NAMESPACE +{ + +VHashKV* VHashKVBuilder::NewKV(StringView key, StringView value) +{ + void* dst = alloc.Allocate(sizeof(VHashKV) + key.size() + value.size()); + if (dst != nullptr) new(dst) VHashKV{key, value}; + return static_cast(dst); +} + +void VHashKVBuilder::DeleteKV(VHashKV* kv) +{ + if (kv == nullptr) return; + cleaner.DelayDelete(kv); +} + +void VHashKVBuilder::PurgeKV(VHashKV* kv) +{ + kv->~VHashKV(); + alloc.Deallocate(kv, sizeof(VHashKV) + kv->Key().size() + kv->Value().size()); +} + +} // namespace KVDK_NAMESPACE diff --git a/engine/experimental/vhash_kv.hpp b/engine/experimental/vhash_kv.hpp new file mode 100644 index 00000000..43707895 --- /dev/null +++ b/engine/experimental/vhash_kv.hpp @@ -0,0 +1,70 @@ +#pragma once + +#include + +#include "../alias.hpp" +#include "../macros.hpp" +#include "../allocator.hpp" + +namespace KVDK_NAMESPACE +{ + +/// TODO: Add timestamp field for MVCC if necessary +class VHashKV +{ +private: + std::uint32_t key_sz; + std::uint32_t value_sz; + char data[]; + +public: + VHashKV(StringView key, StringView value) + { + kvdk_assert(key.size() <= std::numeric_limits::max(), ""); + kvdk_assert(value.size() <= std::numeric_limits::max(), ""); + key_sz = static_cast(key.size()); + value_sz = static_cast(value.size()); + memcpy(data, key.data(), key.size()); + memcpy(data + key_sz, value.data(), value.size()); + } + + StringView Key() const + { + return StringView{data, key_sz}; + } + + StringView Value() const + { + return StringView{data+key_sz, value_sz}; + } + + static StringView ExtractKey(VHashKV* kvp) + { + return 
kvp->Key(); + } +}; + +class OldRecordsCleaner; + +class VHashKVBuilder +{ +private: + IVolatileAllocator& alloc; + OldRecordsCleaner& cleaner; + +public: + VHashKVBuilder(IVolatileAllocator& a, OldRecordsCleaner& c) : alloc{a}, cleaner{c} {} + VHashKVBuilder(VHashKVBuilder const&) = delete; + VHashKVBuilder(VHashKVBuilder&&) = default; + + // Called by VHash to create a KV. + VHashKV* NewKV(StringView key, StringView value); + // Called by VHash to "delete" a KV. Delegate the deletion to cleaner. + void DeleteKV(VHashKV* kv); + // Called by cleaner to actually delete the KV and recycle its memory. + void PurgeKV(VHashKV* kv); +}; + + + +} // KVDK_NAMESPACE diff --git a/engine/kv_engine.hpp b/engine/kv_engine.hpp index 22853c9f..e1819464 100644 --- a/engine/kv_engine.hpp +++ b/engine/kv_engine.hpp @@ -22,6 +22,7 @@ #include "alias.hpp" #include "data_record.hpp" #include "dram_allocator.hpp" +#include "experimental/vhash.hpp" #include "hash_collection/hash_list.hpp" #include "hash_collection/rebuilder.hpp" #include "hash_table.hpp" @@ -229,6 +230,17 @@ class KVEngine : public Engine { Status* s) final; void HashIteratorRelease(HashIterator*) final; + // Volatile Hash + Status VHashCreate(StringView key) final; + Status VHashDestroy(StringView key) final; + Status VHashSize(StringView key, size_t* len) final; + Status VHashGet(StringView key, StringView field, std::string* value) final; + Status VHashPut(StringView key, StringView field, StringView value) final; + Status VHashDelete(StringView key, StringView field) final; + Status VHashModify(StringView key, StringView field, ModifyFunc modify_func, + void* cb_args) final; + std::unique_ptrVHashIteratorCreate(StringView key, Status* s) final; + private: // Look up a first level key in hash table(e.g. 
collections or string, not // collection elems), the first level key should be unique among all types @@ -607,6 +619,11 @@ class KVEngine : public Engine { OldRecordsCleaner old_records_cleaner_; Cleaner cleaner_; + CharAllocator char_alloc_; + std::mutex vhashes_mu_; + VHashKVBuilder vhash_kvb_{char_alloc_, old_records_cleaner_}; + std::unordered_map> vhashes_; + ComparatorTable comparators_; struct BackgroundWorkSignals { diff --git a/engine/kv_engine_vhash.cpp b/engine/kv_engine_vhash.cpp new file mode 100644 index 00000000..5613b082 --- /dev/null +++ b/engine/kv_engine_vhash.cpp @@ -0,0 +1,38 @@ +#include "kv_engine.hpp" + +namespace KVDK_NAMESPACE { + +Status KVEngine::VHashCreate(StringView key) { +return Status::NotSupported; +} + +Status KVEngine::VHashDestroy(StringView key) { +return Status::NotSupported; +} + +Status KVEngine::VHashSize(StringView key, size_t* len) { +return Status::NotSupported; +} + +Status KVEngine::VHashGet(StringView key, StringView field, std::string* value) { +return Status::NotSupported; +} + +Status KVEngine::VHashPut(StringView key, StringView field, StringView value) { +return Status::NotSupported; +} + +Status KVEngine::VHashDelete(StringView key, StringView field) { +return Status::NotSupported; +} + +Status KVEngine::VHashModify(StringView key, StringView field, ModifyFunc modify_func, + void* cb_args) { +return Status::NotSupported; +} + +std::unique_ptrKVEngine::VHashIteratorCreate(StringView key, Status* s) { +return nullptr; +} + +} // namespace KVDK_NAMESPACE diff --git a/engine/macros.hpp b/engine/macros.hpp index b40c546e..9a8d3359 100644 --- a/engine/macros.hpp +++ b/engine/macros.hpp @@ -4,6 +4,8 @@ #include #include +#include "kvdk/types.hpp" + #define to_hex(x) \ std::hex << std::setfill('0') << std::setw(sizeof(decltype(x)) * 2) << x \ << std::dec @@ -20,3 +22,17 @@ ":\t" + std::string{msg}}; \ } \ } + +namespace KVDK_NAMESPACE +{ +#define KVDK_TRY try + +#define KVDK_HANDLE_EXCEPTIONS \ +catch(std::bad_alloc 
const&) { return Status::MemoryOverflow; } \ +catch(std::out_of_range const&) { return Status::OutOfRange; } \ +catch(std::invalid_argument const&) { return Status::InvalidArgument; } \ +catch(...) { return Status::Abort; } + +} // KVDK_NAMESPACE + + diff --git a/engine/version/old_records_cleaner.cpp b/engine/version/old_records_cleaner.cpp index 2eeae24f..0ec7b32c 100644 --- a/engine/version/old_records_cleaner.cpp +++ b/engine/version/old_records_cleaner.cpp @@ -43,6 +43,42 @@ void OldRecordsCleaner::PushToPendingFree(void* addr, TimeStampType ts) { } } +template +void OldRecordsCleaner::tryDelete(Deleter del, PendingQueue& pending_kvs, size_t lim) +{ + maybeUpdateOldestSnapshot(); + TimeStampType acc_ts = kv_engine_->version_controller_.LocalOldestSnapshotTS(); + for (size_t i = 0; i < lim && !pending_kvs.empty(); i++) + { + auto const& pair = pending_kvs.front(); + if (pair.first >= acc_ts) break; + del(pair.second); + pending_kvs.pop_front(); + } +} + +template +void OldRecordsCleaner::DelayDelete(KVType* kv) +{ + static_assert(std::is_same::value, ""); + + kvdk_assert(access_thread.id >= 0, ""); + auto& tc = cleaner_thread_cache_[access_thread.id]; + std::lock_guard guard{tc.old_records_lock}; + + TimeStampType ts = kv_engine_->version_controller_.GetCurrentTimestamp(); + if (std::is_same::value) + { + tc.pending_vhash_kvs.emplace_back(ts, kv); + tryDelete( + [&](VHashKV* outdated_kv){kv_engine_->vhash_kvb_.PurgeKV(outdated_kv);}, + tc.pending_vhash_kvs, + 16 + ); + } +} +template void OldRecordsCleaner::DelayDelete(VHashKV*); + void OldRecordsCleaner::TryGlobalClean() { std::vector space_to_free; // Update recorded oldest snapshot up to state so we can know which records @@ -66,6 +102,13 @@ void OldRecordsCleaner::TryGlobalClean() { } cleaner_thread_cache.pending_free_space_entries.clear(); } + + std::lock_guard lg(cleaner_thread_cache.old_records_lock); + tryDelete( + [&](VHashKV* outdated_kv){kv_engine_->vhash_kvb_.PurgeKV(outdated_kv);}, + 
cleaner_thread_cache.pending_vhash_kvs, + 16 + ); } auto iter = global_pending_free_space_entries_.begin(); diff --git a/engine/version/old_records_cleaner.hpp b/engine/version/old_records_cleaner.hpp index 8fba01e4..0ee05bfe 100644 --- a/engine/version/old_records_cleaner.hpp +++ b/engine/version/old_records_cleaner.hpp @@ -5,6 +5,7 @@ #include #include +#include #include #include "../alias.hpp" @@ -13,6 +14,7 @@ #include "../kv_engine_cleaner.hpp" #include "../thread_manager.hpp" #include "../utils/utils.hpp" +#include "../experimental/vhash_kv.hpp" #include "kvdk/configs.hpp" #include "version_controller.hpp" @@ -32,23 +34,35 @@ class OldRecordsCleaner { } void PushToPendingFree(void* addr, TimeStampType ts); + + /// TODO: unify DelayDelete with static polymorphism + /// It cannot be done with dynamic polymorphism as sometimes + /// we cannot afford the memory usage by virtual function pointer + /// and if we use shared memory dynamic polymorphism is impossible. + template + void DelayDelete(KVType* kv); // Try to clean global old records void TryGlobalClean(); private: struct CleanerThreadCache { std::deque pending_free_space_entries{}; + std::deque> pending_vhash_kvs; SpinMutex old_records_lock; }; const uint64_t kLimitCachedDeleteRecords = 1000000; void maybeUpdateOldestSnapshot(); + template + void tryDelete(Deleter del, PendingQueue& pending_kvs, size_t lim); + KVEngine* kv_engine_; Array cleaner_thread_cache_; std::deque global_pending_free_space_entries_; + std::deque> global_pending_vhash_kvs; }; } // namespace KVDK_NAMESPACE diff --git a/examples/kvredis/redis b/examples/kvredis/redis new file mode 160000 index 00000000..4930d19e --- /dev/null +++ b/examples/kvredis/redis @@ -0,0 +1 @@ +Subproject commit 4930d19e70c391750479951022e207e19111eb55 diff --git a/include/kvdk/engine.hpp b/include/kvdk/engine.hpp index c802bd59..3a7a0018 100644 --- a/include/kvdk/engine.hpp +++ b/include/kvdk/engine.hpp @@ -335,6 +335,18 @@ class Engine { Status* s = nullptr) 
= 0; virtual void HashIteratorRelease(HashIterator*) = 0; + /// Volatile Hash APIs ////////////////////////////////////////////////////// + + virtual Status VHashCreate(StringView key) = 0; + virtual Status VHashDestroy(StringView key) = 0; + virtual Status VHashSize(StringView key, size_t* len) = 0; + virtual Status VHashGet(StringView key, StringView field, std::string* value) = 0; + virtual Status VHashPut(StringView key, StringView field, StringView value) = 0; + virtual Status VHashDelete(StringView key, StringView field) = 0; + virtual Status VHashModify(StringView key, StringView field, ModifyFunc modify_func, + void* cb_args) = 0; + virtual std::unique_ptrVHashIteratorCreate(StringView key, Status* s) = 0; + /// Other /////////////////////////////////////////////////////////////////// // Get a snapshot of the instance at this moment. diff --git a/include/kvdk/iterator.hpp b/include/kvdk/iterator.hpp index 10e7e981..ca71ba2f 100644 --- a/include/kvdk/iterator.hpp +++ b/include/kvdk/iterator.hpp @@ -79,4 +79,19 @@ class HashIterator { virtual ~HashIterator() = default; }; +class VHashIterator { + public: + virtual void SeekToFirst() = 0; + + virtual void Next() = 0; + + virtual bool Valid() = 0; + + virtual std::string Key() const = 0; + + virtual std::string Value() const = 0; + + virtual ~VHashIterator() = default; +}; + } // namespace KVDK_NAMESPACE \ No newline at end of file From 950bd14d0a2dfe17f18f4ede6cd963fc4da9ae19 Mon Sep 17 00:00:00 2001 From: ZiyanShi Date: Tue, 6 Sep 2022 02:16:55 -0400 Subject: [PATCH 02/23] build success Signed-off-by: ZiyanShi --- CMakeLists.txt | 3 +- engine/allocator.hpp | 7 +- engine/experimental/hashptr_map.hpp | 18 +++- engine/experimental/vhash.cpp | 81 ++++++++------ engine/experimental/vhash.hpp | 67 ++++++++---- engine/experimental/vhash_group.cpp | 39 +++++++ engine/experimental/vhash_group.hpp | 37 +++---- engine/experimental/vhash_kv.cpp | 15 ++- engine/experimental/vhash_kv.hpp | 18 ++-- engine/kv_engine.hpp | 6 
+- engine/kv_engine_vhash.cpp | 141 ++++++++++++++++++++++--- engine/version/old_records_cleaner.cpp | 55 +++++----- engine/version/old_records_cleaner.hpp | 32 ++++-- include/kvdk/iterator.hpp | 2 +- 14 files changed, 371 insertions(+), 150 deletions(-) create mode 100644 engine/experimental/vhash_group.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 464cac44..249dcd2a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,7 +53,8 @@ set(SOURCES engine/c/kvdk_sorted.cpp engine/c/kvdk_string.cpp engine/experimental/vhash_kv.cpp - engine/experimental/vhash_kv.cpp + engine/experimental/vhash.cpp + engine/experimental/vhash_group.cpp engine/utils/utils.cpp engine/utils/sync_point.cpp engine/engine.cpp diff --git a/engine/allocator.hpp b/engine/allocator.hpp index 16553663..1e0dcb47 100644 --- a/engine/allocator.hpp +++ b/engine/allocator.hpp @@ -31,13 +31,16 @@ class Allocator { class IVolatileAllocator { public: - // No throw, It's up to caller to check for nullptr. virtual void* Allocate(size_t bytes) = 0; virtual void Deallocate(void* addr, size_t bytes) = 0; }; class CharAllocator final : public IVolatileAllocator { - void* Allocate(size_t n) final { return ::malloc(n); } + void* Allocate(size_t n) final { + void* mem = ::malloc(n); + if (mem == nullptr) throw std::bad_alloc{}; + return mem; + } void Deallocate(void* addr, size_t) final { ::free(addr); } }; diff --git a/engine/experimental/hashptr_map.hpp b/engine/experimental/hashptr_map.hpp index ec74d3cf..ab6a67de 100644 --- a/engine/experimental/hashptr_map.hpp +++ b/engine/experimental/hashptr_map.hpp @@ -569,14 +569,18 @@ class hashptr_map map->help_end_relocation(); } - Pointer pointer() + Pointer pointer() const { return (loc.np == nullptr) ? 
nullptr : loc.np->load_entry(loc.idx).pointer(); } void set_pointer(Pointer p) { - if (loc.np == nullptr) + if (p == nullptr) + { + erase(); + } + else if (loc.np == nullptr) { // Since relocation may have been done by lookup(), // we must guarantee Pointer p is inserted to the correct bucket. @@ -668,11 +672,21 @@ class hashptr_map return acc; } + accessor const& operator*() const + { + return acc; + } + accessor* operator->() { return &acc; } + accessor const* operator->() const + { + return &acc; + } + private: friend hashptr_map; diff --git a/engine/experimental/vhash.cpp b/engine/experimental/vhash.cpp index 76c86df5..40623f37 100644 --- a/engine/experimental/vhash.cpp +++ b/engine/experimental/vhash.cpp @@ -6,19 +6,15 @@ namespace KVDK_NAMESPACE { -template -Status VHash::Get(StringView key, CopyTo copy, void* dst) -KVDK_TRY +Status VHash::Get(StringView key, VHash::CopyFunc copy, void* dst) { VHashKV* kvp = hpmap.lookup(key, hpmap.lockless); if (kvp == nullptr) return Status::NotFound; copy(dst, kvp->Value()); return Status::Ok; } -KVDK_HANDLE_EXCEPTIONS Status VHash::Put(StringView key, StringView value) -KVDK_TRY { VHashKV* new_kv = kvb.NewKV(key, value); VHashKV* old_kv = nullptr; @@ -28,13 +24,11 @@ KVDK_TRY acc.set_pointer(new_kv); } if (old_kv == nullptr) sz.fetch_add(1LL); - kvb.DeleteKV(old_kv); + kvb.Recycle(old_kv); return Status::Ok; } -KVDK_HANDLE_EXCEPTIONS Status VHash::Delete(StringView key) -KVDK_TRY { VHashKV* old_kv = nullptr; { @@ -43,29 +37,26 @@ KVDK_TRY acc.erase(); } if (old_kv != nullptr) sz.fetch_sub(1LL); - kvb.DeleteKV(old_kv); + kvb.Recycle(old_kv); return Status::Ok; } -KVDK_HANDLE_EXCEPTIONS -Status VHash::DeleteAll() -KVDK_TRY +Status VHash::deleteAll() { for(auto iter = hpmap.begin(); iter != hpmap.end(); ++iter) { VHashKV* old_kv = iter->pointer(); iter->erase(); - kvb.DeleteKV(old_kv); + // It's safe to call Delete() instead of Recycle() here, + // As deleteAll() is called by cleaner. 
+ kvb.Delete(old_kv); sz.fetch_sub(1LL); } kvdk_assert(sz.load() == 0LL, ""); return Status::Ok; } -KVDK_HANDLE_EXCEPTIONS -template -Status VHash::Modify(StringView key, ModifyFunc modify, void* cb_args, Cleanup cleanup) -KVDK_TRY +Status VHash::Modify(StringView key, VHash::ModifyFunc modify, void* cb_args, VHash::Cleanup cleanup) { VHashKV* old_kv = nullptr; ModifyOperation op; @@ -73,7 +64,9 @@ KVDK_TRY auto acc = hpmap.lookup(key, hpmap.acquire_lock); old_kv = acc.pointer(); StringView new_value; - op = modify(old_kv ? old_kv->Value() : nullptr, &new_value, cb_args); + StringView old_value; + old_value = old_kv ? old_kv->Value() : old_value; + op = modify(old_kv ? &old_value : nullptr, new_value, cb_args); switch (op) { case ModifyOperation::Write: @@ -81,7 +74,7 @@ KVDK_TRY VHashKV* new_kv = kvb.NewKV(key, new_value); acc.set_pointer(new_kv); if (old_kv == nullptr) sz.fetch_add(1LL); - kvb.DeleteKV(old_kv); + kvb.Recycle(old_kv); break; } case ModifyOperation::Delete: @@ -89,7 +82,7 @@ KVDK_TRY kvdk_assert(old_kv != nullptr, "Invalid callback!"); acc.erase(); sz.fetch_sub(1LL); - kvb.DeleteKV(old_kv); + kvb.Recycle(old_kv); break; } case ModifyOperation::Noop: @@ -102,29 +95,57 @@ KVDK_TRY } return (op != ModifyOperation::Abort) ? Status::Ok : Status::Abort; } -KVDK_HANDLE_EXCEPTIONS -void VHashKVBuilder::PurgeKV(VHashKV* kv) +void VHash::Iterator::SeekToFirst() { + pos = owner.hpmap.begin(); +} + +void VHash::Iterator::Next() { + if (Valid()) ++pos; +} + +bool VHash::Iterator::Valid() const { + return (pos != owner.hpmap.end()); +} + +std::string VHash::Iterator::Key() const { + VHashKV* kv = pos->pointer(); + StringView key = kv->Key(); + return std::string(key.data(), key.size()); +} + +std::string VHash::Iterator::Value() const { + VHashKV* kv = pos->pointer(); + StringView value = kv->Value(); + return std::string(value.data(), value.size()); +} + +std::unique_ptr VHash::MakeIterator() { + // Initialized to end() iterator without acquiring lock. 
+ return std::unique_ptr{new Iterator{*this, hpmap.end()}}; +} + +VHashBuilder::VHashBuilder(OldRecordsCleaner& c) : cleaner{c} { - kv->~VHashKV(); - alloc.Deallocate(kv, sizeof(VHashKV) + kv->Key().size() + kv->Value().size()); + cleaner.RegisterDelayDeleter(*this); } -VHash* VHashBuilder::NewVHash(StringView name) +VHash* VHashBuilder::NewVHash(StringView name, VHashKVBuilder& kvb) { return new VHash{name, kvb}; } -void VHashBuilder::DeleteVHash(VHash* vhash) +void VHashBuilder::Recycle(VHash* vhash) { if (vhash == nullptr) return; - cleaner.DelayDelete(vhash); + cleaner.DelayDelete(*this, vhash); } -void VHashBuilder::PurgeVHash(VHash* vhash) +void VHashBuilder::Delete(void* vhash) { - auto iter = vhash-> + VHash* vh = static_cast(vhash); + vh->deleteAll(); + delete vh; } - } // namespace KVDK_NAMESPACE diff --git a/engine/experimental/vhash.hpp b/engine/experimental/vhash.hpp index ccc8dcf4..991bbde2 100644 --- a/engine/experimental/vhash.hpp +++ b/engine/experimental/vhash.hpp @@ -1,11 +1,13 @@ #pragma once +#include #include #include #include "hashptr_map.hpp" #include "vhash_kv.hpp" #include "kvdk/iterator.hpp" +#include "../version/old_records_cleaner.hpp" namespace KVDK_NAMESPACE { @@ -27,43 +29,66 @@ class VHash StringView Name() const { return name; } - // CopyTo should have signature of - // void(*)(void* dst, StringView src) - template - Status Get(StringView key, CopyTo copy, void* dst); + static StringView ExtractName(VHash* vhash) + { + return vhash->Name(); + } + + size_t Size() const { return sz.load(); } + + using CopyFunc = std::function; + // copy() will copy StringView of value to dst + Status Get(StringView key, CopyFunc copy, void* dst); Status Put(StringView key, StringView value); Status Delete(StringView key); - // ModifyFunc should have signature of - // ModifyOperation(*)(StringView const* old_val, StringView& new_val, void* args) - // Cleanup should have signature of - // void(*)(StringView new_val) - // for cleaning up memory 
allocated by ModifyFunc. - template + // Cleanup is for cleaning up memory allocated by ModifyFunc. + using ModifyFunc = std::function; + using Cleanup = std::function; Status Modify(StringView key, ModifyFunc modify, void* cb_args, Cleanup cleanup); - // Called by VHashBuilder::PurgeVHash() to Delete all VHashKVs inside it. - Status DeleteAll(); -}; + class Iterator : public VHashIterator + { + public: + void SeekToFirst() final; + void Next() final; + bool Valid() const final; + std::string Key() const final; + std::string Value() const final; + virtual ~Iterator() = default; + + private: + friend VHash; + using rep = typename decltype(hpmap)::iterator; + VHash& owner; + rep pos; + Iterator(VHash& o, rep&& p) : owner{o}, pos{std::move(p)} {} + }; + + std::unique_ptr MakeIterator(); + private: + friend class VHashBuilder; + // Called by VHashBuilder::Delete() to Delete all VHashKVs inside it. + Status deleteAll(); +}; -class VHashBuilder +class VHashBuilder : public IDeleter { private: OldRecordsCleaner& cleaner; - VHashKVBuilder& kvb; public: - VHashBuilder(OldRecordsCleaner& c, VHashKVBuilder& b) : cleaner{c}, kvb{b} {} + VHashBuilder(OldRecordsCleaner& c); // Called by VHashGroup to create a VHash. - VHash* NewVHash(StringView name); - // Called by VHashGroup to "delete" a VHash. Delegate the deletion to cleaner. - void DeleteVHash(VHash* vhash); - // Called by cleaner to actually delete the VHash and recycle its memory. 
- void PurgeVHash(VHash* vhash); + VHash* NewVHash(StringView name, VHashKVBuilder& b); + + void Recycle(VHash* vhash); + + void Delete(void* vhash) final; }; } // KVDK_NAMESPACE diff --git a/engine/experimental/vhash_group.cpp b/engine/experimental/vhash_group.cpp new file mode 100644 index 00000000..5b4fb417 --- /dev/null +++ b/engine/experimental/vhash_group.cpp @@ -0,0 +1,39 @@ +#include "vhash_group.hpp" +#include "../macros.hpp" + +namespace KVDK_NAMESPACE +{ + +Status VHashGroup::Create(StringView name) +KVDK_TRY +{ + auto acc = hpmap.lookup(name, hpmap.acquire_lock); + if (acc.pointer() != nullptr) return Status::Existed; + VHash* vhash = vhb.NewVHash(name, kvb); + acc.set_pointer(vhash); + return Status::Ok; +} +KVDK_HANDLE_EXCEPTIONS + +Status VHashGroup::Destroy(StringView name) +KVDK_TRY +{ + auto acc = hpmap.lookup(name, hpmap.acquire_lock); + VHash* vhash = acc.pointer(); + if (vhash == nullptr) return Status::NotFound; + acc.erase(); + vhb.Recycle(vhash); + return Status::Ok; +} +KVDK_HANDLE_EXCEPTIONS + +Status VHashGroup::Get(StringView name, VHash** vhash) +KVDK_TRY +{ + *vhash = hpmap.lookup(name, hpmap.lockless); + if (*vhash == nullptr) return Status::NotFound; + else return Status::Ok; +} +KVDK_HANDLE_EXCEPTIONS + +} // namespace KVDK_NAMESPACE diff --git a/engine/experimental/vhash_group.hpp b/engine/experimental/vhash_group.hpp index 282126ca..cc7b6d1d 100644 --- a/engine/experimental/vhash_group.hpp +++ b/engine/experimental/vhash_group.hpp @@ -1,42 +1,35 @@ -#include "hashptr_map.hpp" -#include "vhash.hpp" - #pragma once #include +#include "../alias.hpp" #include "hashptr_map.hpp" -#include "vhash_kv.hpp" +#include "vhash.hpp" namespace KVDK_NAMESPACE { +// A VHashGroup contains VHashes that share the same memory allocator for kvs. +/// TODO: Add hpmap_alloc to allocate memory for hashptr_maps. 
class VHashGroup { private: - hashptr_map hpmap; - VHashKVBuilder& kvb; + OldRecordsCleaner& cleaner; + IVolatileAllocator& kv_alloc; + VHashKVBuilder kvb{kv_alloc, cleaner}; + VHashBuilder vhb{cleaner}; + + hashptr_map hpmap{4, VHash::ExtractName}; std::atomic_int64_t sz{0LL}; public: - VHashGroup(VHashKVBuilder& b) : hpmap{4, VHash::Name}, kvb{b} {} - - // CopyTo should have signature of - // void(*)(void* dst, StringView src) - template - Status Get(StringView key, CopyTo copy, void* dst); + VHashGroup(IVolatileAllocator& a, OldRecordsCleaner& c) : kv_alloc{a}, cleaner{c} {} - Status Put(StringView key, StringView value); + Status Create(StringView name); - Status Delete(StringView key); + Status Destroy(StringView name); - // ModifyFunc should have signature of - // ModifyOperation(*)(StringView const* old_val, StringView& new_val, void* args) - // Cleanup should have signature of - // void(*)(StringView new_val) - // for cleaning up memory allocated by ModifyFunc. - template - Status Modify(StringView key, ModifyFunc modify, void* cb_args, Cleanup cleanup); + Status Get(StringView name, VHash** vhash); }; -} // KVDK_NAMESPACE +} // namespace KVDK_NAMESPACE diff --git a/engine/experimental/vhash_kv.cpp b/engine/experimental/vhash_kv.cpp index ea857238..7403a87f 100644 --- a/engine/experimental/vhash_kv.cpp +++ b/engine/experimental/vhash_kv.cpp @@ -4,21 +4,28 @@ namespace KVDK_NAMESPACE { +VHashKVBuilder::VHashKVBuilder(IVolatileAllocator& a, OldRecordsCleaner& c) : + alloc{a}, cleaner{c} +{ + c.RegisterDelayDeleter(*this); +} + VHashKV* VHashKVBuilder::NewKV(StringView key, StringView value) { void* dst = alloc.Allocate(sizeof(VHashKV) + key.size() + value.size()); - if (dst != nullptr) new(dst) VHashKV{key, value}; + new(dst) VHashKV{key, value}; return static_cast(dst); } -void VHashKVBuilder::DeleteKV(VHashKV* kv) +void VHashKVBuilder::Recycle(VHashKV* kv) { if (kv == nullptr) return; - cleaner.DelayDelete(kv); + cleaner.DelayDelete(*this, kv); } -void 
VHashKVBuilder::PurgeKV(VHashKV* kv) +void VHashKVBuilder::Delete(void* obj) { + VHashKV* kv = static_cast(obj); kv->~VHashKV(); alloc.Deallocate(kv, sizeof(VHashKV) + kv->Key().size() + kv->Value().size()); } diff --git a/engine/experimental/vhash_kv.hpp b/engine/experimental/vhash_kv.hpp index 43707895..625777d8 100644 --- a/engine/experimental/vhash_kv.hpp +++ b/engine/experimental/vhash_kv.hpp @@ -5,6 +5,7 @@ #include "../alias.hpp" #include "../macros.hpp" #include "../allocator.hpp" +#include "../version/old_records_cleaner.hpp" namespace KVDK_NAMESPACE { @@ -44,25 +45,24 @@ class VHashKV } }; -class OldRecordsCleaner; - -class VHashKVBuilder +class VHashKVBuilder : public IDeleter { private: IVolatileAllocator& alloc; OldRecordsCleaner& cleaner; public: - VHashKVBuilder(IVolatileAllocator& a, OldRecordsCleaner& c) : alloc{a}, cleaner{c} {} + VHashKVBuilder(IVolatileAllocator& a, OldRecordsCleaner& c); VHashKVBuilder(VHashKVBuilder const&) = delete; VHashKVBuilder(VHashKVBuilder&&) = default; - // Called by VHash to create a KV. VHashKV* NewKV(StringView key, StringView value); - // Called by VHash to "delete" a KV. Delegate the deletion to cleaner. - void DeleteKV(VHashKV* kv); - // Called by cleaner to actually delete the KV and recycle its memory. - void PurgeKV(VHashKV* kv); + + // Recycle VHashKV to OldRecordsCleaner for later deletion. + void Recycle(VHashKV* kv); + + // Called by OldRecordsCleaner to delete KV. 
+ void Delete(void* kv) final; }; diff --git a/engine/kv_engine.hpp b/engine/kv_engine.hpp index e1819464..5a7f4fdc 100644 --- a/engine/kv_engine.hpp +++ b/engine/kv_engine.hpp @@ -22,7 +22,7 @@ #include "alias.hpp" #include "data_record.hpp" #include "dram_allocator.hpp" -#include "experimental/vhash.hpp" +#include "experimental/vhash_group.hpp" #include "hash_collection/hash_list.hpp" #include "hash_collection/rebuilder.hpp" #include "hash_table.hpp" @@ -620,9 +620,7 @@ class KVEngine : public Engine { Cleaner cleaner_; CharAllocator char_alloc_; - std::mutex vhashes_mu_; - VHashKVBuilder vhash_kvb_{char_alloc_, old_records_cleaner_}; - std::unordered_map> vhashes_; + VHashGroup vhashes_{char_alloc_, old_records_cleaner_}; ComparatorTable comparators_; diff --git a/engine/kv_engine_vhash.cpp b/engine/kv_engine_vhash.cpp index 5613b082..a1f075d7 100644 --- a/engine/kv_engine_vhash.cpp +++ b/engine/kv_engine_vhash.cpp @@ -1,38 +1,149 @@ #include "kv_engine.hpp" +#include "macros.hpp" namespace KVDK_NAMESPACE { -Status KVEngine::VHashCreate(StringView key) { -return Status::NotSupported; +Status KVEngine::VHashCreate(StringView key) +KVDK_TRY +{ + Status s = MaybeInitAccessThread(); + defer(ReleaseAccessThread()); + if (s != Status::Ok) return s; + if (!CheckKeySize(key)) return Status::InvalidDataSize; + + return vhashes_.Create(key); } +KVDK_HANDLE_EXCEPTIONS -Status KVEngine::VHashDestroy(StringView key) { -return Status::NotSupported; +Status KVEngine::VHashDestroy(StringView key) +KVDK_TRY +{ + Status s = MaybeInitAccessThread(); + defer(ReleaseAccessThread()); + if (s != Status::Ok) return s; + if (!CheckKeySize(key)) return Status::InvalidDataSize; + + return vhashes_.Destroy(key); } +KVDK_HANDLE_EXCEPTIONS -Status KVEngine::VHashSize(StringView key, size_t* len) { -return Status::NotSupported; +Status KVEngine::VHashSize(StringView key, size_t* len) +KVDK_TRY +{ + Status s = MaybeInitAccessThread(); + if (s != Status::Ok) return s; + if (!CheckKeySize(key)) 
return Status::InvalidDataSize; + + auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = nullptr; + s = vhashes_.Get(key, &vhash); + if (s != Status::Ok) return s; + + *len = vhash->Size(); + return Status::Ok; } +KVDK_HANDLE_EXCEPTIONS + +Status KVEngine::VHashGet(StringView key, StringView field, std::string* value) +KVDK_TRY +{ + Status s = MaybeInitAccessThread(); + if (s != Status::Ok) return s; + if (!CheckKeySize(key) || !CheckKeySize(field)) return Status::InvalidDataSize; + + auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = nullptr; + s = vhashes_.Get(key, &vhash); + if (s != Status::Ok) return s; + + auto CopyValue = [&](void* dst, StringView src) + { + static_cast(dst)->assign(src.data(), src.size()); + }; + return vhash->Get(field, CopyValue, value); -Status KVEngine::VHashGet(StringView key, StringView field, std::string* value) { return Status::NotSupported; } +KVDK_HANDLE_EXCEPTIONS -Status KVEngine::VHashPut(StringView key, StringView field, StringView value) { -return Status::NotSupported; +Status KVEngine::VHashPut(StringView key, StringView field, StringView value) +KVDK_TRY +{ + Status s = MaybeInitAccessThread(); + if (s != Status::Ok) return s; + if (!CheckKeySize(key) || !CheckKeySize(field) || !CheckValueSize(value)) + return Status::InvalidDataSize; + + auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = nullptr; + s = vhashes_.Get(key, &vhash); + if (s != Status::Ok) return s; + + return vhash->Put(field, value); } +KVDK_HANDLE_EXCEPTIONS -Status KVEngine::VHashDelete(StringView key, StringView field) { -return Status::NotSupported; +Status KVEngine::VHashDelete(StringView key, StringView field) +KVDK_TRY +{ + Status s = MaybeInitAccessThread(); + if (s != Status::Ok) return s; + if (!CheckKeySize(key) || !CheckKeySize(field)) return Status::InvalidDataSize; + + auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = nullptr; + s = 
vhashes_.Get(key, &vhash); + if (s != Status::Ok) return s; + + return vhash->Delete(field); } +KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashModify(StringView key, StringView field, ModifyFunc modify_func, - void* cb_args) { -return Status::NotSupported; + void* cb_args) +KVDK_TRY +{ + Status s = MaybeInitAccessThread(); + if (s != Status::Ok) return s; + if (!CheckKeySize(key) || !CheckKeySize(field)) return Status::InvalidDataSize; + + std::string old_value; + std::string new_value; + auto modify = [&](StringView const* old_val, StringView& new_val, void* args) + { + if (old_val != nullptr) old_value.assign(old_val->data(), old_val->size()); + ModifyOperation op = modify_func(old_val ? &old_value : nullptr, &new_value, args); + new_val = new_value; + return op; + }; + + auto cleanup = [&](StringView) { return; }; + + auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = nullptr; + s = vhashes_.Get(key, &vhash); + if (s != Status::Ok) return s; + + return vhash->Modify(field, modify, cb_args, cleanup); } +KVDK_HANDLE_EXCEPTIONS + +std::unique_ptrKVEngine::VHashIteratorCreate(StringView key, Status* s) +{ + Status sink; + s = (s != nullptr) ? s : &sink; + + *s = MaybeInitAccessThread(); + if (*s != Status::Ok) return nullptr; + if (!CheckKeySize(key)) return nullptr; + + /// TODO: iterator should hold an access token to keep VHash valid. 
+ auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = nullptr; + *s = vhashes_.Get(key, &vhash); + if (*s != Status::Ok) return nullptr; -std::unique_ptrKVEngine::VHashIteratorCreate(StringView key, Status* s) { -return nullptr; + return vhash->MakeIterator(); } } // namespace KVDK_NAMESPACE diff --git a/engine/version/old_records_cleaner.cpp b/engine/version/old_records_cleaner.cpp index 0ec7b32c..bd2c043e 100644 --- a/engine/version/old_records_cleaner.cpp +++ b/engine/version/old_records_cleaner.cpp @@ -43,41 +43,43 @@ void OldRecordsCleaner::PushToPendingFree(void* addr, TimeStampType ts) { } } -template -void OldRecordsCleaner::tryDelete(Deleter del, PendingQueue& pending_kvs, size_t lim) +void OldRecordsCleaner::RegisterDelayDeleter(IDeleter& deleter) { - maybeUpdateOldestSnapshot(); - TimeStampType acc_ts = kv_engine_->version_controller_.LocalOldestSnapshotTS(); - for (size_t i = 0; i < lim && !pending_kvs.empty(); i++) - { - auto const& pair = pending_kvs.front(); - if (pair.first >= acc_ts) break; - del(pair.second); - pending_kvs.pop_front(); - } + size_t idx = global_queues_.size(); + delay_deleters_[&deleter] = idx; + global_queues_.emplace_back(); + for (size_t i = 0; i < cleaner_thread_cache_.size(); i++) + cleaner_thread_cache_[i].local_queues_.emplace_back(); } -template -void OldRecordsCleaner::DelayDelete(KVType* kv) +void OldRecordsCleaner::DelayDelete(IDeleter& deleter, void* obj) { - static_assert(std::is_same::value, ""); - kvdk_assert(access_thread.id >= 0, ""); auto& tc = cleaner_thread_cache_[access_thread.id]; std::lock_guard guard{tc.old_records_lock}; + TimeStampType ts = kv_engine_->version_controller_.GetCurrentTimestamp(); - if (std::is_same::value) + + size_t idx = delay_deleters_.at(&deleter); + tc.local_queues_[idx].emplace_back(ts, obj); + + constexpr size_t kMaxFreePending = 16; + tryPurge(deleter, tc.local_queues_[idx], kMaxFreePending); +} + +void OldRecordsCleaner::tryPurge(IDeleter& deleter, 
PendingQueue& pending_kvs, size_t lim) +{ + maybeUpdateOldestSnapshot(); + TimeStampType acc_ts = kv_engine_->version_controller_.LocalOldestSnapshotTS(); + for (size_t i = 0; i < lim && !pending_kvs.empty(); i++) { - tc.pending_vhash_kvs.emplace_back(ts, kv); - tryDelete( - [&](VHashKV* outdated_kv){kv_engine_->vhash_kvb_.PurgeKV(outdated_kv);}, - tc.pending_vhash_kvs, - 16 - ); + auto const& pair = pending_kvs.front(); + if (pair.first >= acc_ts) break; + deleter.Delete(pair.second); + pending_kvs.pop_front(); } } -template void OldRecordsCleaner::DelayDelete(VHashKV*); void OldRecordsCleaner::TryGlobalClean() { std::vector space_to_free; @@ -104,11 +106,8 @@ void OldRecordsCleaner::TryGlobalClean() { } std::lock_guard lg(cleaner_thread_cache.old_records_lock); - tryDelete( - [&](VHashKV* outdated_kv){kv_engine_->vhash_kvb_.PurgeKV(outdated_kv);}, - cleaner_thread_cache.pending_vhash_kvs, - 16 - ); + for (auto const& del_idx : delay_deleters_) + tryPurge(*del_idx.first, cleaner_thread_cache.local_queues_[del_idx.second], -1U); } auto iter = global_pending_free_space_entries_.begin(); diff --git a/engine/version/old_records_cleaner.hpp b/engine/version/old_records_cleaner.hpp index 0ee05bfe..77b1a4ce 100644 --- a/engine/version/old_records_cleaner.hpp +++ b/engine/version/old_records_cleaner.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "../alias.hpp" #include "../collection.hpp" @@ -14,13 +15,19 @@ #include "../kv_engine_cleaner.hpp" #include "../thread_manager.hpp" #include "../utils/utils.hpp" -#include "../experimental/vhash_kv.hpp" #include "kvdk/configs.hpp" #include "version_controller.hpp" namespace KVDK_NAMESPACE { class KVEngine; +class IDeleter +{ +public: + // Called by OldRecordsCleaner for actual deletion. 
+ virtual void Delete(void* obj) = 0; +}; + // OldRecordsCleaner is used to clean old version PMem records of kvdk // // To support multi-version machenism and consistent backup of kvdk, @@ -35,34 +42,37 @@ class OldRecordsCleaner { void PushToPendingFree(void* addr, TimeStampType ts); - /// TODO: unify DelayDelete with static polymorphism - /// It cannot be done with dynamic polymorphism as sometimes - /// we cannot afford the memory usage by virtual function pointer - /// and if we use shared memory dynamic polymorphism is impossible. - template - void DelayDelete(KVType* kv); + // Warning: not thread safe. Must be called during kv_engine initialization. + void RegisterDelayDeleter(IDeleter& deleter); + + void DelayDelete(IDeleter& deleter, void* obj); + // Try to clean global old records void TryGlobalClean(); private: + using PendingQueue = std::deque>; + struct CleanerThreadCache { std::deque pending_free_space_entries{}; - std::deque> pending_vhash_kvs; + std::vector local_queues_; SpinMutex old_records_lock; }; const uint64_t kLimitCachedDeleteRecords = 1000000; void maybeUpdateOldestSnapshot(); - template - void tryDelete(Deleter del, PendingQueue& pending_kvs, size_t lim); + // Try purging some entries from a locked PendingQueue with associated deleter. 
+ void tryPurge(IDeleter& deleter, PendingQueue& pending_kvs, size_t lim); KVEngine* kv_engine_; Array cleaner_thread_cache_; std::deque global_pending_free_space_entries_; - std::deque> global_pending_vhash_kvs; + + std::unordered_map delay_deleters_; + std::vector global_queues_; }; } // namespace KVDK_NAMESPACE diff --git a/include/kvdk/iterator.hpp b/include/kvdk/iterator.hpp index ca71ba2f..3fd1c76b 100644 --- a/include/kvdk/iterator.hpp +++ b/include/kvdk/iterator.hpp @@ -85,7 +85,7 @@ class VHashIterator { virtual void Next() = 0; - virtual bool Valid() = 0; + virtual bool Valid() const = 0; virtual std::string Key() const = 0; From 00d188b6caad72eb558f127d47a77e15b29faaf7 Mon Sep 17 00:00:00 2001 From: ZiyanShi Date: Tue, 6 Sep 2022 03:03:04 -0400 Subject: [PATCH 03/23] refactor and format Signed-off-by: ZiyanShi --- engine/allocator.hpp | 18 +- engine/experimental/hashptr_map.hpp | 2377 +++++++++++------------- engine/experimental/vhash.cpp | 212 +-- engine/experimental/vhash.hpp | 146 +- engine/experimental/vhash_group.cpp | 47 +- engine/experimental/vhash_group.hpp | 34 +- engine/experimental/vhash_kv.cpp | 38 +- engine/experimental/vhash_kv.hpp | 89 +- engine/kv_engine.hpp | 5 +- engine/kv_engine_vhash.cpp | 210 ++- engine/macros.hpp | 26 +- engine/version/old_records_cleaner.cpp | 58 +- engine/version/old_records_cleaner.hpp | 14 +- include/kvdk/engine.hpp | 13 +- 14 files changed, 1563 insertions(+), 1724 deletions(-) diff --git a/engine/allocator.hpp b/engine/allocator.hpp index 1e0dcb47..15290aaa 100644 --- a/engine/allocator.hpp +++ b/engine/allocator.hpp @@ -30,18 +30,18 @@ class Allocator { }; class IVolatileAllocator { - public: - virtual void* Allocate(size_t bytes) = 0; - virtual void Deallocate(void* addr, size_t bytes) = 0; + public: + virtual void* Allocate(size_t bytes) = 0; + virtual void Deallocate(void* addr, size_t bytes) = 0; }; class CharAllocator final : public IVolatileAllocator { - void* Allocate(size_t n) final { - void* mem = 
::malloc(n); - if (mem == nullptr) throw std::bad_alloc{}; - return mem; - } - void Deallocate(void* addr, size_t) final { ::free(addr); } + void* Allocate(size_t n) final { + void* mem = ::malloc(n); + if (mem == nullptr) throw std::bad_alloc{}; + return mem; + } + void Deallocate(void* addr, size_t) final { ::free(addr); } }; } // namespace KVDK_NAMESPACE \ No newline at end of file diff --git a/engine/experimental/hashptr_map.hpp b/engine/experimental/hashptr_map.hpp index ab6a67de..80463306 100644 --- a/engine/experimental/hashptr_map.hpp +++ b/engine/experimental/hashptr_map.hpp @@ -1,11 +1,13 @@ #pragma once -#include -#include -#include +#include +#include #include #include +#include +#include +#include #include #include #include @@ -21,1373 +23,1258 @@ #include #include -#include -#include - #include "../macros.hpp" #define dynamic_assert kvdk_assert -namespace KVDK_NAMESPACE -{ +namespace KVDK_NAMESPACE { // Internal helpers -namespace -{ +namespace { // Templates to convert between a pointer and its compact representation // If we use a cache-backed pool allocator, or a allocator with reference count, // which defines it's own pointer class, we need these converters // to convert the pointer and std::uint64_t. 
-template::pointer> -struct cvt_helper : public std::enable_if< - std::is_convertible().to_pointer(std::declval())), Pointer>::value && - std::is_convertible().to_rep(std::declval())), std::uint64_t>::value, T>::type {}; - -template::pointer, typename = void> -Pointer to_pointer(std::uint64_t rep, Alloc const&) -{ - return reinterpret_cast(rep); +template ::pointer> +struct cvt_helper + : public std::enable_if< + std::is_convertible().to_pointer( + std::declval())), + Pointer>::value && + std::is_convertible().to_rep( + std::declval())), + std::uint64_t>::value, + T>::type {}; + +template ::pointer, + typename = void> +Pointer to_pointer(std::uint64_t rep, Alloc const&) { + return reinterpret_cast(rep); } -template::pointer, typename = void> -std::uint64_t to_rep(Pointer ptr, Alloc const&) -{ - return reinterpret_cast(ptr); +template ::pointer, + typename = void> +std::uint64_t to_rep(Pointer ptr, Alloc const&) { + return reinterpret_cast(ptr); } -// template::pointer> -// auto to_pointer(std::uint64_t rep, Alloc const& alloc) -> typename cvt_helper::type +// template::pointer> auto to_pointer(std::uint64_t rep, +// Alloc const& alloc) -> typename cvt_helper::type // { // return alloc.to_pointer(rep); // } -// template::pointer> -// auto to_rep(Pointer const& ptr, Alloc const& alloc) -> typename cvt_helper::type +// template::pointer> auto to_rep(Pointer const& ptr, Alloc +// const& alloc) -> typename cvt_helper::type // { // return alloc.to_rep(ptr); // } -std::uint64_t reverse_bits(std::uint64_t u64) -{ - u64 = (u64 & 0xFFFFFFFF00000000) >> 32 | (u64 & 0x00000000FFFFFFFF) << 32; - u64 = (u64 & 0xFFFF0000FFFF0000) >> 16 | (u64 & 0x0000FFFF0000FFFF) << 16; - u64 = (u64 & 0xFF00FF00FF00FF00) >> 8 | (u64 & 0x00FF00FF00FF00FF) << 8; - u64 = (u64 & 0xF0F0F0F0F0F0F0F0) >> 4 | (u64 & 0x0F0F0F0F0F0F0F0F) << 4; - u64 = (u64 & 0xCCCCCCCCCCCCCCCC) >> 2 | (u64 & 0x3333333333333333) << 2; - u64 = (u64 & 0xAAAAAAAAAAAAAAAA) >> 1 | (u64 & 0x5555555555555555) << 1; - return u64; 
+std::uint64_t reverse_bits(std::uint64_t u64) { + u64 = (u64 & 0xFFFFFFFF00000000) >> 32 | (u64 & 0x00000000FFFFFFFF) << 32; + u64 = (u64 & 0xFFFF0000FFFF0000) >> 16 | (u64 & 0x0000FFFF0000FFFF) << 16; + u64 = (u64 & 0xFF00FF00FF00FF00) >> 8 | (u64 & 0x00FF00FF00FF00FF) << 8; + u64 = (u64 & 0xF0F0F0F0F0F0F0F0) >> 4 | (u64 & 0x0F0F0F0F0F0F0F0F) << 4; + u64 = (u64 & 0xCCCCCCCCCCCCCCCC) >> 2 | (u64 & 0x3333333333333333) << 2; + u64 = (u64 & 0xAAAAAAAAAAAAAAAA) >> 1 | (u64 & 0x5555555555555555) << 1; + return u64; } -void mm_pause() -{ - constexpr size_t NPause = 32; - for (size_t i = 0; i < NPause; i++) - { - _mm_pause(); - } +void mm_pause() { + constexpr size_t NPause = 32; + for (size_t i = 0; i < NPause; i++) { + _mm_pause(); + } } template struct maybe_add_pointer - : public std::conditional::value, typename std::add_pointer::type, Func> -{ -}; + : public std::conditional::value, + typename std::add_pointer::type, Func> {}; -} // namespace +} // namespace // A hashptr_map consists of multiple buckets. // Each Bucket is a single linked list of node // Each node consists of 1 control_field and field_cnt - 1 storage_field. 
-template, typename KeyEqual = std::equal_to, size_t field_cnt = 8, size_t embed_cnt = 4, size_t max_scale = 32, bool PML5 = false, typename Allocator = std::allocator> -class hashptr_map -{ - private: - static_assert(field_cnt == 8 || field_cnt == 16, ""); - static_assert(((embed_cnt - 1) & embed_cnt) == 0, "embed_cnt must be power of 2"); - static_assert(max_scale <= 64, ""); - - using Hash = std::uint64_t; - static_assert(std::is_convertible()(std::declval())), Hash>::value, ""); - - using field_rep = std::uint64_t; - using control_rep = field_rep; - using storage_rep = field_rep; - using hash_rep = std::uint64_t; - using tag_type = typename std::conditional::type; - using mask_type = std::uint16_t; - - class node; - using node_alloc = typename std::allocator_traits::template rebind_alloc; - using node_alloc_traits = typename std::allocator_traits::template rebind_traits; - using node_pointer = typename node_alloc_traits::pointer; - - // constants for bit manipulation in nodes - // A node consists of multiple 64-bit fields, one control field and several storage fields. - // The control field holds the pointer to next node, and bits for lock and rehash. - // A storage field holds a user pointer to some data, a tag extracted from hash value and a bit for rehash. - static_assert(sizeof(field_rep) == sizeof(void*), "64-bit system required"); - static_assert(sizeof(Hash) == 8, "64-bit hash required"); - - // Actual bits used by a pointer in a 4-level or 5-level paging system - static constexpr size_t pointer_bits = PML5 ? 56 : 48; - - // Bits [63:56]/[63:48] are reused by a node to store meta information - static constexpr field_rep meta_mask = ((~0UL >> pointer_bits) << pointer_bits); - - // Bits [55:0]/[47:0] are untouched - static constexpr field_rep ptr_mask = ~meta_mask; - - // Bit 63 is used as a mark for rehash - static constexpr field_rep mark_bit = (0x1UL << 63); - - // For the storage unit, - // bits [62:56]/[62:48] are used to store a tag. 
- // The tag consists high 7/15 bits inside a hash value - static constexpr storage_rep tag_mask = (meta_mask & ~mark_bit); - - // For the control unit of a node, - // bits [62:56]/[62:48] are used as a read-write lock. - static constexpr control_rep lock_mask = (meta_mask & ~mark_bit); - static constexpr control_rep slock_val = (0x1UL << pointer_bits); - static constexpr control_rep elock_bit = (0x1UL << 62); - - class tagged_pointer - { - private: - storage_rep u64{0}; - - public: - explicit tagged_pointer(nullptr_t) {} - - explicit tagged_pointer(tag_type tag, Pointer p) : u64{reinterpret_cast(p)} - { - dynamic_assert(!(u64 & meta_mask), "Upper bits in Pointer not zeros!"); - u64 |= (static_cast(tag) << pointer_bits); - dynamic_assert(!(u64 & mark_bit), "Invalid tag!"); - } - - explicit tagged_pointer(storage_rep rep) : u64{rep} - { - u64 &= ~mark_bit; - } - - tagged_pointer(tagged_pointer const&) = default; - tagged_pointer(tagged_pointer&&) = default; - - tag_type tag() const - { - return static_cast((u64 & tag_mask) >> pointer_bits); - } - - Pointer pointer() const - { - return reinterpret_cast(u64 & ptr_mask); - } - - storage_rep rep() const - { - return u64; - } - }; - - class alignas(64) node - { - private: - std::atomic meta{}; - std::array, field_cnt - 1> data{}; - - public: - node() : node{false} {} - - explicit node(bool mark) - { - if (!mark) return; - meta_mark_flip(); - for (size_t i = 0; i < data.size(); i++) - entry_mark_flip(i); - } - - // Access Entrys by indexing - void set_entry(size_t index, tagged_pointer tp) - { - dynamic_assert((data.at(index).load() & ~mark_bit) == 0, "Entry already set!"); - - storage_rep rep = tp.rep(); - rep |= data.at(index).load() & mark_bit; - data.at(index).store(rep); - } - - void set_entry(size_t index, tag_type tag, Pointer p) - { - set_entry(index, tagged_pointer{tag, p}); - } - - void set_entry(size_t index, Pointer p) - { - dynamic_assert((data.at(index).load() & ptr_mask) != 0, "Entry not set yet!"); - 
storage_rep rep = tagged_pointer{0, p}.rep(); - rep |= data.at(index).load() & meta_mask; - data.at(index).store(rep); - } - - void erase_entry(size_t index) - { - dynamic_assert((data.at(index).load() & ptr_mask) != 0, "Entry not set yet!"); - storage_rep rep = tagged_pointer{nullptr}.rep(); - rep |= data.at(index).load() & mark_bit; - data.at(index).store(rep); - } - - tagged_pointer load_entry(size_t index) - { - return tagged_pointer{data.at(index).load()}; - } - - void entry_mark_flip(size_t index) - { - data.at(index).fetch_xor(mark_bit); - } - - bool entry_mark(size_t index) - { - return data.at(index).load() & mark_bit; - } - - // Every node can be locked, - // but only the first node in a bucket is locked - // for exclusive/shared access of the bucket - bool try_lock() - { - control_rep old = meta.load(); - if ((old & elock_bit) || !meta.compare_exchange_strong(old, old | elock_bit)) - return false; - return true; - } - - void lock() - { - while (!try_lock()) - { - mm_pause(); - } - } - - void unlock() - { - dynamic_assert((meta.load() & lock_mask) == elock_bit, "Unlock a lock not locked yet!"); - meta.fetch_xor(elock_bit); - } +template , + typename KeyEqual = std::equal_to, size_t field_cnt = 8, + size_t embed_cnt = 4, size_t max_scale = 32, bool PML5 = false, + typename Allocator = std::allocator> +class hashptr_map { + private: + static_assert(field_cnt == 8 || field_cnt == 16, ""); + static_assert(((embed_cnt - 1) & embed_cnt) == 0, + "embed_cnt must be power of 2"); + static_assert(max_scale <= 64, ""); + + using Hash = std::uint64_t; + static_assert(std::is_convertible()( + std::declval())), + Hash>::value, + ""); + + using field_rep = std::uint64_t; + using control_rep = field_rep; + using storage_rep = field_rep; + using hash_rep = std::uint64_t; + using tag_type = + typename std::conditional::type; + using mask_type = std::uint16_t; + + class node; + using node_alloc = + typename std::allocator_traits::template rebind_alloc; + using 
node_alloc_traits = + typename std::allocator_traits::template rebind_traits; + using node_pointer = typename node_alloc_traits::pointer; + + // constants for bit manipulation in nodes + // A node consists of multiple 64-bit fields, one control field and several + // storage fields. The control field holds the pointer to next node, and bits + // for lock and rehash. A storage field holds a user pointer to some data, a + // tag extracted from hash value and a bit for rehash. + static_assert(sizeof(field_rep) == sizeof(void*), "64-bit system required"); + static_assert(sizeof(Hash) == 8, "64-bit hash required"); + + // Actual bits used by a pointer in a 4-level or 5-level paging system + static constexpr size_t pointer_bits = PML5 ? 56 : 48; + + // Bits [63:56]/[63:48] are reused by a node to store meta information + static constexpr field_rep meta_mask = + ((~0UL >> pointer_bits) << pointer_bits); + + // Bits [55:0]/[47:0] are untouched + static constexpr field_rep ptr_mask = ~meta_mask; + + // Bit 63 is used as a mark for rehash + static constexpr field_rep mark_bit = (0x1UL << 63); + + // For the storage unit, + // bits [62:56]/[62:48] are used to store a tag. + // The tag consists high 7/15 bits inside a hash value + static constexpr storage_rep tag_mask = (meta_mask & ~mark_bit); + + // For the control unit of a node, + // bits [62:56]/[62:48] are used as a read-write lock. 
+ static constexpr control_rep lock_mask = (meta_mask & ~mark_bit); + static constexpr control_rep slock_val = (0x1UL << pointer_bits); + static constexpr control_rep elock_bit = (0x1UL << 62); + + class tagged_pointer { + private: + storage_rep u64{0}; + + public: + explicit tagged_pointer(nullptr_t) {} + + explicit tagged_pointer(tag_type tag, Pointer p) + : u64{reinterpret_cast(p)} { + dynamic_assert(!(u64 & meta_mask), "Upper bits in Pointer not zeros!"); + u64 |= (static_cast(tag) << pointer_bits); + dynamic_assert(!(u64 & mark_bit), "Invalid tag!"); + } - bool try_lock_shared() - { - field_rep old = meta.load(); - if ((old & elock_bit) || (old & lock_mask) + slock_val >= mark_bit || !meta.compare_exchange_strong(old, old + slock_val)) - { - return false; - } - return true; - } + explicit tagged_pointer(storage_rep rep) : u64{rep} { u64 &= ~mark_bit; } - void lock_shared() - { - while (!try_lock_shared()) - { - mm_pause(); - } - } + tagged_pointer(tagged_pointer const&) = default; + tagged_pointer(tagged_pointer&&) = default; - void unlock_shared() - { - dynamic_assert((meta.load() & lock_mask) != 0 && !(meta.load() & elock_bit), "Unlock a lock yet locked!"); - meta.fetch_sub(slock_val); - } + tag_type tag() const { + return static_cast((u64 & tag_mask) >> pointer_bits); + } - void meta_mark_flip() - { - meta.fetch_xor(mark_bit); - } + Pointer pointer() const { + return reinterpret_cast(u64 & ptr_mask); + } - bool meta_mark() - { - return meta.load() & mark_bit; - } + storage_rep rep() const { return u64; } + }; - node_pointer next_node(node_alloc const& alloc) const - { - control_rep u64 = meta.load(); - u64 &= ptr_mask; - return to_pointer(u64, alloc); - } + class alignas(64) node { + private: + std::atomic meta{}; + std::array, field_cnt - 1> data{}; - node_pointer append_new_node(node_alloc& alloc) - { - node_pointer p_new_node = node_alloc_traits::allocate(alloc, 1); - node_alloc_traits::construct(alloc, p_new_node, meta_mark()); - control_rep rep = 
to_rep(p_new_node, alloc); + public: + node() : node{false} {} - dynamic_assert(next_node(alloc) == nullptr, "Not last node!"); - dynamic_assert(!(rep & meta_mask), ""); - - meta.fetch_or(rep); - return p_new_node; - } + explicit node(bool mark) { + if (!mark) return; + meta_mark_flip(); + for (size_t i = 0; i < data.size(); i++) entry_mark_flip(i); + } - static void remove_appended_nodes(node_pointer node, node_alloc& alloc) - { - node_pointer next = node->next_node(alloc); - if (next == nullptr) return; + // Access Entrys by indexing + void set_entry(size_t index, tagged_pointer tp) { + dynamic_assert((data.at(index).load() & ~mark_bit) == 0, + "Entry already set!"); - remove_appended_nodes(next, alloc); - node->meta.fetch_and(meta_mask); - node_alloc_traits::destroy(alloc, next); - node_alloc_traits::deallocate(alloc, next, 1); - } - - // Unlink, destroy and deallocate next node if it is empty - static bool remove_empty_nodes(node_pointer node, node_alloc& alloc) - { - node_pointer next = node->next_node(alloc); + storage_rep rep = tp.rep(); + rep |= data.at(index).load() & mark_bit; + data.at(index).store(rep); + } - if (next == nullptr) return true; - if (!remove_empty_nodes(next, alloc)) return false; + void set_entry(size_t index, tag_type tag, Pointer p) { + set_entry(index, tagged_pointer{tag, p}); + } - for (size_t i = 0; i < field_cnt - 1; i++) - if (next->load_entry(i).pointer() != nullptr) - return false; + void set_entry(size_t index, Pointer p) { + dynamic_assert((data.at(index).load() & ptr_mask) != 0, + "Entry not set yet!"); + storage_rep rep = tagged_pointer{0, p}.rep(); + rep |= data.at(index).load() & meta_mask; + data.at(index).store(rep); + } - remove_appended_nodes(node, alloc); - return true; - } + void erase_entry(size_t index) { + dynamic_assert((data.at(index).load() & ptr_mask) != 0, + "Entry not set yet!"); + storage_rep rep = tagged_pointer{nullptr}.rep(); + rep |= data.at(index).load() & mark_bit; + data.at(index).store(rep); + } - 
template::type = true> - mask_type match_tag_impl(tag_type tag) const - { - static_assert(sizeof(node) / sizeof(__m512i) == 1 || sizeof(node) / sizeof(__m512i) == 2, ""); - union { - __m128i m128; - std::array mask{}; - } results{}; - - // 0b1000 0000 - constexpr std::uint64_t high_mask = 0x8080808080808080; - // 0b0100 0000 - constexpr std::uint64_t low_mask = 0x4040404040404040; - __m512i low_tag = _mm512_set1_epi8(static_cast(tag)); - __m512i high_tag = _mm512_set1_epi8(static_cast(tag >> 8)); - __m512i high_tag2 = _mm512_set1_epi8(static_cast(tag >> 8) | (mark_bit >> 56)); - - { - __m512i m512_0 = _mm512_load_si512(&meta); - results.mask[0] = (_mm512_mask_cmpeq_epi8_mask(low_mask, low_tag, m512_0) | - _mm512_mask_cmpeq_epi8_mask(high_mask, high_tag, m512_0) | - _mm512_mask_cmpeq_epi8_mask(high_mask, high_tag2, m512_0)); - } - if (sizeof(node) / sizeof(__m512i) == 2) - { - __m512i m512_1 = _mm512_load_si512(&data[sizeof(__m512i) / sizeof(Pointer) - 1]); - results.mask[1] = (_mm512_mask_cmpeq_epi8_mask(low_mask, low_tag, m512_1) | - _mm512_mask_cmpeq_epi8_mask(high_mask, high_tag, m512_1) | - _mm512_mask_cmpeq_epi8_mask(high_mask, high_tag2, m512_1)); - } - // 0b1100 0000 - constexpr char compress_mask = static_cast(0xC0); - mask_type compressed = _mm_cmpeq_epi8_mask(results.m128, _mm_set1_epi8(compress_mask)); - - return (compressed & match_nonempty()); - } + tagged_pointer load_entry(size_t index) { + return tagged_pointer{data.at(index).load()}; + } - template::type = true> - mask_type match_tag_impl(tag_type tag) const - { - /// TODO: test this on a machine with real five-level paging - static_assert(sizeof(node) / sizeof(__m512i) == 1 || sizeof(node) / sizeof(__m512i) == 2, ""); - union { - __m128i m128; - std::array mask{}; - } results{}; - - // 0b1000 0000 - constexpr std::uint64_t mask = 0x8080808080808080; - __m512i my_tag = _mm512_set1_epi8(static_cast(tag)); - __m512i my_tag2 = _mm512_set1_epi8(static_cast(tag) | mark_bit); - { - __m512i m512_0 = 
_mm512_load_si512(&meta); - results.mask[0] = _mm512_mask_cmpeq_epi8_mask(mask, my_tag, m512_0) | - _mm512_mask_cmpeq_epi8_mask(mask, my_tag2, m512_0); - } - if (sizeof(node) / sizeof(__m512i) == 2) - { - __m512i m512_1 = _mm512_load_si512(&data[sizeof(__m512i) / sizeof(Pointer) - 1]); - results.mask[1] = _mm512_mask_cmpeq_epi8_mask(mask, my_tag, m512_1) | - _mm512_mask_cmpeq_epi8_mask(mask, my_tag2, m512_1); - } - // 0b1000 0000 - constexpr char compress_mask = static_cast(0xB0); - mask_type compressed = _mm_cmpeq_epi8_mask(results.m128, _mm_set1_epi8(compress_mask)); - - return (compressed & match_nonempty()); - } + void entry_mark_flip(size_t index) { data.at(index).fetch_xor(mark_bit); } - mask_type match_tag(tag_type tag) const - { - return match_tag_impl(tag); - } + bool entry_mark(size_t index) { return data.at(index).load() & mark_bit; } - mask_type match_empty() const - { - static_assert(sizeof(node) / sizeof(__m512i) == 1 || sizeof(node) / sizeof(__m512i) == 2, ""); - std::array empty{}; - - __m512i m512_0 = _mm512_load_si512(&meta); - empty[0] = _mm512_cmpeq_epu64_mask(_mm512_set1_epi64(0L), m512_0) | - _mm512_cmpeq_epu64_mask(_mm512_set1_epi64(mark_bit), m512_0); - if (sizeof(node) / sizeof(__m512i) == 2) - { - __m512i m512_1 = _mm512_load_si512(&data[sizeof(__m512i) / sizeof(Pointer) - 1]); - empty[1] = _mm512_cmpeq_epu64_mask(_mm512_set1_epi64(0L), m512_1) | - _mm512_cmpeq_epu64_mask(_mm512_set1_epi64(mark_bit), m512_1); - } - - return ((empty[0] | (empty[1] << 8)) & 0xFFFE); - } + // Every node can be locked, + // but only the first node in a bucket is locked + // for exclusive/shared access of the bucket + bool try_lock() { + control_rep old = meta.load(); + if ((old & elock_bit) || + !meta.compare_exchange_strong(old, old | elock_bit)) + return false; + return true; + } - mask_type match_nonempty() const - { - return (~match_empty() & 0xFFFE); - } + void lock() { + while (!try_lock()) { + mm_pause(); + } + } - static size_t consume_mask(mask_type& 
match_result) - { - size_t tz = __tzcnt_u16(match_result); - match_result ^= (0x0001 << tz); - return tz - 1; - } + void unlock() { + dynamic_assert((meta.load() & lock_mask) == elock_bit, + "Unlock a lock not locked yet!"); + meta.fetch_xor(elock_bit); + } - // for debugging - friend std::ostream& operator<<(std::ostream& out, node const& node) - { - out << std::bitset<64>(node.meta.load()) << "\n"; - for (size_t i = 0; i < field_cnt - 1; i++) - { - out << std::bitset(node.data[i].tag()) << "\t" << node.data[i].pointer() << "\n"; - } - return out; - } + bool try_lock_shared() { + field_rep old = meta.load(); + if ((old & elock_bit) || (old & lock_mask) + slock_val >= mark_bit || + !meta.compare_exchange_strong(old, old + slock_val)) { + return false; + } + return true; + } - private: - /// TODO: use this helper function to improve readibility - static char match_epi8_in_epi64(__m512i m512, std::uint8_t u8, size_t bias) - { - dynamic_assert(bias <= 7, ""); - union - { - __m128i m128; - std::array u64; - } result; - long long mask = (0x0101010101010101 << bias); - result.u64[0] = _mm512_mask_cmpeq_epi8_mask(mask, m512, _mm512_set1_epi8(u8)); - return _mm_cmpgt_epi8_mask(result.m128, _mm_set1_epi8(0)); - } - }; - - static_assert(sizeof(node) == field_cnt * sizeof(void*), ""); - static_assert(sizeof(node) % 64 == 0, ""); - static_assert(alignof(node) % 64 == 0, ""); - - struct entry_pointer - { - node_pointer np{nullptr}; - size_t idx{field_cnt-1}; - - entry_pointer() = default; - entry_pointer(node_pointer p, size_t i) : np{p}, idx{i} {} - static entry_pointer next(entry_pointer ep, node_alloc const& a) - { - dynamic_assert(ep.np != nullptr && ep.idx < field_cnt - 1, ""); - ++ep.idx; - if (ep.idx < field_cnt - 1) return ep; - ep.np = ep.np->next_node(a); - ep.idx = (ep.np != nullptr) ? 
0 : field_cnt - 1; - return ep; - } - }; - - struct bucket_tuple - { - std::uint64_t old_svar; - size_t b_cnt; - node_pointer bucket1; - node_pointer bucket2; - node_pointer my_bucket; - - std::unique_lock guard1; - std::unique_lock guard2; - - bucket_tuple() = default; - bucket_tuple(std::uint64_t s, size_t cnt, node_pointer b1, node_pointer b2, node_pointer mb, std::unique_lock&& g1 = std::unique_lock{}, std::unique_lock&& g2 = std::unique_lock{}) : - old_svar{s}, - b_cnt{cnt}, - bucket1{b1}, - bucket2{b2}, - my_bucket{mb}, - guard1{std::move(g1)}, - guard2{std::move(g2)} {} - bucket_tuple(bucket_tuple const&) = delete; - bucket_tuple(bucket_tuple&&) = default; - bucket_tuple& operator=(bucket_tuple const&) = delete; - bucket_tuple& operator=(bucket_tuple&&) = default; - }; - - class accessor - { - private: - friend hashptr_map; - friend class iterator; - - // help end relocation - hashptr_map* map; - - // buckets that the key may fall in - bucket_tuple tuple; - - // Location of looked-up key - entry_pointer loc{}; - - public: - explicit accessor(hashptr_map& m) : map{&m} {} - accessor(accessor const&) = delete; - accessor& operator=(accessor const&) = delete; - accessor(accessor&&) = default; - accessor& operator=(accessor&&) = default; - ~accessor() - { - { - std::unique_lock g1{}; - std::unique_lock g2{}; - tuple.guard2.swap(g2); - tuple.guard1.swap(g1); - } - map->help_end_relocation(); - } + void lock_shared() { + while (!try_lock_shared()) { + mm_pause(); + } + } - Pointer pointer() const - { - return (loc.np == nullptr) ? 
nullptr : loc.np->load_entry(loc.idx).pointer(); - } + void unlock_shared() { + dynamic_assert( + (meta.load() & lock_mask) != 0 && !(meta.load() & elock_bit), + "Unlock a lock yet locked!"); + meta.fetch_sub(slock_val); + } - void set_pointer(Pointer p) - { - if (p == nullptr) - { - erase(); - } - else if (loc.np == nullptr) - { - // Since relocation may have been done by lookup(), - // we must guarantee Pointer p is inserted to the correct bucket. - Hash h = map->hasher(map->extract(p)); - loc = map->allocate_empty_entry(tuple.my_bucket); - loc.np->set_entry(loc.idx, map->make_tag(h), p); - } - else - { - Pointer old_p = loc.np->load_entry(loc.idx).pointer(); - dynamic_assert(map->equal(map->extract(p), map->extract(old_p)), ""); - loc.np->set_entry(loc.idx, p); - } - } + void meta_mark_flip() { meta.fetch_xor(mark_bit); } - void erase() - { - if (loc.np != nullptr) loc.np->erase_entry(loc.idx); - } - - private: - explicit accessor(hashptr_map& m, bucket_tuple&& t, entry_pointer l) : - map{&m}, - tuple{std::move(t)}, - loc{l} {} - }; - - public: - class lockless_t {}; - static constexpr lockless_t lockless{}; - class acquire_lock_t {}; - static constexpr acquire_lock_t acquire_lock{}; - - // We iterate through hashmap by - class iterator - { - public: - iterator() = delete; - iterator(iterator const&) = delete; - iterator& operator=(iterator const&) = delete; - iterator(iterator&&) = default; - iterator& operator=(iterator&&) = default; - - iterator& operator++() - { - acc.loc = entry_pointer::next(acc.loc, acc.map->alloc); - while (true) - { - acc.loc = skip_visited(acc.loc); - // Found unvisited entry - if (acc.loc.np != nullptr) return *this; - if (acc.tuple.my_bucket == acc.tuple.bucket1) - { - acc.tuple.my_bucket = acc.tuple.bucket2; - acc.loc = skip_visited(entry_pointer{acc.tuple.my_bucket, 0}); - if (acc.loc.np != nullptr) return *this; - } - // All entries visited, reset lock and goto next bucket - inc_hash(); - acc.tuple = bucket_tuple{}; - // 
inc_hash() overflow, end of iteration - if (hash == Hash{}) break; - - acc.tuple = acc.map->load_and_lock_buckets(hash); - acc.tuple.my_bucket = acc.tuple.bucket1; - acc.loc = entry_pointer{acc.tuple.my_bucket, 0}; - } - *this = iterator{*acc.map}; - return *this; - } + bool meta_mark() { return meta.load() & mark_bit; } - // No copy because iterator holds locks. - iterator operator++(int) = delete; + node_pointer next_node(node_alloc const& alloc) const { + control_rep u64 = meta.load(); + u64 &= ptr_mask; + return to_pointer(u64, alloc); + } - // Only two end iterators are equal - // Two iterators cannot be equal as they acquire locks on buckets - bool operator==(iterator const& rhs) const - { - return (acc.loc.np == nullptr) && (rhs.acc.loc.np == nullptr); - } + node_pointer append_new_node(node_alloc& alloc) { + node_pointer p_new_node = node_alloc_traits::allocate(alloc, 1); + node_alloc_traits::construct(alloc, p_new_node, meta_mark()); + control_rep rep = to_rep(p_new_node, alloc); - bool operator!=(iterator const& rhs) const - { - return !operator==(rhs); - } + dynamic_assert(next_node(alloc) == nullptr, "Not last node!"); + dynamic_assert(!(rep & meta_mask), ""); - accessor& operator*() - { - return acc; - } + meta.fetch_or(rep); + return p_new_node; + } - accessor const& operator*() const - { - return acc; - } + static void remove_appended_nodes(node_pointer node, node_alloc& alloc) { + node_pointer next = node->next_node(alloc); + if (next == nullptr) return; - accessor* operator->() - { - return &acc; - } + remove_appended_nodes(next, alloc); + node->meta.fetch_and(meta_mask); + node_alloc_traits::destroy(alloc, next); + node_alloc_traits::deallocate(alloc, next, 1); + } - accessor const* operator->() const - { - return &acc; - } + // Unlink, destroy and deallocate next node if it is empty + static bool remove_empty_nodes(node_pointer node, node_alloc& alloc) { + node_pointer next = node->next_node(alloc); - private: - friend hashptr_map; - - accessor 
acc; - Hash hash; - Hash rev_hash; - Hash rev_lim; - - // End iterator - iterator(hashptr_map& map) : acc{map}, hash{}, rev_hash{} {} - - iterator(hashptr_map& map, Hash h, Hash lim = -1UL) : acc{map}, hash{h}, rev_hash{reverse_bits(h)}, rev_lim{reverse_bits(lim)} - { - while (true) - { - acc.tuple = acc.map->load_and_lock_buckets(hash); - acc.tuple.my_bucket = acc.tuple.bucket1; - acc.loc = skip_visited(entry_pointer{acc.tuple.my_bucket, 0}); - // Found unvisited entry in bucket1 - if (acc.loc.np != nullptr) return; - acc.tuple.my_bucket = acc.tuple.bucket2; - acc.loc = skip_visited(entry_pointer{acc.tuple.my_bucket, 0}); - // Found unvisited entry in bucket2 - if (acc.loc.np != nullptr) return; - - // All entries visited, reset lock and goto next bucket - inc_hash(); - acc.tuple = bucket_tuple{}; - // inc_hash() overflow, end of iteration - if (hash == Hash{}) break; - } - *this = iterator{map}; - } + if (next == nullptr) return true; + if (!remove_empty_nodes(next, alloc)) return false; - // For 2^N buckets, low N bits determines the bucket - // To increment the bucket index, - // we need to increment the (N-1)th highest bit in reversed hash, - // and clear all lower bits. - // Must be called when iterator holds a valid accessor! - void inc_hash() - { - std::uint64_t v = reverse_bits(acc.tuple.b_cnt >> 1); - rev_hash &= ~(v - 1); - rev_hash += v; - hash = reverse_bits(rev_hash); - } + for (size_t i = 0; i < field_cnt - 1; i++) + if (next->load_entry(i).pointer() != nullptr) return false; - /// TODO: examine if we may miss some entries - bool visited(entry_pointer pos) - { - dynamic_assert(pos.np != nullptr, ""); - Pointer ptr = pos.np->load_entry(pos.idx).pointer(); - Hash h = acc.map->hasher(acc.map->extract(ptr)); - h = reverse_bits(h); - return (h < rev_hash) || (rev_lim < h); - } + remove_appended_nodes(node, alloc); + return true; + } - // Seek first non-empty entry in the bucket from pos. 
- entry_pointer skip_empty(entry_pointer pos) - { - if (pos.np == nullptr) return entry_pointer{}; - if (pos.np->load_entry(pos.idx).pointer() != nullptr) return pos; - pos = entry_pointer::next(pos, acc.map->alloc); - return skip_empty(pos); - } + template ::type = true> + mask_type match_tag_impl(tag_type tag) const { + static_assert(sizeof(node) / sizeof(__m512i) == 1 || + sizeof(node) / sizeof(__m512i) == 2, + ""); + union { + __m128i m128; + std::array mask{}; + } results{}; + + // 0b1000 0000 + constexpr std::uint64_t high_mask = 0x8080808080808080; + // 0b0100 0000 + constexpr std::uint64_t low_mask = 0x4040404040404040; + __m512i low_tag = _mm512_set1_epi8(static_cast(tag)); + __m512i high_tag = _mm512_set1_epi8(static_cast(tag >> 8)); + __m512i high_tag2 = + _mm512_set1_epi8(static_cast(tag >> 8) | (mark_bit >> 56)); + + { + __m512i m512_0 = _mm512_load_si512(&meta); + results.mask[0] = + (_mm512_mask_cmpeq_epi8_mask(low_mask, low_tag, m512_0) | + _mm512_mask_cmpeq_epi8_mask(high_mask, high_tag, m512_0) | + _mm512_mask_cmpeq_epi8_mask(high_mask, high_tag2, m512_0)); + } + if (sizeof(node) / sizeof(__m512i) == 2) { + __m512i m512_1 = + _mm512_load_si512(&data[sizeof(__m512i) / sizeof(Pointer) - 1]); + results.mask[1] = + (_mm512_mask_cmpeq_epi8_mask(low_mask, low_tag, m512_1) | + _mm512_mask_cmpeq_epi8_mask(high_mask, high_tag, m512_1) | + _mm512_mask_cmpeq_epi8_mask(high_mask, high_tag2, m512_1)); + } + // 0b1100 0000 + constexpr char compress_mask = static_cast(0xC0); + mask_type compressed = + _mm_cmpeq_epi8_mask(results.m128, _mm_set1_epi8(compress_mask)); + + return (compressed & match_nonempty()); + } - // Seek first non-empty entry not visited yet in the bucket from pos. 
- entry_pointer skip_visited(entry_pointer pos) - { - pos = skip_empty(pos); - if (pos.np == nullptr) return entry_pointer{}; - if (!visited(pos)) return pos; - return skip_visited(entry_pointer::next(pos, acc.map->alloc)); - } - }; - - hashptr_map(size_t buckets, KeyExtract const& ext, HashFunc const& hash = HashFunc{}, KeyEqual const& eq = KeyEqual{}, Allocator const& a = Allocator{}) : - alloc{a}, hasher{hash}, equal{eq}, extract{ext} - { - node_blocks[0] = &embedded_block[0]; - size_t b_cnt = bucket_count(); - while (b_cnt < buckets) - { - node_pointer new_block = node_alloc_traits::allocate(alloc, b_cnt); - for (size_t i = 0; i < b_cnt; i++) - node_alloc_traits::construct(alloc, new_block + i, false); - node_blocks[block_index(b_cnt)] = new_block; - active_blocks.fetch_add(active_blocks.load() + 1); - b_cnt = bucket_count(); - } + template ::type = true> + mask_type match_tag_impl(tag_type tag) const { + /// TODO: test this on a machine with real five-level paging + static_assert(sizeof(node) / sizeof(__m512i) == 1 || + sizeof(node) / sizeof(__m512i) == 2, + ""); + union { + __m128i m128; + std::array mask{}; + } results{}; + + // 0b1000 0000 + constexpr std::uint64_t mask = 0x8080808080808080; + __m512i my_tag = _mm512_set1_epi8(static_cast(tag)); + __m512i my_tag2 = _mm512_set1_epi8(static_cast(tag) | mark_bit); + { + __m512i m512_0 = _mm512_load_si512(&meta); + results.mask[0] = _mm512_mask_cmpeq_epi8_mask(mask, my_tag, m512_0) | + _mm512_mask_cmpeq_epi8_mask(mask, my_tag2, m512_0); + } + if (sizeof(node) / sizeof(__m512i) == 2) { + __m512i m512_1 = + _mm512_load_si512(&data[sizeof(__m512i) / sizeof(Pointer) - 1]); + results.mask[1] = _mm512_mask_cmpeq_epi8_mask(mask, my_tag, m512_1) | + _mm512_mask_cmpeq_epi8_mask(mask, my_tag2, m512_1); + } + // 0b1000 0000 + constexpr char compress_mask = static_cast(0xB0); + mask_type compressed = + _mm_cmpeq_epi8_mask(results.m128, _mm_set1_epi8(compress_mask)); + + return (compressed & match_nonempty()); } - 
~hashptr_map() - { - for (size_t i = 1; i < node_blocks.size(); i++) - { - node_pointer old_block = node_blocks[i]; - if (old_block == nullptr) continue; - size_t cnt = (embed_cnt << i) >> 1; - for (size_t i = 0; i < cnt; i++) - { - node_pointer np = old_block + i; - node::remove_appended_nodes(np, alloc); - node_alloc_traits::destroy(alloc, np); - } - node_alloc_traits::deallocate(alloc, old_block, cnt); - } - for (size_t i = 0; i < embedded_block.size(); i++) - node::remove_appended_nodes(&embedded_block.at(i), alloc); + mask_type match_tag(tag_type tag) const { + return match_tag_impl(tag); } - Pointer lookup(Key const& key, lockless_t) const - { - Hash h = hasher(key); - tag_type tag = make_tag(h); - while (true) - { - Pointer p = nullptr; - bucket_tuple tuple = load_buckets(h); - if (!is_relocating(tuple.old_svar)) - { - p = search_backward(key, tag, tuple.bucket1); - } - else - { - // Relocation from bucket1 to bucket2 - using std::swap; - if (is_shrk_proc(tuple.old_svar)) swap(tuple.bucket1, tuple.bucket2); - - // Look up the moved-from bucket first, then moved-to bucket. - p = search_backward(key, tag, tuple.bucket1); - p = (p != nullptr) ? p : search_backward(key, tag, tuple.bucket2); - } - - // svar not incremented, lookup result is valid, return. 
- if (tuple.old_svar == svar.load()) return p; - } + mask_type match_empty() const { + static_assert(sizeof(node) / sizeof(__m512i) == 1 || + sizeof(node) / sizeof(__m512i) == 2, + ""); + std::array empty{}; + + __m512i m512_0 = _mm512_load_si512(&meta); + empty[0] = _mm512_cmpeq_epu64_mask(_mm512_set1_epi64(0L), m512_0) | + _mm512_cmpeq_epu64_mask(_mm512_set1_epi64(mark_bit), m512_0); + if (sizeof(node) / sizeof(__m512i) == 2) { + __m512i m512_1 = + _mm512_load_si512(&data[sizeof(__m512i) / sizeof(Pointer) - 1]); + empty[1] = _mm512_cmpeq_epu64_mask(_mm512_set1_epi64(0L), m512_1) | + _mm512_cmpeq_epu64_mask(_mm512_set1_epi64(mark_bit), m512_1); + } + + return ((empty[0] | (empty[1] << 8)) & 0xFFFE); } - // deref_lim should be larger than 1. - // 1 is merely enough to maintain load factor at current level - // if we keep inserting and try_double_capacity(). - accessor lookup(Key const& key, acquire_lock_t, size_t deref_lim = 4) - { - Hash h = hasher(key); - bucket_tuple tuple = load_and_lock_buckets(h); - - // Help relocation - if (is_relocating(tuple.old_svar)) - { - if (tuple.bucket1->meta_mark() != mark_of(tuple.old_svar)) - deref_lim -= is_exp_proc(tuple.old_svar) - ? relocate_bucket(tuple.bucket2, tuple.bucket1, deref_lim) - : relocate_bucket(tuple.bucket1, tuple.bucket2, deref_lim); - relocate_global(deref_lim, tuple.old_svar); - } + mask_type match_nonempty() const { return (~match_empty() & 0xFFFE); } - // Actual lookup, done after relocation to prevent loc from - // being invalidated. 
- tag_type tag = make_tag(h); - entry_pointer loc{}; - if (!is_relocating(tuple.old_svar)) - { - loc = search_forward(key, tag, tuple.bucket1); - } - else - { - loc = search_forward(key, tag, tuple.bucket1); - if (loc.np == nullptr) - loc = search_forward(key, tag, tuple.bucket2); - } - return accessor{*this, std::move(tuple), loc}; + static size_t consume_mask(mask_type& match_result) { + size_t tz = __tzcnt_u16(match_result); + match_result ^= (0x0001 << tz); + return tz - 1; } - iterator begin() - { - return iterator{*this, 0}; + // for debugging + friend std::ostream& operator<<(std::ostream& out, node const& node) { + out << std::bitset<64>(node.meta.load()) << "\n"; + for (size_t i = 0; i < field_cnt - 1; i++) { + out << std::bitset(node.data[i].tag()) << "\t" + << node.data[i].pointer() << "\n"; + } + return out; } - iterator end() - { - return iterator{*this}; + private: + /// TODO: use this helper function to improve readibility + static char match_epi8_in_epi64(__m512i m512, std::uint8_t u8, + size_t bias) { + dynamic_assert(bias <= 7, ""); + union { + __m128i m128; + std::array u64; + } result; + long long mask = (0x0101010101010101 << bias); + result.u64[0] = + _mm512_mask_cmpeq_epi8_mask(mask, m512, _mm512_set1_epi8(u8)); + return _mm_cmpgt_epi8_mask(result.m128, _mm_set1_epi8(0)); } - - // Estimate load factor by checking embedded nodes - double approx_load_factor() const - { - size_t cnt = 0; - for (size_t i = 0; i < embed_cnt; i++) - cnt += entry_count(const_cast(&embedded_block[i])); - return static_cast(cnt) / ((field_cnt - 1) * embed_cnt); + }; + + static_assert(sizeof(node) == field_cnt * sizeof(void*), ""); + static_assert(sizeof(node) % 64 == 0, ""); + static_assert(alignof(node) % 64 == 0, ""); + + struct entry_pointer { + node_pointer np{nullptr}; + size_t idx{field_cnt - 1}; + + entry_pointer() = default; + entry_pointer(node_pointer p, size_t i) : np{p}, idx{i} {} + static entry_pointer next(entry_pointer ep, node_alloc const& a) { + 
dynamic_assert(ep.np != nullptr && ep.idx < field_cnt - 1, ""); + ++ep.idx; + if (ep.idx < field_cnt - 1) return ep; + ep.np = ep.np->next_node(a); + ep.idx = (ep.np != nullptr) ? 0 : field_cnt - 1; + return ep; } - - // Triggers rehash by write threads. - // Return false if rehash is under progress. - bool try_double_capacity() - { - std::uint64_t old_svar = svar.load(); - if (old_svar % 8 != 0 || !svar.compare_exchange_strong(old_svar, old_svar + 1)) - return false; - - old_svar = svar.load(); - size_t cnt = bucket_count(); - node_pointer new_block = node_alloc_traits::allocate(alloc, cnt); - for (size_t i = 0; i < cnt; i++) - { - // Newly allocated buckets do not need relocation. - // For convenience, the new bucket has same mark as its conjugated bucket, - // the mark is cleared after scan - node_alloc_traits::construct(alloc, new_block + i, !mark_of(old_svar)); - } - // activate new block - node_blocks[block_index(cnt)] = new_block; - dynamic_assert(svar.load() % 4 == 1, ""); - pvar.store(0UL); - svar.fetch_add(1UL); - return true; + }; + + struct bucket_tuple { + std::uint64_t old_svar; + size_t b_cnt; + node_pointer bucket1; + node_pointer bucket2; + node_pointer my_bucket; + + std::unique_lock guard1; + std::unique_lock guard2; + + bucket_tuple() = default; + bucket_tuple(std::uint64_t s, size_t cnt, node_pointer b1, node_pointer b2, + node_pointer mb, + std::unique_lock&& g1 = std::unique_lock{}, + std::unique_lock&& g2 = std::unique_lock{}) + : old_svar{s}, + b_cnt{cnt}, + bucket1{b1}, + bucket2{b2}, + my_bucket{mb}, + guard1{std::move(g1)}, + guard2{std::move(g2)} {} + bucket_tuple(bucket_tuple const&) = delete; + bucket_tuple(bucket_tuple&&) = default; + bucket_tuple& operator=(bucket_tuple const&) = delete; + bucket_tuple& operator=(bucket_tuple&&) = default; + }; + + class accessor { + private: + friend hashptr_map; + friend class iterator; + + // help end relocation + hashptr_map* map; + + // buckets that the key may fall in + bucket_tuple tuple; + + 
// Location of looked-up key + entry_pointer loc{}; + + public: + explicit accessor(hashptr_map& m) : map{&m} {} + accessor(accessor const&) = delete; + accessor& operator=(accessor const&) = delete; + accessor(accessor&&) = default; + accessor& operator=(accessor&&) = default; + ~accessor() { + { + std::unique_lock g1{}; + std::unique_lock g2{}; + tuple.guard2.swap(g2); + tuple.guard1.swap(g1); + } + map->help_end_relocation(); } - // Triggers rehash by write threads. - // Return false if rehash is under progress. - bool try_halve_capacity() - { - std::uint64_t old_svar = svar.load(); - if (old_svar % 8 != 0 || !svar.compare_exchange_strong(old_svar, old_svar + 5)) - { - return false; - } - if (active_blocks.load() == 0x0001UL) - { - // Cannot shrink embedded block - svar.fetch_add(3U); - return false; - } - // The last block is logically deactivated. - // A bucket pair forms a logical bucket internally. - active_blocks.fetch_sub((active_blocks.load() + 1) >> 1); - dynamic_assert(svar.load() % 4 == 1, ""); - pvar.store(0UL); - svar.fetch_add(1); - return true; + Pointer pointer() const { + return (loc.np == nullptr) ? nullptr + : loc.np->load_entry(loc.idx).pointer(); } - size_t help_relocate() - { - size_t old_svar = svar.load(); - if (!is_relocating(old_svar)) return 0; - - size_t n = relocate_global(-1UL, svar.load()); - help_end_relocation(); - return n; + void set_pointer(Pointer p) { + if (p == nullptr) { + erase(); + } else if (loc.np == nullptr) { + // Since relocation may have been done by lookup(), + // we must guarantee Pointer p is inserted to the correct bucket. 
+ Hash h = map->hasher(map->extract(p)); + loc = map->allocate_empty_entry(tuple.my_bucket); + loc.np->set_entry(loc.idx, map->make_tag(h), p); + } else { + Pointer old_p = loc.np->load_entry(loc.idx).pointer(); + dynamic_assert(map->equal(map->extract(p), map->extract(old_p)), ""); + loc.np->set_entry(loc.idx, p); + } } - // Logical count of buckets - // During relocation, a pair of conjugated buckets are considered as - // a logical bucket. - // After shrinkage, the higher half of buckets are removed. - // After expansion, the pair is seperated. - size_t bucket_count() const - { - std::uint64_t active = active_blocks.load(); - return embed_cnt * (active + 1) / 2; + void erase() { + if (loc.np != nullptr) loc.np->erase_entry(loc.idx); } - private: - static tag_type make_tag(Hash h) - { - return (h >> (pointer_bits + 1)); + private: + explicit accessor(hashptr_map& m, bucket_tuple&& t, entry_pointer l) + : map{&m}, tuple{std::move(t)}, loc{l} {} + }; + + public: + class lockless_t {}; + static constexpr lockless_t lockless{}; + class acquire_lock_t {}; + static constexpr acquire_lock_t acquire_lock{}; + + // We iterate through hashmap by + class iterator { + public: + iterator() = delete; + iterator(iterator const&) = delete; + iterator& operator=(iterator const&) = delete; + iterator(iterator&&) = default; + iterator& operator=(iterator&&) = default; + + iterator& operator++() { + acc.loc = entry_pointer::next(acc.loc, acc.map->alloc); + while (true) { + acc.loc = skip_visited(acc.loc); + // Found unvisited entry + if (acc.loc.np != nullptr) return *this; + if (acc.tuple.my_bucket == acc.tuple.bucket1) { + acc.tuple.my_bucket = acc.tuple.bucket2; + acc.loc = skip_visited(entry_pointer{acc.tuple.my_bucket, 0}); + if (acc.loc.np != nullptr) return *this; + } + // All entries visited, reset lock and goto next bucket + inc_hash(); + acc.tuple = bucket_tuple{}; + // inc_hash() overflow, end of iteration + if (hash == Hash{}) break; + + acc.tuple = 
acc.map->load_and_lock_buckets(hash); + acc.tuple.my_bucket = acc.tuple.bucket1; + acc.loc = entry_pointer{acc.tuple.my_bucket, 0}; + } + *this = iterator{*acc.map}; + return *this; } - enum class stage : std::uint64_t - { - // No rehashing - stable = 0, - - // Initializing expansion, allocate space - exp_init = 1, - // Processing expansion, relocating entries - exp_proc = 2, - // Finalizing expansion - exp_fin = 3, - - // Block rehashing - blocking = 4, - - // Initializing shrinkage - shrk_init = 5, - // Processing shrinkage, relocating entries - shrk_proc = 6, - // Finalizing shrinkage, modify node_block pointers - shrk_fin = 7 - }; - // Initialization and finalizing stage are ignored by accessors. - // Accessors help relocation during processing stage. - - static stage stage_of(std::uint64_t old_svar) - { - return static_cast(old_svar % 8); - } + // No copy because iterator holds locks. + iterator operator++(int) = delete; - // Mark of svar is flipped each time - // try_halve_capacity() or try_double_capacity() is called. - // Relocation will check buckets and flip their marks accordingly. 
- static bool mark_of(std::uint64_t old_svar) - { - return ((old_svar + 7) / 8) % 2 == 1; + // Only two end iterators are equal + // Two iterators cannot be equal as they acquire locks on buckets + bool operator==(iterator const& rhs) const { + return (acc.loc.np == nullptr) && (rhs.acc.loc.np == nullptr); } - static bool is_relocating(std::uint64_t old_svar) - { - return (is_exp_proc(old_svar) || is_shrk_proc(old_svar)); + bool operator!=(iterator const& rhs) const { return !operator==(rhs); } + + accessor& operator*() { return acc; } + + accessor const& operator*() const { return acc; } + + accessor* operator->() { return &acc; } + + accessor const* operator->() const { return &acc; } + + private: + friend hashptr_map; + + accessor acc; + Hash hash; + Hash rev_hash; + Hash rev_lim; + + // End iterator + iterator(hashptr_map& map) : acc{map}, hash{}, rev_hash{} {} + + iterator(hashptr_map& map, Hash h, Hash lim = -1UL) + : acc{map}, + hash{h}, + rev_hash{reverse_bits(h)}, + rev_lim{reverse_bits(lim)} { + while (true) { + acc.tuple = acc.map->load_and_lock_buckets(hash); + acc.tuple.my_bucket = acc.tuple.bucket1; + acc.loc = skip_visited(entry_pointer{acc.tuple.my_bucket, 0}); + // Found unvisited entry in bucket1 + if (acc.loc.np != nullptr) return; + acc.tuple.my_bucket = acc.tuple.bucket2; + acc.loc = skip_visited(entry_pointer{acc.tuple.my_bucket, 0}); + // Found unvisited entry in bucket2 + if (acc.loc.np != nullptr) return; + + // All entries visited, reset lock and goto next bucket + inc_hash(); + acc.tuple = bucket_tuple{}; + // inc_hash() overflow, end of iteration + if (hash == Hash{}) break; + } + *this = iterator{map}; } - static bool is_exp_proc(std::uint64_t old_svar) - { - return (stage_of(old_svar) == stage::exp_proc); + // For 2^N buckets, low N bits determines the bucket + // To increment the bucket index, + // we need to increment the (N-1)th highest bit in reversed hash, + // and clear all lower bits. 
+ // Must be called when iterator holds a valid accessor! + void inc_hash() { + std::uint64_t v = reverse_bits(acc.tuple.b_cnt >> 1); + rev_hash &= ~(v - 1); + rev_hash += v; + hash = reverse_bits(rev_hash); } - static bool is_shrk_proc(std::uint64_t old_svar) - { - return (stage_of(old_svar) == stage::shrk_proc); + /// TODO: examine if we may miss some entries + bool visited(entry_pointer pos) { + dynamic_assert(pos.np != nullptr, ""); + Pointer ptr = pos.np->load_entry(pos.idx).pointer(); + Hash h = acc.map->hasher(acc.map->extract(ptr)); + h = reverse_bits(h); + return (h < rev_hash) || (rev_lim < h); } - static size_t block_index(size_t bucket_index) - { - return 64 - __lzcnt64(bucket_index / embed_cnt); + // Seek first non-empty entry in the bucket from pos. + entry_pointer skip_empty(entry_pointer pos) { + if (pos.np == nullptr) return entry_pointer{}; + if (pos.np->load_entry(pos.idx).pointer() != nullptr) return pos; + pos = entry_pointer::next(pos, acc.map->alloc); + return skip_empty(pos); } - static size_t block_offset(size_t bucket_index) - { - return bucket_index - ((1UL << block_index(bucket_index)) >> 1) * embed_cnt; + // Seek first non-empty entry not visited yet in the bucket from pos. 
+ entry_pointer skip_visited(entry_pointer pos) { + pos = skip_empty(pos); + if (pos.np == nullptr) return entry_pointer{}; + if (!visited(pos)) return pos; + return skip_visited(entry_pointer::next(pos, acc.map->alloc)); } - - node_pointer locate_bucket(size_t idx) const - { - size_t block_idx = block_index(idx); - size_t block_off = block_offset(idx); - node_pointer base = node_blocks[block_idx]; - return base + block_off; + }; + + hashptr_map(size_t buckets, KeyExtract const& ext, + HashFunc const& hash = HashFunc{}, + KeyEqual const& eq = KeyEqual{}, Allocator const& a = Allocator{}) + : alloc{a}, hasher{hash}, equal{eq}, extract{ext} { + node_blocks[0] = &embedded_block[0]; + size_t b_cnt = bucket_count(); + while (b_cnt < buckets) { + node_pointer new_block = node_alloc_traits::allocate(alloc, b_cnt); + for (size_t i = 0; i < b_cnt; i++) + node_alloc_traits::construct(alloc, new_block + i, false); + node_blocks[block_index(b_cnt)] = new_block; + active_blocks.fetch_add(active_blocks.load() + 1); + b_cnt = bucket_count(); } - - // Load a consistent triplet of - bucket_tuple load_buckets(Hash h) const - { - while (true) - { - std::uint64_t old_svar = svar.load(); - size_t b_cnt = bucket_count(); - node_pointer b1 = locate_bucket(h % b_cnt); - node_pointer b2 = locate_bucket(h % b_cnt + b_cnt); - - /// TODO: Investigate this carefully!!!! - node_pointer b = is_exp_proc(old_svar) ? locate_bucket(h % (b_cnt * 2)) : b1; - // Initializing stages and finalizing stages are dangerous zones! - /// TODO: Investigate extremely carefully and optimize. 
- if (old_svar != svar.load() || old_svar % 2 != 0) - { - mm_pause(); - continue; - } - if (!is_relocating(old_svar)) - return bucket_tuple{old_svar, b_cnt, b1, nullptr, b}; - else - return bucket_tuple{old_svar, b_cnt, b1, b2, b}; - } + } + + ~hashptr_map() { + for (size_t i = 1; i < node_blocks.size(); i++) { + node_pointer old_block = node_blocks[i]; + if (old_block == nullptr) continue; + size_t cnt = (embed_cnt << i) >> 1; + for (size_t i = 0; i < cnt; i++) { + node_pointer np = old_block + i; + node::remove_appended_nodes(np, alloc); + node_alloc_traits::destroy(alloc, np); + } + node_alloc_traits::deallocate(alloc, old_block, cnt); } - - bucket_tuple load_and_lock_buckets(Hash h) - { - while (true) - { - std::uint64_t old_svar = svar.load(); - size_t b_cnt = bucket_count(); - node_pointer b1 = locate_bucket(h % b_cnt); - node_pointer b2 = locate_bucket(h % b_cnt + b_cnt); - - node_pointer mb = is_exp_proc(old_svar) ? locate_bucket(h % (b_cnt * 2)) : b1; - if (!is_relocating(old_svar)) - { - /// TODO: mm_pause()? 
- std::unique_lock g1{*b1, std::try_to_lock}; - if (!g1.owns_lock()) continue; - if (old_svar != svar.load() || old_svar % 2 != 0) continue; - - return bucket_tuple{old_svar, b_cnt, b1, nullptr, mb, std::move(g1)}; - } - else - { - std::unique_lock g1{*b1, std::try_to_lock}; - if (!g1.owns_lock()) continue; - std::unique_lock g2{*b2, std::try_to_lock}; - if (!g2.owns_lock()) continue; - if (old_svar != svar.load() || old_svar % 2 != 0) continue; - - return bucket_tuple{old_svar, b_cnt, b1, b2, mb, std::move(g1), std::move(g2)}; - } - } + for (size_t i = 0; i < embedded_block.size(); i++) + node::remove_appended_nodes(&embedded_block.at(i), alloc); + } + + Pointer lookup(Key const& key, lockless_t) const { + Hash h = hasher(key); + tag_type tag = make_tag(h); + while (true) { + Pointer p = nullptr; + bucket_tuple tuple = load_buckets(h); + if (!is_relocating(tuple.old_svar)) { + p = search_backward(key, tag, tuple.bucket1); + } else { + // Relocation from bucket1 to bucket2 + using std::swap; + if (is_shrk_proc(tuple.old_svar)) swap(tuple.bucket1, tuple.bucket2); + + // Look up the moved-from bucket first, then moved-to bucket. + p = search_backward(key, tag, tuple.bucket1); + p = (p != nullptr) ? p : search_backward(key, tag, tuple.bucket2); + } + + // svar not incremented, lookup result is valid, return. + if (tuple.old_svar == svar.load()) return p; } - - entry_pointer allocate_empty_entry(node_pointer np) - { - dynamic_assert(np != nullptr, ""); - for (size_t i = 0; i < field_cnt - 1; i++) - if (np->load_entry(i).pointer() == nullptr) - return entry_pointer{np, i}; - np = (np->next_node(alloc) == nullptr) - ? np->append_new_node(alloc) - : np->next_node(alloc); - return allocate_empty_entry(np); + } + + // deref_lim should be larger than 1. + // 1 is merely enough to maintain load factor at current level + // if we keep inserting and try_double_capacity(). 
+ accessor lookup(Key const& key, acquire_lock_t, size_t deref_lim = 4) { + Hash h = hasher(key); + bucket_tuple tuple = load_and_lock_buckets(h); + + // Help relocation + if (is_relocating(tuple.old_svar)) { + if (tuple.bucket1->meta_mark() != mark_of(tuple.old_svar)) + deref_lim -= + is_exp_proc(tuple.old_svar) + ? relocate_bucket(tuple.bucket2, tuple.bucket1, deref_lim) + : relocate_bucket(tuple.bucket1, tuple.bucket2, deref_lim); + relocate_global(deref_lim, tuple.old_svar); } - size_t entry_count(node_pointer node) const - { - if (node == nullptr) return 0; - size_t cnt = 0; - for (size_t i = 0; i < field_cnt - 1; i++) - if (node->load_entry(i).pointer() != nullptr) - ++cnt; - return cnt + entry_count(node->next_node(alloc)); + // Actual lookup, done after relocation to prevent loc from + // being invalidated. + tag_type tag = make_tag(h); + entry_pointer loc{}; + if (!is_relocating(tuple.old_svar)) { + loc = search_forward(key, tag, tuple.bucket1); + } else { + loc = search_forward(key, tag, tuple.bucket1); + if (loc.np == nullptr) loc = search_forward(key, tag, tuple.bucket2); } - - void print(node_pointer node) - { - std::cout << std::endl; - if (node == nullptr) return; - for (size_t i = 0; i < field_cnt - 1; i++) - { - Pointer p = node->load_entry(i).pointer(); - if (p == nullptr) std::cout << "nullptr\t"; - else std::cout << extract(p) << "\t"; - } - print(node->next_node(alloc)); + return accessor{*this, std::move(tuple), loc}; + } + + iterator begin() { return iterator{*this, 0}; } + + iterator end() { return iterator{*this}; } + + // Estimate load factor by checking embedded nodes + double approx_load_factor() const { + size_t cnt = 0; + for (size_t i = 0; i < embed_cnt; i++) + cnt += entry_count(const_cast(&embedded_block[i])); + return static_cast(cnt) / ((field_cnt - 1) * embed_cnt); + } + + // Triggers rehash by write threads. + // Return false if rehash is under progress. 
+ bool try_double_capacity() { + std::uint64_t old_svar = svar.load(); + if (old_svar % 8 != 0 || + !svar.compare_exchange_strong(old_svar, old_svar + 1)) + return false; + + old_svar = svar.load(); + size_t cnt = bucket_count(); + node_pointer new_block = node_alloc_traits::allocate(alloc, cnt); + for (size_t i = 0; i < cnt; i++) { + // Newly allocated buckets do not need relocation. + // For convenience, the new bucket has same mark as its conjugated bucket, + // the mark is cleared after scan + node_alloc_traits::construct(alloc, new_block + i, !mark_of(old_svar)); } - - static void relocate(node_pointer dst_node, size_t dst_idx, node_pointer src_node, size_t src_idx) - { - tagged_pointer tp = src_node->load_entry(src_idx); - dynamic_assert(tp.pointer() != nullptr, ""); - // Copy then erase, so that lockless readers will not miss the entry - dst_node->set_entry(dst_idx, tp); - src_node->erase_entry(src_idx); + // activate new block + node_blocks[block_index(cnt)] = new_block; + dynamic_assert(svar.load() % 4 == 1, ""); + pvar.store(0UL); + svar.fetch_add(1UL); + return true; + } + + // Triggers rehash by write threads. + // Return false if rehash is under progress. 
+ bool try_halve_capacity() { + std::uint64_t old_svar = svar.load(); + if (old_svar % 8 != 0 || + !svar.compare_exchange_strong(old_svar, old_svar + 5)) { + return false; } - - void compress(node_pointer bkt) - { - std::deque empties; - node_pointer np = bkt; - while (np != nullptr) - { - for (size_t i = 0; i < field_cnt - 1; i++) - { - tagged_pointer tp = np->load_entry(i); - if (tp.pointer() == nullptr) - { - // Empty entry, reserved for compaction - empties.emplace_back(np, i); - continue; - } - - if (empties.empty()) continue; - - // Relocate - auto dst = empties.front(); - empties.pop_front(); - relocate(dst.np, dst.idx, np, i); - empties.emplace_back(np, i); - } - np = np->next_node(alloc); - } - node::remove_empty_nodes(bkt, alloc); + if (active_blocks.load() == 0x0001UL) { + // Cannot shrink embedded block + svar.fetch_add(3U); + return false; + } + // The last block is logically deactivated. + // A bucket pair forms a logical bucket internally. + active_blocks.fetch_sub((active_blocks.load() + 1) >> 1); + dynamic_assert(svar.load() % 4 == 1, ""); + pvar.store(0UL); + svar.fetch_add(1); + return true; + } + + size_t help_relocate() { + size_t old_svar = svar.load(); + if (!is_relocating(old_svar)) return 0; + + size_t n = relocate_global(-1UL, svar.load()); + help_end_relocation(); + return n; + } + + // Logical count of buckets + // During relocation, a pair of conjugated buckets are considered as + // a logical bucket. + // After shrinkage, the higher half of buckets are removed. + // After expansion, the pair is seperated. 
+ size_t bucket_count() const { + std::uint64_t active = active_blocks.load(); + return embed_cnt * (active + 1) / 2; + } + + private: + static tag_type make_tag(Hash h) { return (h >> (pointer_bits + 1)); } + + enum class stage : std::uint64_t { + // No rehashing + stable = 0, + + // Initializing expansion, allocate space + exp_init = 1, + // Processing expansion, relocating entries + exp_proc = 2, + // Finalizing expansion + exp_fin = 3, + + // Block rehashing + blocking = 4, + + // Initializing shrinkage + shrk_init = 5, + // Processing shrinkage, relocating entries + shrk_proc = 6, + // Finalizing shrinkage, modify node_block pointers + shrk_fin = 7 + }; + // Initialization and finalizing stage are ignored by accessors. + // Accessors help relocation during processing stage. + + static stage stage_of(std::uint64_t old_svar) { + return static_cast(old_svar % 8); + } + + // Mark of svar is flipped each time + // try_halve_capacity() or try_double_capacity() is called. + // Relocation will check buckets and flip their marks accordingly. 
+ static bool mark_of(std::uint64_t old_svar) { + return ((old_svar + 7) / 8) % 2 == 1; + } + + static bool is_relocating(std::uint64_t old_svar) { + return (is_exp_proc(old_svar) || is_shrk_proc(old_svar)); + } + + static bool is_exp_proc(std::uint64_t old_svar) { + return (stage_of(old_svar) == stage::exp_proc); + } + + static bool is_shrk_proc(std::uint64_t old_svar) { + return (stage_of(old_svar) == stage::shrk_proc); + } + + static size_t block_index(size_t bucket_index) { + return 64 - __lzcnt64(bucket_index / embed_cnt); + } + + static size_t block_offset(size_t bucket_index) { + return bucket_index - ((1UL << block_index(bucket_index)) >> 1) * embed_cnt; + } + + node_pointer locate_bucket(size_t idx) const { + size_t block_idx = block_index(idx); + size_t block_off = block_offset(idx); + node_pointer base = node_blocks[block_idx]; + return base + block_off; + } + + // Load a consistent triplet of + bucket_tuple load_buckets(Hash h) const { + while (true) { + std::uint64_t old_svar = svar.load(); + size_t b_cnt = bucket_count(); + node_pointer b1 = locate_bucket(h % b_cnt); + node_pointer b2 = locate_bucket(h % b_cnt + b_cnt); + + /// TODO: Investigate this carefully!!!! + node_pointer b = + is_exp_proc(old_svar) ? locate_bucket(h % (b_cnt * 2)) : b1; + // Initializing stages and finalizing stages are dangerous zones! + /// TODO: Investigate extremely carefully and optimize. + if (old_svar != svar.load() || old_svar % 2 != 0) { + mm_pause(); + continue; + } + if (!is_relocating(old_svar)) + return bucket_tuple{old_svar, b_cnt, b1, nullptr, b}; + else + return bucket_tuple{old_svar, b_cnt, b1, b2, b}; + } + } + + bucket_tuple load_and_lock_buckets(Hash h) { + while (true) { + std::uint64_t old_svar = svar.load(); + size_t b_cnt = bucket_count(); + node_pointer b1 = locate_bucket(h % b_cnt); + node_pointer b2 = locate_bucket(h % b_cnt + b_cnt); + + node_pointer mb = + is_exp_proc(old_svar) ? 
locate_bucket(h % (b_cnt * 2)) : b1; + if (!is_relocating(old_svar)) { + /// TODO: mm_pause()? + std::unique_lock g1{*b1, std::try_to_lock}; + if (!g1.owns_lock()) continue; + if (old_svar != svar.load() || old_svar % 2 != 0) continue; + + return bucket_tuple{old_svar, b_cnt, b1, nullptr, mb, std::move(g1)}; + } else { + std::unique_lock g1{*b1, std::try_to_lock}; + if (!g1.owns_lock()) continue; + std::unique_lock g2{*b2, std::try_to_lock}; + if (!g2.owns_lock()) continue; + if (old_svar != svar.load() || old_svar % 2 != 0) continue; + + return bucket_tuple{old_svar, b_cnt, b1, b2, + mb, std::move(g1), std::move(g2)}; + } + } + } + + entry_pointer allocate_empty_entry(node_pointer np) { + dynamic_assert(np != nullptr, ""); + for (size_t i = 0; i < field_cnt - 1; i++) + if (np->load_entry(i).pointer() == nullptr) return entry_pointer{np, i}; + np = (np->next_node(alloc) == nullptr) ? np->append_new_node(alloc) + : np->next_node(alloc); + return allocate_empty_entry(np); + } + + size_t entry_count(node_pointer node) const { + if (node == nullptr) return 0; + size_t cnt = 0; + for (size_t i = 0; i < field_cnt - 1; i++) + if (node->load_entry(i).pointer() != nullptr) ++cnt; + return cnt + entry_count(node->next_node(alloc)); + } + + void print(node_pointer node) { + std::cout << std::endl; + if (node == nullptr) return; + for (size_t i = 0; i < field_cnt - 1; i++) { + Pointer p = node->load_entry(i).pointer(); + if (p == nullptr) + std::cout << "nullptr\t"; + else + std::cout << extract(p) << "\t"; + } + print(node->next_node(alloc)); + } + + static void relocate(node_pointer dst_node, size_t dst_idx, + node_pointer src_node, size_t src_idx) { + tagged_pointer tp = src_node->load_entry(src_idx); + dynamic_assert(tp.pointer() != nullptr, ""); + // Copy then erase, so that lockless readers will not miss the entry + dst_node->set_entry(dst_idx, tp); + src_node->erase_entry(src_idx); + } + + void compress(node_pointer bkt) { + std::deque empties; + node_pointer np = bkt; 
+ while (np != nullptr) { + for (size_t i = 0; i < field_cnt - 1; i++) { + tagged_pointer tp = np->load_entry(i); + if (tp.pointer() == nullptr) { + // Empty entry, reserved for compaction + empties.emplace_back(np, i); + continue; + } + + if (empties.empty()) continue; + + // Relocate + auto dst = empties.front(); + empties.pop_front(); + relocate(dst.np, dst.idx, np, i); + empties.emplace_back(np, i); + } + np = np->next_node(alloc); } + node::remove_empty_nodes(bkt, alloc); + } - // Recursively flip marks in a bucket that won't need relocation - // During shrinkage, lower half of buckets does not need relocation, - // but their mark has to be flipped. - void flip_mark(node_pointer node) - { - if (node == nullptr) return; + // Recursively flip marks in a bucket that won't need relocation + // During shrinkage, lower half of buckets does not need relocation, + // but their mark has to be flipped. + void flip_mark(node_pointer node) { + if (node == nullptr) return; - flip_mark(node->next_node(alloc)); - - for (size_t i = 0; i < field_cnt - 1; i++) - node->entry_mark_flip(i); + flip_mark(node->next_node(alloc)); - node->meta_mark_flip(); - return; - } + for (size_t i = 0; i < field_cnt - 1; i++) node->entry_mark_flip(i); - // Scan and relocate entries in sub-chain led by src_node in src_bucket - // It's guaranteed that the mark of dst_bucket and src_bucket are flipped together. 
- size_t relocate_helper(node_pointer dst_bucket, node_pointer src_bucket, node_pointer src_node, size_t deref_lim) - { - if (src_node == nullptr) return 0; + node->meta_mark_flip(); + return; + } - std::uint64_t old_svar = svar.load(); - dynamic_assert(is_relocating(old_svar), ""); - bool mark = mark_of(old_svar); - dynamic_assert(src_bucket->meta_mark() != mark, ""); - dynamic_assert(dst_bucket->meta_mark() != mark, ""); - - // The node already scanned and relocated - if (src_node->meta_mark() == mark) return 0; - - size_t deref_cnt = relocate_helper(dst_bucket, src_bucket, src_node->next_node(alloc), deref_lim); - deref_lim -= deref_cnt; - dynamic_assert(old_svar == svar.load(), ""); - if (deref_lim == 0) return deref_cnt; - - size_t b_cnt = bucket_count(); - - for (size_t i = 0; i < field_cnt - 1; i++) - { - if (deref_lim == 0) return deref_cnt; - - // Already relocated, skip - if (src_node->entry_mark(i) == mark) continue; - - src_node->entry_mark_flip(i); - tagged_pointer tp = src_node->load_entry(i); - - // Empty entry, skip - if (tp.pointer() == nullptr) continue; - - // extract() is the potential bottleneck - --deref_lim; - ++deref_cnt; - Hash h = hasher(extract(tp.pointer())); - - // No need to relocate, skip - if (h % (b_cnt * 2) < b_cnt) continue; - - // Relocation necessary, find an empty entry - entry_pointer dst = allocate_empty_entry(dst_bucket); - relocate(dst.np, dst.idx, src_node, i); - - dynamic_assert(old_svar == svar.load(), ""); - } - // Whole node scanned - // When src_node is the leading node, - // flip the mark means the whole bucket is relocated. - src_node->meta_mark_flip(); - - // Relocation is done - if (src_bucket == src_node) - { - compress(src_bucket); - compress(dst_bucket); - flip_mark(dst_bucket); - } + // Scan and relocate entries in sub-chain led by src_node in src_bucket + // It's guaranteed that the mark of dst_bucket and src_bucket are flipped + // together. 
+ size_t relocate_helper(node_pointer dst_bucket, node_pointer src_bucket, + node_pointer src_node, size_t deref_lim) { + if (src_node == nullptr) return 0; - dynamic_assert(old_svar == svar.load(), ""); - return deref_cnt; - } + std::uint64_t old_svar = svar.load(); + dynamic_assert(is_relocating(old_svar), ""); + bool mark = mark_of(old_svar); + dynamic_assert(src_bucket->meta_mark() != mark, ""); + dynamic_assert(dst_bucket->meta_mark() != mark, ""); - size_t relocate_bucket(node_pointer dst_bucket, node_pointer src_bucket, size_t deref_lim) - { - return relocate_helper(dst_bucket, src_bucket, src_bucket, deref_lim); - } + // The node already scanned and relocated + if (src_node->meta_mark() == mark) return 0; - void help_end_relocation() - { - std::uint64_t old_svar = svar.load(); - if (!is_relocating(old_svar)) return; - - // Not done yet. - size_t b_cnt = bucket_count(); - if (pvar.load() != b_cnt) return; - - // Only one thread is allowed to continue - if (!pvar.compare_exchange_strong(b_cnt, -1UL)) return; - if (is_exp_proc(old_svar)) - { - // End of expansion - svar.fetch_add(1); - active_blocks.fetch_add(active_blocks.load() + 1); - svar.fetch_add(5); - } - else - { - dynamic_assert(is_shrk_proc(old_svar), ""); - // End of shrinkage - svar.fetch_add(1UL); - node_pointer old_block = node_blocks[block_index(b_cnt)]; - node_blocks[block_index(b_cnt)] = nullptr; - svar.fetch_add(1UL); - std::this_thread::sleep_for(std::chrono::milliseconds{1000}); - for (size_t i = 0; i < b_cnt; i++) - { - node_pointer np = old_block + i; - node::remove_appended_nodes(np, alloc); - node_alloc_traits::destroy(alloc, np); - } - node_alloc_traits::deallocate(alloc, old_block, b_cnt); - } - } - - // When bucket associated with accessor is relocated, - // accessor helps relocate entries in other buckets. - // Since locks are acquired inside, we have to pass old_svar from outside - /// TODO: can we load old_svar inside? 
- size_t relocate_global(size_t deref_lim, std::uint64_t old_svar) - { - // Logical bucket count - size_t b_cnt = bucket_count(); - size_t deref_cnt = 0; - - for (size_t idx = pvar.load(); idx < b_cnt && deref_lim != 0; idx++) - { - // Loading a new bucket pair also counts as a dereference - deref_lim--; - bucket_tuple tuple = load_buckets(idx); - if (old_svar != tuple.old_svar) return deref_cnt; - std::unique_lock guard1{*tuple.bucket1, std::try_to_lock}; - if (!guard1.owns_lock()) - { - idx += __rdtsc() % (b_cnt - idx); - continue; - } - std::unique_lock guard2{*tuple.bucket2, std::try_to_lock}; - if (!guard2.owns_lock()) continue; - - if (tuple.bucket1->meta_mark() != mark_of(old_svar)) - { - // Both buckets locked, - // meta_mark indicates necessity for relocation - // svar not incremented - dynamic_assert(is_relocating(old_svar), ""); - size_t cnt = is_exp_proc(old_svar) - ? relocate_bucket(tuple.bucket2, tuple.bucket1, deref_lim) - : relocate_bucket(tuple.bucket1, tuple.bucket2, deref_lim); - deref_lim -= cnt; - deref_cnt += cnt; - } - - // Relocated bucket pair, refresh relocation progress - // pvar is protected by lock on bucket indexed by idx1 - if (tuple.bucket1->meta_mark() == mark_of(old_svar)) - if (idx == pvar.load()) - pvar.fetch_add(1U); - } - return deref_cnt; - } + size_t deref_cnt = relocate_helper(dst_bucket, src_bucket, + src_node->next_node(alloc), deref_lim); + deref_lim -= deref_cnt; + dynamic_assert(old_svar == svar.load(), ""); + if (deref_lim == 0) return deref_cnt; - // Recursively traverse a bucket backwards. - // Since compaction only moves entry from back of a bucket to front, - // we avoid missing an entry during rehash. - Pointer search_backward(Key const& key, tag_type tag, node_pointer np) const - { - if (np == nullptr) return nullptr; - - Pointer p = search_backward(key, tag, np->next_node(alloc)); - if (p != nullptr) return p; - - // Next node does not exist or does not contain entry of interest. - // Lookup key in current node. 
- for (size_t i = 0; i < field_cnt - 1; i++) - { - size_t idx = field_cnt - 2 - i; - tagged_pointer tp = np->load_entry(idx); - - if (tp.pointer() == nullptr || tp.tag() != tag) continue; - if (!equal(extract(tp.pointer()), key)) continue; - return tp.pointer(); - } - return nullptr; - } + size_t b_cnt = bucket_count(); + + for (size_t i = 0; i < field_cnt - 1; i++) { + if (deref_lim == 0) return deref_cnt; - // Traverse a bucket in normal order. - entry_pointer search_forward(Key const& key, tag_type tag, node_pointer node) - { - if (node == nullptr) return entry_pointer{}; + // Already relocated, skip + if (src_node->entry_mark(i) == mark) continue; - mask_type match = node->match_tag(tag); - for (size_t idx = node::consume_mask(match); idx < field_cnt - 1; idx = node::consume_mask(match)) - if (equal(extract(node->load_entry(idx).pointer()), key)) - return entry_pointer{node, idx}; + src_node->entry_mark_flip(i); + tagged_pointer tp = src_node->load_entry(i); - return search_forward(key, tag, node->next_node(alloc)); + // Empty entry, skip + if (tp.pointer() == nullptr) continue; + + // extract() is the potential bottleneck + --deref_lim; + ++deref_cnt; + Hash h = hasher(extract(tp.pointer())); + + // No need to relocate, skip + if (h % (b_cnt * 2) < b_cnt) continue; + + // Relocation necessary, find an empty entry + entry_pointer dst = allocate_empty_entry(dst_bucket); + relocate(dst.np, dst.idx, src_node, i); + + dynamic_assert(old_svar == svar.load(), ""); + } + // Whole node scanned + // When src_node is the leading node, + // flip the mark means the whole bucket is relocated. + src_node->meta_mark_flip(); + + // Relocation is done + if (src_bucket == src_node) { + compress(src_bucket); + compress(dst_bucket); + flip_mark(dst_bucket); } - private: - node_alloc alloc; - // Nodes are organized into blocks. - // First embed_cnt nodes are put in embedded_block, - // and when rehashed, new nodes are allocated into node_blocks. 
- // allocated_nodes[0] points to embedded_block and - // allocated_nodes[k] contains embed_cnt * 2^(k-1) nodes - std::array embedded_block{}; - std::array node_blocks{}; - // Initially only embedded_block is active - std::atomic_uint64_t active_blocks{1U}; - - // state variable indicating the map shrinking or expanding - std::atomic_uint64_t svar{0UL}; - // progress variable indicating relocation progress - // 0 for start, -1 for finish - std::atomic pvar{-1UL}; - - typename maybe_add_pointer::type hasher; - typename maybe_add_pointer::type equal; - typename maybe_add_pointer::type extract; + dynamic_assert(old_svar == svar.load(), ""); + return deref_cnt; + } + + size_t relocate_bucket(node_pointer dst_bucket, node_pointer src_bucket, + size_t deref_lim) { + return relocate_helper(dst_bucket, src_bucket, src_bucket, deref_lim); + } + + void help_end_relocation() { + std::uint64_t old_svar = svar.load(); + if (!is_relocating(old_svar)) return; + + // Not done yet. + size_t b_cnt = bucket_count(); + if (pvar.load() != b_cnt) return; + + // Only one thread is allowed to continue + if (!pvar.compare_exchange_strong(b_cnt, -1UL)) return; + if (is_exp_proc(old_svar)) { + // End of expansion + svar.fetch_add(1); + active_blocks.fetch_add(active_blocks.load() + 1); + svar.fetch_add(5); + } else { + dynamic_assert(is_shrk_proc(old_svar), ""); + // End of shrinkage + svar.fetch_add(1UL); + node_pointer old_block = node_blocks[block_index(b_cnt)]; + node_blocks[block_index(b_cnt)] = nullptr; + svar.fetch_add(1UL); + std::this_thread::sleep_for(std::chrono::milliseconds{1000}); + for (size_t i = 0; i < b_cnt; i++) { + node_pointer np = old_block + i; + node::remove_appended_nodes(np, alloc); + node_alloc_traits::destroy(alloc, np); + } + node_alloc_traits::deallocate(alloc, old_block, b_cnt); + } + } + + // When bucket associated with accessor is relocated, + // accessor helps relocate entries in other buckets. 
+ // Since locks are acquired inside, we have to pass old_svar from outside + /// TODO: can we load old_svar inside? + size_t relocate_global(size_t deref_lim, std::uint64_t old_svar) { + // Logical bucket count + size_t b_cnt = bucket_count(); + size_t deref_cnt = 0; + + for (size_t idx = pvar.load(); idx < b_cnt && deref_lim != 0; idx++) { + // Loading a new bucket pair also counts as a dereference + deref_lim--; + bucket_tuple tuple = load_buckets(idx); + if (old_svar != tuple.old_svar) return deref_cnt; + std::unique_lock guard1{*tuple.bucket1, std::try_to_lock}; + if (!guard1.owns_lock()) { + idx += __rdtsc() % (b_cnt - idx); + continue; + } + std::unique_lock guard2{*tuple.bucket2, std::try_to_lock}; + if (!guard2.owns_lock()) continue; + + if (tuple.bucket1->meta_mark() != mark_of(old_svar)) { + // Both buckets locked, + // meta_mark indicates necessity for relocation + // svar not incremented + dynamic_assert(is_relocating(old_svar), ""); + size_t cnt = + is_exp_proc(old_svar) + ? relocate_bucket(tuple.bucket2, tuple.bucket1, deref_lim) + : relocate_bucket(tuple.bucket1, tuple.bucket2, deref_lim); + deref_lim -= cnt; + deref_cnt += cnt; + } + + // Relocated bucket pair, refresh relocation progress + // pvar is protected by lock on bucket indexed by idx1 + if (tuple.bucket1->meta_mark() == mark_of(old_svar)) + if (idx == pvar.load()) pvar.fetch_add(1U); + } + return deref_cnt; + } + + // Recursively traverse a bucket backwards. + // Since compaction only moves entry from back of a bucket to front, + // we avoid missing an entry during rehash. + Pointer search_backward(Key const& key, tag_type tag, node_pointer np) const { + if (np == nullptr) return nullptr; + + Pointer p = search_backward(key, tag, np->next_node(alloc)); + if (p != nullptr) return p; + + // Next node does not exist or does not contain entry of interest. + // Lookup key in current node. 
+ for (size_t i = 0; i < field_cnt - 1; i++) { + size_t idx = field_cnt - 2 - i; + tagged_pointer tp = np->load_entry(idx); + + if (tp.pointer() == nullptr || tp.tag() != tag) continue; + if (!equal(extract(tp.pointer()), key)) continue; + return tp.pointer(); + } + return nullptr; + } + + // Traverse a bucket in normal order. + entry_pointer search_forward(Key const& key, tag_type tag, + node_pointer node) { + if (node == nullptr) return entry_pointer{}; + + mask_type match = node->match_tag(tag); + for (size_t idx = node::consume_mask(match); idx < field_cnt - 1; + idx = node::consume_mask(match)) + if (equal(extract(node->load_entry(idx).pointer()), key)) + return entry_pointer{node, idx}; + + return search_forward(key, tag, node->next_node(alloc)); + } + + private: + node_alloc alloc; + // Nodes are organized into blocks. + // First embed_cnt nodes are put in embedded_block, + // and when rehashed, new nodes are allocated into node_blocks. + // allocated_nodes[0] points to embedded_block and + // allocated_nodes[k] contains embed_cnt * 2^(k-1) nodes + std::array embedded_block{}; + std::array node_blocks{}; + // Initially only embedded_block is active + std::atomic_uint64_t active_blocks{1U}; + + // state variable indicating the map shrinking or expanding + std::atomic_uint64_t svar{0UL}; + // progress variable indicating relocation progress + // 0 for start, -1 for finish + std::atomic pvar{-1UL}; + + typename maybe_add_pointer::type hasher; + typename maybe_add_pointer::type equal; + typename maybe_add_pointer::type extract; }; -} // KVDK_NAMESPACE +} // namespace KVDK_NAMESPACE diff --git a/engine/experimental/vhash.cpp b/engine/experimental/vhash.cpp index 40623f37..7644c054 100644 --- a/engine/experimental/vhash.cpp +++ b/engine/experimental/vhash.cpp @@ -1,151 +1,129 @@ #include "vhash.hpp" -#include "../alias.hpp" -#include "../macros.hpp" -#include "../version/old_records_cleaner.hpp" - -namespace KVDK_NAMESPACE -{ - -Status VHash::Get(StringView key, 
VHash::CopyFunc copy, void* dst) -{ - VHashKV* kvp = hpmap.lookup(key, hpmap.lockless); - if (kvp == nullptr) return Status::NotFound; - copy(dst, kvp->Value()); - return Status::Ok; + +namespace KVDK_NAMESPACE { + +bool VHash::Get(StringView key, StringView& value) { + VHashKV* kvp = hpmap.lookup(key, hpmap.lockless); + if (kvp == nullptr) return false; + value = kvp->Value(); + return true; } -Status VHash::Put(StringView key, StringView value) -{ - VHashKV* new_kv = kvb.NewKV(key, value); - VHashKV* old_kv = nullptr; - { - auto acc = hpmap.lookup(key, hpmap.acquire_lock); - old_kv = acc.pointer(); - acc.set_pointer(new_kv); - } - if (old_kv == nullptr) sz.fetch_add(1LL); - kvb.Recycle(old_kv); - return Status::Ok; +void VHash::Put(StringView key, StringView value) { + VHashKV* new_kv = kvb.NewKV(key, value); + VHashKV* old_kv = nullptr; + { + auto acc = hpmap.lookup(key, hpmap.acquire_lock); + old_kv = acc.pointer(); + acc.set_pointer(new_kv); + } + if (old_kv == nullptr) sz.fetch_add(1LL); + kvb.Recycle(old_kv); } -Status VHash::Delete(StringView key) -{ - VHashKV* old_kv = nullptr; - { - auto acc = hpmap.lookup(key, hpmap.acquire_lock); - old_kv = acc.pointer(); - acc.erase(); - } - if (old_kv != nullptr) sz.fetch_sub(1LL); - kvb.Recycle(old_kv); - return Status::Ok; +void VHash::Delete(StringView key) { + VHashKV* old_kv = nullptr; + { + auto acc = hpmap.lookup(key, hpmap.acquire_lock); + old_kv = acc.pointer(); + acc.erase(); + } + if (old_kv != nullptr) sz.fetch_sub(1LL); + kvb.Recycle(old_kv); } -Status VHash::deleteAll() -{ - for(auto iter = hpmap.begin(); iter != hpmap.end(); ++iter) - { - VHashKV* old_kv = iter->pointer(); - iter->erase(); - // It's safe to call Delete() instead of Recycle() here, - // As deleteAll() is called by cleaner. 
- kvb.Delete(old_kv); - sz.fetch_sub(1LL); - } - kvdk_assert(sz.load() == 0LL, ""); - return Status::Ok; +void VHash::deleteAll() { + kvdk_assert(ref_cnt.load() == 0UL, "Iterator outlives VHash!"); + for (auto iter = hpmap.begin(); iter != hpmap.end(); ++iter) { + VHashKV* old_kv = iter->pointer(); + iter->erase(); + // It's safe to call Delete() instead of Recycle() here, + // As deleteAll() is called by cleaner. + kvb.Delete(old_kv); + sz.fetch_sub(1LL); + } + kvdk_assert(sz.load() == 0LL, ""); } -Status VHash::Modify(StringView key, VHash::ModifyFunc modify, void* cb_args, VHash::Cleanup cleanup) -{ - VHashKV* old_kv = nullptr; - ModifyOperation op; - { - auto acc = hpmap.lookup(key, hpmap.acquire_lock); - old_kv = acc.pointer(); - StringView new_value; - StringView old_value; - old_value = old_kv ? old_kv->Value() : old_value; - op = modify(old_kv ? &old_value : nullptr, new_value, cb_args); - switch (op) - { - case ModifyOperation::Write: - { - VHashKV* new_kv = kvb.NewKV(key, new_value); - acc.set_pointer(new_kv); - if (old_kv == nullptr) sz.fetch_add(1LL); - kvb.Recycle(old_kv); - break; - } - case ModifyOperation::Delete: - { - kvdk_assert(old_kv != nullptr, "Invalid callback!"); - acc.erase(); - sz.fetch_sub(1LL); - kvb.Recycle(old_kv); - break; - } - case ModifyOperation::Noop: - case ModifyOperation::Abort: - { - break; - } - } - cleanup(new_value); +bool VHash::Modify(StringView key, VHash::ModifyFunc modify, void* cb_args, + VHash::Cleanup cleanup) { + VHashKV* old_kv = nullptr; + ModifyOperation op; + { + auto acc = hpmap.lookup(key, hpmap.acquire_lock); + old_kv = acc.pointer(); + StringView new_value; + StringView old_value; + old_value = old_kv ? old_kv->Value() : old_value; + op = modify(old_kv ? 
&old_value : nullptr, new_value, cb_args); + switch (op) { + case ModifyOperation::Write: { + VHashKV* new_kv = kvb.NewKV(key, new_value); + acc.set_pointer(new_kv); + if (old_kv == nullptr) sz.fetch_add(1LL); + kvb.Recycle(old_kv); + break; + } + case ModifyOperation::Delete: { + kvdk_assert(old_kv != nullptr, "Invalid callback!"); + acc.erase(); + sz.fetch_sub(1LL); + kvb.Recycle(old_kv); + break; + } + case ModifyOperation::Noop: + case ModifyOperation::Abort: { + break; + } } - return (op != ModifyOperation::Abort) ? Status::Ok : Status::Abort; + cleanup(new_value); + } + return (op != ModifyOperation::Abort); } -void VHash::Iterator::SeekToFirst() { - pos = owner.hpmap.begin(); -} +void VHash::Iterator::SeekToFirst() { pos = owner.hpmap.begin(); } void VHash::Iterator::Next() { - if (Valid()) ++pos; + if (Valid()) ++pos; } -bool VHash::Iterator::Valid() const { - return (pos != owner.hpmap.end()); -} +bool VHash::Iterator::Valid() const { return (pos != owner.hpmap.end()); } std::string VHash::Iterator::Key() const { - VHashKV* kv = pos->pointer(); - StringView key = kv->Key(); - return std::string(key.data(), key.size()); + VHashKV* kv = pos->pointer(); + StringView key = kv->Key(); + return std::string(key.data(), key.size()); } std::string VHash::Iterator::Value() const { - VHashKV* kv = pos->pointer(); - StringView value = kv->Value(); - return std::string(value.data(), value.size()); + VHashKV* kv = pos->pointer(); + StringView value = kv->Value(); + return std::string(value.data(), value.size()); } std::unique_ptr VHash::MakeIterator() { - // Initialized to end() iterator without acquiring lock. - return std::unique_ptr{new Iterator{*this, hpmap.end()}}; + ref_cnt.fetch_add(1UL); + // Initialized to end() iterator without acquiring lock. 
+ return std::unique_ptr{new Iterator{*this, hpmap.end()}}; } -VHashBuilder::VHashBuilder(OldRecordsCleaner& c) : cleaner{c} -{ - cleaner.RegisterDelayDeleter(*this); +VHashBuilder::VHashBuilder(OldRecordsCleaner& c) : cleaner{c} { + cleaner.RegisterDelayDeleter(*this); } -VHash* VHashBuilder::NewVHash(StringView name, VHashKVBuilder& kvb) -{ - return new VHash{name, kvb}; +VHash* VHashBuilder::NewVHash(StringView name, VHashKVBuilder& kvb) { + return new VHash{name, kvb}; } -void VHashBuilder::Recycle(VHash* vhash) -{ - if (vhash == nullptr) return; - cleaner.DelayDelete(*this, vhash); +void VHashBuilder::Recycle(VHash* vhash) { + if (vhash == nullptr) return; + cleaner.DelayDelete(*this, vhash); } -void VHashBuilder::Delete(void* vhash) -{ - VHash* vh = static_cast(vhash); - vh->deleteAll(); - delete vh; +void VHashBuilder::Delete(void* vhash) { + VHash* vh = static_cast(vhash); + vh->deleteAll(); + delete vh; } -} // namespace KVDK_NAMESPACE +} // namespace KVDK_NAMESPACE diff --git a/engine/experimental/vhash.hpp b/engine/experimental/vhash.hpp index 991bbde2..37853e9e 100644 --- a/engine/experimental/vhash.hpp +++ b/engine/experimental/vhash.hpp @@ -4,91 +4,89 @@ #include #include +#include "../version/old_records_cleaner.hpp" #include "hashptr_map.hpp" -#include "vhash_kv.hpp" #include "kvdk/iterator.hpp" -#include "../version/old_records_cleaner.hpp" +#include "vhash_kv.hpp" -namespace KVDK_NAMESPACE -{ +namespace KVDK_NAMESPACE { /// TODO: Support dynamically choose a allocator when creating VHash -/// Currently VHash allocate KVs by VHashKVBuilder, +/// Currently VHash allocate KVs by VHashKVBuilder, /// which can bind to other allocators, -/// but VHashBuilder does not support custom allocator for +/// but VHashBuilder does not support custom allocator for /// hashptr_map and name -class VHash -{ -private: - hashptr_map hpmap; - VHashKVBuilder& kvb; - std::atomic_int64_t sz{0LL}; - std::string name; - -public: - VHash(StringView n, VHashKVBuilder& b) : 
hpmap{4, VHashKV::ExtractKey}, kvb{b}, name{n.data(), n.size()} {} - - StringView Name() const { return name; } - - static StringView ExtractName(VHash* vhash) - { - return vhash->Name(); - } - - size_t Size() const { return sz.load(); } - - using CopyFunc = std::function; - // copy() will copy StringView of value to dst - Status Get(StringView key, CopyFunc copy, void* dst); - - Status Put(StringView key, StringView value); - - Status Delete(StringView key); - - // Cleanup is for cleaning up memory allocated by ModifyFunc. - using ModifyFunc = std::function; - using Cleanup = std::function; - Status Modify(StringView key, ModifyFunc modify, void* cb_args, Cleanup cleanup); - - class Iterator : public VHashIterator - { - public: - void SeekToFirst() final; - void Next() final; - bool Valid() const final; - std::string Key() const final; - std::string Value() const final; - virtual ~Iterator() = default; - - private: - friend VHash; - using rep = typename decltype(hpmap)::iterator; - VHash& owner; - rep pos; - Iterator(VHash& o, rep&& p) : owner{o}, pos{std::move(p)} {} - }; - - std::unique_ptr MakeIterator(); - - private: - friend class VHashBuilder; - // Called by VHashBuilder::Delete() to Delete all VHashKVs inside it. - Status deleteAll(); +class VHash { + private: + hashptr_map hpmap; + VHashKVBuilder& kvb; + std::atomic_int64_t sz{0LL}; + std::string name; + // Number of iterators alive + // This is a temporary workaround for iterator outliving VHash. 
+ /// TODO: Access token in KVDK for iterators + std::atomic_uint64_t ref_cnt{0ULL}; + + public: + VHash(StringView n, VHashKVBuilder& b) + : hpmap{4, VHashKV::ExtractKey}, kvb{b}, name{n.data(), n.size()} {} + + StringView Name() const { return name; } + + static StringView ExtractName(VHash* vhash) { return vhash->Name(); } + + size_t Size() const { return sz.load(); } + + bool Get(StringView key, StringView& value); + + void Put(StringView key, StringView value); + + void Delete(StringView key); + + // Cleanup is for cleaning up memory allocated by ModifyFunc. + using ModifyFunc = + std::function; + using Cleanup = std::function; + bool Modify(StringView key, ModifyFunc modify, void* cb_args, + Cleanup cleanup); + + class Iterator : public VHashIterator { + public: + void SeekToFirst() final; + void Next() final; + bool Valid() const final; + std::string Key() const final; + std::string Value() const final; + virtual ~Iterator() = default; + + private: + friend VHash; + using rep = typename decltype(hpmap)::iterator; + VHash& owner; + rep pos; + Iterator(VHash& o, rep&& p) : owner{o}, pos{std::move(p)} {} + }; + + std::unique_ptr MakeIterator(); + + private: + friend class VHashBuilder; + // Called by VHashBuilder::Delete() to Delete all VHashKVs inside it. + void deleteAll(); }; -class VHashBuilder : public IDeleter -{ -private: - OldRecordsCleaner& cleaner; +class VHashBuilder : public IDeleter { + private: + OldRecordsCleaner& cleaner; -public: - VHashBuilder(OldRecordsCleaner& c); + public: + VHashBuilder(OldRecordsCleaner& c); - // Called by VHashGroup to create a VHash. - VHash* NewVHash(StringView name, VHashKVBuilder& b); + // Called by VHashGroup to create a VHash. 
+ VHash* NewVHash(StringView name, VHashKVBuilder& b); - void Recycle(VHash* vhash); + void Recycle(VHash* vhash); - void Delete(void* vhash) final; + void Delete(void* vhash) final; }; -} // KVDK_NAMESPACE +} // namespace KVDK_NAMESPACE diff --git a/engine/experimental/vhash_group.cpp b/engine/experimental/vhash_group.cpp index 5b4fb417..f98a17c3 100644 --- a/engine/experimental/vhash_group.cpp +++ b/engine/experimental/vhash_group.cpp @@ -1,39 +1,26 @@ #include "vhash_group.hpp" -#include "../macros.hpp" -namespace KVDK_NAMESPACE -{ +namespace KVDK_NAMESPACE { -Status VHashGroup::Create(StringView name) -KVDK_TRY -{ - auto acc = hpmap.lookup(name, hpmap.acquire_lock); - if (acc.pointer() != nullptr) return Status::Existed; - VHash* vhash = vhb.NewVHash(name, kvb); - acc.set_pointer(vhash); - return Status::Ok; +bool VHashGroup::Create(StringView name) { + auto acc = hpmap.lookup(name, hpmap.acquire_lock); + if (acc.pointer() != nullptr) return false; + VHash* vhash = vhb.NewVHash(name, kvb); + acc.set_pointer(vhash); + return true; } -KVDK_HANDLE_EXCEPTIONS -Status VHashGroup::Destroy(StringView name) -KVDK_TRY -{ - auto acc = hpmap.lookup(name, hpmap.acquire_lock); - VHash* vhash = acc.pointer(); - if (vhash == nullptr) return Status::NotFound; - acc.erase(); - vhb.Recycle(vhash); - return Status::Ok; +bool VHashGroup::Destroy(StringView name) { + auto acc = hpmap.lookup(name, hpmap.acquire_lock); + VHash* vhash = acc.pointer(); + if (vhash == nullptr) return false; + acc.erase(); + vhb.Recycle(vhash); + return true; } -KVDK_HANDLE_EXCEPTIONS -Status VHashGroup::Get(StringView name, VHash** vhash) -KVDK_TRY -{ - *vhash = hpmap.lookup(name, hpmap.lockless); - if (*vhash == nullptr) return Status::NotFound; - else return Status::Ok; +VHash* VHashGroup::Get(StringView name) { + return hpmap.lookup(name, hpmap.lockless); } -KVDK_HANDLE_EXCEPTIONS -} // namespace KVDK_NAMESPACE +} // namespace KVDK_NAMESPACE diff --git a/engine/experimental/vhash_group.hpp 
b/engine/experimental/vhash_group.hpp index cc7b6d1d..597abca6 100644 --- a/engine/experimental/vhash_group.hpp +++ b/engine/experimental/vhash_group.hpp @@ -6,30 +6,30 @@ #include "hashptr_map.hpp" #include "vhash.hpp" -namespace KVDK_NAMESPACE -{ +namespace KVDK_NAMESPACE { // A VHashGroup contains VHashes that share the same memory allocator for kvs. /// TODO: Add hpmap_alloc to allocate memory for hashptr_maps. -class VHashGroup -{ -private: - OldRecordsCleaner& cleaner; - IVolatileAllocator& kv_alloc; - VHashKVBuilder kvb{kv_alloc, cleaner}; - VHashBuilder vhb{cleaner}; +class VHashGroup { + private: + OldRecordsCleaner& cleaner; + IVolatileAllocator& kv_alloc; + VHashKVBuilder kvb{kv_alloc, cleaner}; + VHashBuilder vhb{cleaner}; - hashptr_map hpmap{4, VHash::ExtractName}; - std::atomic_int64_t sz{0LL}; + hashptr_map hpmap{ + 4, VHash::ExtractName}; + std::atomic_int64_t sz{0LL}; -public: - VHashGroup(IVolatileAllocator& a, OldRecordsCleaner& c) : kv_alloc{a}, cleaner{c} {} + public: + VHashGroup(IVolatileAllocator& a, OldRecordsCleaner& c) + : kv_alloc{a}, cleaner{c} {} - Status Create(StringView name); + bool Create(StringView name); - Status Destroy(StringView name); + bool Destroy(StringView name); - Status Get(StringView name, VHash** vhash); + VHash* Get(StringView name); }; -} // namespace KVDK_NAMESPACE +} // namespace KVDK_NAMESPACE diff --git a/engine/experimental/vhash_kv.cpp b/engine/experimental/vhash_kv.cpp index 7403a87f..46993724 100644 --- a/engine/experimental/vhash_kv.cpp +++ b/engine/experimental/vhash_kv.cpp @@ -1,33 +1,29 @@ #include "vhash_kv.hpp" + #include "../version/old_records_cleaner.hpp" -namespace KVDK_NAMESPACE -{ +namespace KVDK_NAMESPACE { -VHashKVBuilder::VHashKVBuilder(IVolatileAllocator& a, OldRecordsCleaner& c) : - alloc{a}, cleaner{c} -{ - c.RegisterDelayDeleter(*this); +VHashKVBuilder::VHashKVBuilder(IVolatileAllocator& a, OldRecordsCleaner& c) + : alloc{a}, cleaner{c} { + c.RegisterDelayDeleter(*this); } -VHashKV* 
VHashKVBuilder::NewKV(StringView key, StringView value) -{ - void* dst = alloc.Allocate(sizeof(VHashKV) + key.size() + value.size()); - new(dst) VHashKV{key, value}; - return static_cast(dst); +VHashKV* VHashKVBuilder::NewKV(StringView key, StringView value) { + void* dst = alloc.Allocate(sizeof(VHashKV) + key.size() + value.size()); + new (dst) VHashKV{key, value}; + return static_cast(dst); } -void VHashKVBuilder::Recycle(VHashKV* kv) -{ - if (kv == nullptr) return; - cleaner.DelayDelete(*this, kv); +void VHashKVBuilder::Recycle(VHashKV* kv) { + if (kv == nullptr) return; + cleaner.DelayDelete(*this, kv); } -void VHashKVBuilder::Delete(void* obj) -{ - VHashKV* kv = static_cast(obj); - kv->~VHashKV(); - alloc.Deallocate(kv, sizeof(VHashKV) + kv->Key().size() + kv->Value().size()); +void VHashKVBuilder::Delete(void* obj) { + VHashKV* kv = static_cast(obj); + kv->~VHashKV(); + alloc.Deallocate(kv, sizeof(VHashKV) + kv->Key().size() + kv->Value().size()); } -} // namespace KVDK_NAMESPACE +} // namespace KVDK_NAMESPACE diff --git a/engine/experimental/vhash_kv.hpp b/engine/experimental/vhash_kv.hpp index 625777d8..5b02c4d8 100644 --- a/engine/experimental/vhash_kv.hpp +++ b/engine/experimental/vhash_kv.hpp @@ -3,68 +3,53 @@ #include #include "../alias.hpp" -#include "../macros.hpp" #include "../allocator.hpp" +#include "../macros.hpp" #include "../version/old_records_cleaner.hpp" -namespace KVDK_NAMESPACE -{ +namespace KVDK_NAMESPACE { /// TODO: Add timestamp field for MVCC if necessary -class VHashKV -{ -private: - std::uint32_t key_sz; - std::uint32_t value_sz; - char data[]; - -public: - VHashKV(StringView key, StringView value) - { - kvdk_assert(key.size() <= std::numeric_limits::max(), ""); - kvdk_assert(value.size() <= std::numeric_limits::max(), ""); - key_sz = static_cast(key.size()); - value_sz = static_cast(value.size()); - memcpy(data, key.data(), key.size()); - memcpy(data + key_sz, value.data(), value.size()); - } - - StringView Key() const - { - return 
StringView{data, key_sz}; - } - - StringView Value() const - { - return StringView{data+key_sz, value_sz}; - } - - static StringView ExtractKey(VHashKV* kvp) - { - return kvp->Key(); - } +class VHashKV { + private: + std::uint32_t key_sz; + std::uint32_t value_sz; + char data[]; + + public: + VHashKV(StringView key, StringView value) { + kvdk_assert(key.size() <= std::numeric_limits::max(), ""); + kvdk_assert(value.size() <= std::numeric_limits::max(), ""); + key_sz = static_cast(key.size()); + value_sz = static_cast(value.size()); + memcpy(data, key.data(), key.size()); + memcpy(data + key_sz, value.data(), value.size()); + } + + StringView Key() const { return StringView{data, key_sz}; } + + StringView Value() const { return StringView{data + key_sz, value_sz}; } + + static StringView ExtractKey(VHashKV* kvp) { return kvp->Key(); } }; -class VHashKVBuilder : public IDeleter -{ -private: - IVolatileAllocator& alloc; - OldRecordsCleaner& cleaner; +class VHashKVBuilder : public IDeleter { + private: + IVolatileAllocator& alloc; + OldRecordsCleaner& cleaner; -public: - VHashKVBuilder(IVolatileAllocator& a, OldRecordsCleaner& c); - VHashKVBuilder(VHashKVBuilder const&) = delete; - VHashKVBuilder(VHashKVBuilder&&) = default; + public: + VHashKVBuilder(IVolatileAllocator& a, OldRecordsCleaner& c); + VHashKVBuilder(VHashKVBuilder const&) = delete; + VHashKVBuilder(VHashKVBuilder&&) = default; - VHashKV* NewKV(StringView key, StringView value); + VHashKV* NewKV(StringView key, StringView value); - // Recycle VHashKV to OldRecordsCleaner for later deletion. - void Recycle(VHashKV* kv); + // Recycle VHashKV to OldRecordsCleaner for later deletion. + void Recycle(VHashKV* kv); - // Called by OldRecordsCleaner to delete KV. - void Delete(void* kv) final; + // Called by OldRecordsCleaner to delete KV. 
+ void Delete(void* kv) final; }; - - -} // KVDK_NAMESPACE +} // namespace KVDK_NAMESPACE diff --git a/engine/kv_engine.hpp b/engine/kv_engine.hpp index 5a7f4fdc..51095a64 100644 --- a/engine/kv_engine.hpp +++ b/engine/kv_engine.hpp @@ -238,8 +238,9 @@ class KVEngine : public Engine { Status VHashPut(StringView key, StringView field, StringView value) final; Status VHashDelete(StringView key, StringView field) final; Status VHashModify(StringView key, StringView field, ModifyFunc modify_func, - void* cb_args) final; - std::unique_ptrVHashIteratorCreate(StringView key, Status* s) final; + void* cb_args) final; + std::unique_ptr VHashIteratorCreate(StringView key, + Status* s) final; private: // Look up a first level key in hash table(e.g. collections or string, not diff --git a/engine/kv_engine_vhash.cpp b/engine/kv_engine_vhash.cpp index a1f075d7..8a7b7070 100644 --- a/engine/kv_engine_vhash.cpp +++ b/engine/kv_engine_vhash.cpp @@ -3,147 +3,145 @@ namespace KVDK_NAMESPACE { -Status KVEngine::VHashCreate(StringView key) -KVDK_TRY -{ +Status KVEngine::VHashCreate(StringView key) KVDK_TRY { Status s = MaybeInitAccessThread(); defer(ReleaseAccessThread()); if (s != Status::Ok) return s; if (!CheckKeySize(key)) return Status::InvalidDataSize; - return vhashes_.Create(key); + return vhashes_.Create(key) ? Status::Ok : Status::Existed; } KVDK_HANDLE_EXCEPTIONS -Status KVEngine::VHashDestroy(StringView key) -KVDK_TRY -{ +Status KVEngine::VHashDestroy(StringView key) KVDK_TRY { Status s = MaybeInitAccessThread(); defer(ReleaseAccessThread()); if (s != Status::Ok) return s; if (!CheckKeySize(key)) return Status::InvalidDataSize; - - return vhashes_.Destroy(key); + + return vhashes_.Destroy(key) ? 
Status::Ok : Status::NotFound; } KVDK_HANDLE_EXCEPTIONS -Status KVEngine::VHashSize(StringView key, size_t* len) -KVDK_TRY -{ - Status s = MaybeInitAccessThread(); - if (s != Status::Ok) return s; - if (!CheckKeySize(key)) return Status::InvalidDataSize; +Status KVEngine::VHashSize(StringView key, size_t* len) KVDK_TRY { + Status s = MaybeInitAccessThread(); + if (s != Status::Ok) return s; + if (!CheckKeySize(key)) return Status::InvalidDataSize; - auto token = version_controller_.GetLocalSnapshotHolder(); - VHash* vhash = nullptr; - s = vhashes_.Get(key, &vhash); - if (s != Status::Ok) return s; + auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = vhashes_.Get(key); + if (vhash == nullptr) return Status::NotFound; - *len = vhash->Size(); - return Status::Ok; + *len = vhash->Size(); + return Status::Ok; } KVDK_HANDLE_EXCEPTIONS -Status KVEngine::VHashGet(StringView key, StringView field, std::string* value) -KVDK_TRY -{ - Status s = MaybeInitAccessThread(); - if (s != Status::Ok) return s; - if (!CheckKeySize(key) || !CheckKeySize(field)) return Status::InvalidDataSize; - - auto token = version_controller_.GetLocalSnapshotHolder(); - VHash* vhash = nullptr; - s = vhashes_.Get(key, &vhash); - if (s != Status::Ok) return s; - - auto CopyValue = [&](void* dst, StringView src) - { - static_cast(dst)->assign(src.data(), src.size()); - }; - return vhash->Get(field, CopyValue, value); - -return Status::NotSupported; +Status KVEngine::VHashGet(StringView key, StringView field, + std::string* value) KVDK_TRY { + Status s = MaybeInitAccessThread(); + if (s != Status::Ok) return s; + if (!CheckKeySize(key) || !CheckKeySize(field)) + return Status::InvalidDataSize; + + auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = vhashes_.Get(key); + if (vhash == nullptr) return Status::NotFound; + + StringView val; + if (!vhash->Get(field, val)) return Status::NotFound; + + value->assign(val.data(), val.size()); + + return Status::Ok; } 
KVDK_HANDLE_EXCEPTIONS -Status KVEngine::VHashPut(StringView key, StringView field, StringView value) -KVDK_TRY -{ - Status s = MaybeInitAccessThread(); - if (s != Status::Ok) return s; - if (!CheckKeySize(key) || !CheckKeySize(field) || !CheckValueSize(value)) - return Status::InvalidDataSize; +Status KVEngine::VHashPut(StringView key, StringView field, + StringView value) KVDK_TRY { + Status s = MaybeInitAccessThread(); + if (s != Status::Ok) return s; + if (!CheckKeySize(key) || !CheckKeySize(field) || !CheckValueSize(value)) + return Status::InvalidDataSize; - auto token = version_controller_.GetLocalSnapshotHolder(); - VHash* vhash = nullptr; - s = vhashes_.Get(key, &vhash); - if (s != Status::Ok) return s; + auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = vhashes_.Get(key); + if (vhash == nullptr) return s; - return vhash->Put(field, value); + vhash->Put(field, value); + return Status::Ok; } KVDK_HANDLE_EXCEPTIONS -Status KVEngine::VHashDelete(StringView key, StringView field) -KVDK_TRY -{ - Status s = MaybeInitAccessThread(); - if (s != Status::Ok) return s; - if (!CheckKeySize(key) || !CheckKeySize(field)) return Status::InvalidDataSize; +Status KVEngine::VHashDelete(StringView key, StringView field) KVDK_TRY { + Status s = MaybeInitAccessThread(); + if (s != Status::Ok) return s; + if (!CheckKeySize(key) || !CheckKeySize(field)) + return Status::InvalidDataSize; - auto token = version_controller_.GetLocalSnapshotHolder(); - VHash* vhash = nullptr; - s = vhashes_.Get(key, &vhash); - if (s != Status::Ok) return s; + auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = vhashes_.Get(key); + if (vhash == nullptr) return s; - return vhash->Delete(field); + vhash->Delete(field); + return Status::Ok; } KVDK_HANDLE_EXCEPTIONS -Status KVEngine::VHashModify(StringView key, StringView field, ModifyFunc modify_func, - void* cb_args) -KVDK_TRY -{ - Status s = MaybeInitAccessThread(); - if (s != Status::Ok) return s; - 
if (!CheckKeySize(key) || !CheckKeySize(field)) return Status::InvalidDataSize; - - std::string old_value; - std::string new_value; - auto modify = [&](StringView const* old_val, StringView& new_val, void* args) - { - if (old_val != nullptr) old_value.assign(old_val->data(), old_val->size()); - ModifyOperation op = modify_func(old_val ? &old_value : nullptr, &new_value, args); - new_val = new_value; - return op; - }; - - auto cleanup = [&](StringView) { return; }; - - auto token = version_controller_.GetLocalSnapshotHolder(); - VHash* vhash = nullptr; - s = vhashes_.Get(key, &vhash); - if (s != Status::Ok) return s; - - return vhash->Modify(field, modify, cb_args, cleanup); +Status KVEngine::VHashModify(StringView key, StringView field, + ModifyFunc modify_func, void* cb_args) KVDK_TRY { + Status s = MaybeInitAccessThread(); + if (s != Status::Ok) return s; + if (!CheckKeySize(key) || !CheckKeySize(field)) + return Status::InvalidDataSize; + + std::string old_value; + std::string new_value; + auto modify = [&](StringView const* old_val, StringView& new_val, + void* args) { + if (old_val != nullptr) old_value.assign(old_val->data(), old_val->size()); + ModifyOperation op = + modify_func(old_val ? &old_value : nullptr, &new_value, args); + new_val = new_value; + return op; + }; + + auto cleanup = [&](StringView) { return; }; + + auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = vhashes_.Get(key); + if (vhash == nullptr) return s; + + return (vhash->Modify(field, modify, cb_args, cleanup)) ? Status::Ok + : Status::Abort; } KVDK_HANDLE_EXCEPTIONS -std::unique_ptrKVEngine::VHashIteratorCreate(StringView key, Status* s) -{ - Status sink; - s = (s != nullptr) ? s : &sink; - - *s = MaybeInitAccessThread(); - if (*s != Status::Ok) return nullptr; - if (!CheckKeySize(key)) return nullptr; - - /// TODO: iterator should hold an access token to keep VHash valid. 
- auto token = version_controller_.GetLocalSnapshotHolder(); - VHash* vhash = nullptr; - *s = vhashes_.Get(key, &vhash); - if (*s != Status::Ok) return nullptr; - - return vhash->MakeIterator(); +std::unique_ptr KVEngine::VHashIteratorCreate(StringView key, + Status* s) try { + Status sink; + s = (s != nullptr) ? s : &sink; + + *s = MaybeInitAccessThread(); + if (*s != Status::Ok) return nullptr; + if (!CheckKeySize(key)) return nullptr; + + /// TODO: iterator should hold an access token to keep VHash valid. + auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = vhashes_.Get(key); + if (vhash == nullptr) return nullptr; + + return vhash->MakeIterator(); +} catch (std::exception const& ex) { + Status sink; + s = (s != nullptr) ? s : &sink; + *s = ExceptionToStatus(ex); + return nullptr; +} catch (...) { + Status sink; + s = (s != nullptr) ? s : &sink; + *s = Status::Abort; + return nullptr; } -} // namespace KVDK_NAMESPACE +} // namespace KVDK_NAMESPACE diff --git a/engine/macros.hpp b/engine/macros.hpp index 9a8d3359..641c8522 100644 --- a/engine/macros.hpp +++ b/engine/macros.hpp @@ -23,16 +23,24 @@ } \ } -namespace KVDK_NAMESPACE -{ -#define KVDK_TRY try +namespace KVDK_NAMESPACE { +inline Status ExceptionToStatus(std::exception const& ex) { + if (dynamic_cast(&ex)) return Status::MemoryOverflow; + if (dynamic_cast(&ex)) return Status::OutOfRange; + if (dynamic_cast(&ex)) + return Status::InvalidArgument; -#define KVDK_HANDLE_EXCEPTIONS \ -catch(std::bad_alloc const&) { return Status::MemoryOverflow; } \ -catch(std::out_of_range const&) { return Status::OutOfRange; } \ -catch(std::invalid_argument const&) { return Status::InvalidArgument; } \ -catch(...) { return Status::Abort; } + return Status::Abort; +} -} // KVDK_NAMESPACE +#define KVDK_TRY try +#define KVDK_HANDLE_EXCEPTIONS \ + catch (std::exception const& ex) { \ + return ExceptionToStatus(ex); \ + } \ + catch (...) 
{ \ + return Status::Abort; \ + } +} // namespace KVDK_NAMESPACE diff --git a/engine/version/old_records_cleaner.cpp b/engine/version/old_records_cleaner.cpp index bd2c043e..fb8d285f 100644 --- a/engine/version/old_records_cleaner.cpp +++ b/engine/version/old_records_cleaner.cpp @@ -43,42 +43,39 @@ void OldRecordsCleaner::PushToPendingFree(void* addr, TimeStampType ts) { } } -void OldRecordsCleaner::RegisterDelayDeleter(IDeleter& deleter) -{ - size_t idx = global_queues_.size(); - delay_deleters_[&deleter] = idx; - global_queues_.emplace_back(); - for (size_t i = 0; i < cleaner_thread_cache_.size(); i++) - cleaner_thread_cache_[i].local_queues_.emplace_back(); +void OldRecordsCleaner::RegisterDelayDeleter(IDeleter& deleter) { + size_t idx = global_queues_.size(); + delay_deleters_[&deleter] = idx; + global_queues_.emplace_back(); + for (size_t i = 0; i < cleaner_thread_cache_.size(); i++) + cleaner_thread_cache_[i].local_queues_.emplace_back(); } -void OldRecordsCleaner::DelayDelete(IDeleter& deleter, void* obj) -{ - kvdk_assert(access_thread.id >= 0, ""); - auto& tc = cleaner_thread_cache_[access_thread.id]; - std::lock_guard guard{tc.old_records_lock}; - +void OldRecordsCleaner::DelayDelete(IDeleter& deleter, void* obj) { + kvdk_assert(access_thread.id >= 0, ""); + auto& tc = cleaner_thread_cache_[access_thread.id]; + std::lock_guard guard{tc.old_records_lock}; - TimeStampType ts = kv_engine_->version_controller_.GetCurrentTimestamp(); + TimeStampType ts = kv_engine_->version_controller_.GetCurrentTimestamp(); - size_t idx = delay_deleters_.at(&deleter); - tc.local_queues_[idx].emplace_back(ts, obj); + size_t idx = delay_deleters_.at(&deleter); + tc.local_queues_[idx].emplace_back(ts, obj); - constexpr size_t kMaxFreePending = 16; - tryPurge(deleter, tc.local_queues_[idx], kMaxFreePending); + constexpr size_t kMaxFreePending = 16; + tryPurge(deleter, tc.local_queues_[idx], kMaxFreePending); } -void OldRecordsCleaner::tryPurge(IDeleter& deleter, PendingQueue& 
pending_kvs, size_t lim) -{ - maybeUpdateOldestSnapshot(); - TimeStampType acc_ts = kv_engine_->version_controller_.LocalOldestSnapshotTS(); - for (size_t i = 0; i < lim && !pending_kvs.empty(); i++) - { - auto const& pair = pending_kvs.front(); - if (pair.first >= acc_ts) break; - deleter.Delete(pair.second); - pending_kvs.pop_front(); - } +void OldRecordsCleaner::tryPurge(IDeleter& deleter, PendingQueue& pending_kvs, + size_t lim) { + maybeUpdateOldestSnapshot(); + TimeStampType acc_ts = + kv_engine_->version_controller_.LocalOldestSnapshotTS(); + for (size_t i = 0; i < lim && !pending_kvs.empty(); i++) { + auto const& pair = pending_kvs.front(); + if (pair.first >= acc_ts) break; + deleter.Delete(pair.second); + pending_kvs.pop_front(); + } } void OldRecordsCleaner::TryGlobalClean() { @@ -107,7 +104,8 @@ void OldRecordsCleaner::TryGlobalClean() { std::lock_guard lg(cleaner_thread_cache.old_records_lock); for (auto const& del_idx : delay_deleters_) - tryPurge(*del_idx.first, cleaner_thread_cache.local_queues_[del_idx.second], -1U); + tryPurge(*del_idx.first, + cleaner_thread_cache.local_queues_[del_idx.second], -1U); } auto iter = global_pending_free_space_entries_.begin(); diff --git a/engine/version/old_records_cleaner.hpp b/engine/version/old_records_cleaner.hpp index 77b1a4ce..fa32e1cc 100644 --- a/engine/version/old_records_cleaner.hpp +++ b/engine/version/old_records_cleaner.hpp @@ -6,8 +6,8 @@ #include #include #include -#include #include +#include #include "../alias.hpp" #include "../collection.hpp" @@ -21,11 +21,10 @@ namespace KVDK_NAMESPACE { class KVEngine; -class IDeleter -{ -public: - // Called by OldRecordsCleaner for actual deletion. - virtual void Delete(void* obj) = 0; +class IDeleter { + public: + // Called by OldRecordsCleaner for actual deletion. 
+ virtual void Delete(void* obj) = 0; }; // OldRecordsCleaner is used to clean old version PMem records of kvdk @@ -62,7 +61,8 @@ class OldRecordsCleaner { void maybeUpdateOldestSnapshot(); - // Try purging some entries from a locked PendingQueue with associated deleter. + // Try purging some entries from a locked PendingQueue with associated + // deleter. void tryPurge(IDeleter& deleter, PendingQueue& pending_kvs, size_t lim); KVEngine* kv_engine_; diff --git a/include/kvdk/engine.hpp b/include/kvdk/engine.hpp index 3a7a0018..93d861a0 100644 --- a/include/kvdk/engine.hpp +++ b/include/kvdk/engine.hpp @@ -340,12 +340,15 @@ class Engine { virtual Status VHashCreate(StringView key) = 0; virtual Status VHashDestroy(StringView key) = 0; virtual Status VHashSize(StringView key, size_t* len) = 0; - virtual Status VHashGet(StringView key, StringView field, std::string* value) = 0; - virtual Status VHashPut(StringView key, StringView field, StringView value) = 0; + virtual Status VHashGet(StringView key, StringView field, + std::string* value) = 0; + virtual Status VHashPut(StringView key, StringView field, + StringView value) = 0; virtual Status VHashDelete(StringView key, StringView field) = 0; - virtual Status VHashModify(StringView key, StringView field, ModifyFunc modify_func, - void* cb_args) = 0; - virtual std::unique_ptrVHashIteratorCreate(StringView key, Status* s) = 0; + virtual Status VHashModify(StringView key, StringView field, + ModifyFunc modify_func, void* cb_args) = 0; + virtual std::unique_ptr VHashIteratorCreate(StringView key, + Status* s) = 0; /// Other /////////////////////////////////////////////////////////////////// From d7c22eab7127a9e699cc540608c09b792eac30e8 Mon Sep 17 00:00:00 2001 From: ZiyanShi Date: Wed, 7 Sep 2022 21:06:32 -0400 Subject: [PATCH 04/23] Add benchmark Signed-off-by: ZiyanShi --- benchmark/bench.cpp | 60 ++++++++++++++++++++++++++++- engine/experimental/vhash.cpp | 5 ++- engine/experimental/vhash.hpp | 13 +++++-- 
engine/experimental/vhash_group.cpp | 9 ++++- engine/experimental/vhash_group.hpp | 4 +- engine/kv_engine.hpp | 2 +- engine/kv_engine_vhash.cpp | 4 +- engine/thread_manager.cpp | 4 +- engine/thread_manager.hpp | 2 +- include/kvdk/engine.hpp | 2 +- scripts/benchmark_impl.py | 2 + scripts/run_benchmark.py | 4 +- 12 files changed, 95 insertions(+), 16 deletions(-) diff --git a/benchmark/bench.cpp b/benchmark/bench.cpp index 93cf6031..b13fba3c 100644 --- a/benchmark/bench.cpp +++ b/benchmark/bench.cpp @@ -122,12 +122,28 @@ std::vector> latencies; std::vector random_engines; std::vector ranges; -enum class DataType { String, Sorted, Hashes, List, Blackhole } bench_data_type; +enum class DataType { + String, + Sorted, + Hashes, + List, + VHash, + Blackhole +} bench_data_type; enum class KeyDistribution { Range, Uniform, Zipf } key_dist; enum class ValueSizeDistribution { Constant, Uniform } vsz_dist; +void LaunchNThreads(int n_thread, std::function func, + int id_start = 0) { + std::vector ts; + for (int i = id_start; i < id_start + n_thread; i++) { + ts.emplace_back(std::thread(func, i)); + } + for (auto& t : ts) t.join(); +} + std::uint64_t generate_key(size_t tid) { static std::uint64_t max_key = FLAGS_existing_keys_ratio == 0 ? 
UINT64_MAX @@ -164,6 +180,22 @@ size_t generate_value_size(size_t tid) { } } +void FillVHash(size_t tid) { + std::string key(8, ' '); + for (size_t i = 0; i < FLAGS_num_kv / FLAGS_num_collection; ++i) { + std::uint64_t num = ranges[tid].gen(); + std::uint64_t cid = num % FLAGS_num_collection; + memcpy(&key[0], &num, 8); + StringView value = StringView(value_pool.data(), generate_value_size(tid)); + + Status s = engine->VHashPut(collections[cid], key, value); + + if (s != Status::Ok) { + throw std::runtime_error{"VHashPut error"}; + } + } +} + void DBWrite(int tid) { std::string key(8, ' '); std::unique_ptr batch; @@ -230,6 +262,10 @@ void DBWrite(int tid) { s = engine->ListPushFront(collections[cid], value); break; } + case DataType::VHash: { + s = engine->VHashPut(collections[cid], key, value); + break; + } case DataType::Blackhole: { s = Status::Ok; break; @@ -366,6 +402,10 @@ void DBRead(int tid) { s = engine->ListPopBack(collections[cid], &value_sink); break; } + case DataType::VHash: { + s = engine->VHashGet(collections[cid], key, &value_sink); + break; + } case DataType::Blackhole: { s = Status::Ok; break; @@ -412,6 +452,8 @@ void ProcessBenchmarkConfigs() { bench_data_type = DataType::Hashes; } else if (FLAGS_type == "list") { bench_data_type = DataType::List; + } else if (FLAGS_type == "vhash") { + bench_data_type = DataType::VHash; } else if (FLAGS_type == "blackhole") { bench_data_type = DataType::Blackhole; } else { @@ -425,6 +467,7 @@ void ProcessBenchmarkConfigs() { } case DataType::Hashes: case DataType::List: + case DataType::VHash: case DataType::Sorted: { collections.resize(FLAGS_num_collection); for (size_t i = 0; i < FLAGS_num_collection; i++) { @@ -437,6 +480,9 @@ void ProcessBenchmarkConfigs() { if (FLAGS_batch_size > 0 && (bench_data_type == DataType::List)) { throw std::invalid_argument{R"(List does not support batch write.)"}; } + if (FLAGS_batch_size > 0 && (bench_data_type == DataType::VHash)) { + throw std::invalid_argument{R"(VHash does 
not support batch write.)"}; + } // Check for scan flag switch (bench_data_type) { @@ -458,6 +504,7 @@ void ProcessBenchmarkConfigs() { random_engines.resize(FLAGS_threads); if (FLAGS_fill) { + assert(bench_data_type != DataType::VHash && "VHash don't need fill"); assert(FLAGS_read_ratio == 0); key_dist = KeyDistribution::Range; operations_per_thread = FLAGS_num_kv / FLAGS_max_access_threads + 1; @@ -558,6 +605,17 @@ int main(int argc, char** argv) { engine->ReleaseAccessThread(); break; } + case DataType::VHash: { + for (auto col : collections) { + Status s = + engine->VHashCreate(col, FLAGS_num_kv / FLAGS_num_collection); + if (s != Status::Ok) { + throw std::runtime_error{"Fail to create VHash"}; + } + } + LaunchNThreads(FLAGS_threads, FillVHash); + break; + } default: { break; } diff --git a/engine/experimental/vhash.cpp b/engine/experimental/vhash.cpp index 7644c054..fe482169 100644 --- a/engine/experimental/vhash.cpp +++ b/engine/experimental/vhash.cpp @@ -111,8 +111,9 @@ VHashBuilder::VHashBuilder(OldRecordsCleaner& c) : cleaner{c} { cleaner.RegisterDelayDeleter(*this); } -VHash* VHashBuilder::NewVHash(StringView name, VHashKVBuilder& kvb) { - return new VHash{name, kvb}; +VHash* VHashBuilder::NewVHash(StringView name, VHashKVBuilder& kvb, + size_t capacity) { + return new VHash{name, kvb, capacity}; } void VHashBuilder::Recycle(VHash* vhash) { diff --git a/engine/experimental/vhash.hpp b/engine/experimental/vhash.hpp index 37853e9e..8adde27e 100644 --- a/engine/experimental/vhash.hpp +++ b/engine/experimental/vhash.hpp @@ -27,8 +27,12 @@ class VHash { std::atomic_uint64_t ref_cnt{0ULL}; public: - VHash(StringView n, VHashKVBuilder& b) - : hpmap{4, VHashKV::ExtractKey}, kvb{b}, name{n.data(), n.size()} {} + VHash(StringView n, VHashKVBuilder& b, size_t capacity) + : hpmap{capacity / 4, VHashKV::ExtractKey}, + kvb{b}, + name{n.data(), n.size()} {} + + ~VHash() { deleteAll(); } StringView Name() const { return name; } @@ -36,6 +40,9 @@ class VHash { size_t 
Size() const { return sz.load(); } + /// TODO: Reserve() API for performance. + // void Reserve(size_t n); + bool Get(StringView key, StringView& value); void Put(StringView key, StringView value); @@ -82,7 +89,7 @@ class VHashBuilder : public IDeleter { VHashBuilder(OldRecordsCleaner& c); // Called by VHashGroup to create a VHash. - VHash* NewVHash(StringView name, VHashKVBuilder& b); + VHash* NewVHash(StringView name, VHashKVBuilder& b, size_t capacity = 16); void Recycle(VHash* vhash); diff --git a/engine/experimental/vhash_group.cpp b/engine/experimental/vhash_group.cpp index f98a17c3..d9193961 100644 --- a/engine/experimental/vhash_group.cpp +++ b/engine/experimental/vhash_group.cpp @@ -2,10 +2,15 @@ namespace KVDK_NAMESPACE { -bool VHashGroup::Create(StringView name) { +VHashGroup::~VHashGroup() { + for (auto iter = hpmap.begin(); iter != hpmap.end(); ++iter) + delete iter->pointer(); +} + +bool VHashGroup::Create(StringView name, size_t capacity) { auto acc = hpmap.lookup(name, hpmap.acquire_lock); if (acc.pointer() != nullptr) return false; - VHash* vhash = vhb.NewVHash(name, kvb); + VHash* vhash = vhb.NewVHash(name, kvb, capacity); acc.set_pointer(vhash); return true; } diff --git a/engine/experimental/vhash_group.hpp b/engine/experimental/vhash_group.hpp index 597abca6..dc6e78d7 100644 --- a/engine/experimental/vhash_group.hpp +++ b/engine/experimental/vhash_group.hpp @@ -25,7 +25,9 @@ class VHashGroup { VHashGroup(IVolatileAllocator& a, OldRecordsCleaner& c) : kv_alloc{a}, cleaner{c} {} - bool Create(StringView name); + ~VHashGroup(); + + bool Create(StringView name, size_t capacity); bool Destroy(StringView name); diff --git a/engine/kv_engine.hpp b/engine/kv_engine.hpp index 51095a64..d0b19403 100644 --- a/engine/kv_engine.hpp +++ b/engine/kv_engine.hpp @@ -231,7 +231,7 @@ class KVEngine : public Engine { void HashIteratorRelease(HashIterator*) final; // Volatile Hash - Status VHashCreate(StringView key) final; + Status VHashCreate(StringView key, 
size_t capacity) final; Status VHashDestroy(StringView key) final; Status VHashSize(StringView key, size_t* len) final; Status VHashGet(StringView key, StringView field, std::string* value) final; diff --git a/engine/kv_engine_vhash.cpp b/engine/kv_engine_vhash.cpp index 8a7b7070..352eb3a2 100644 --- a/engine/kv_engine_vhash.cpp +++ b/engine/kv_engine_vhash.cpp @@ -3,13 +3,13 @@ namespace KVDK_NAMESPACE { -Status KVEngine::VHashCreate(StringView key) KVDK_TRY { +Status KVEngine::VHashCreate(StringView key, size_t capacity) KVDK_TRY { Status s = MaybeInitAccessThread(); defer(ReleaseAccessThread()); if (s != Status::Ok) return s; if (!CheckKeySize(key)) return Status::InvalidDataSize; - return vhashes_.Create(key) ? Status::Ok : Status::Existed; + return vhashes_.Create(key, capacity) ? Status::Ok : Status::Existed; } KVDK_HANDLE_EXCEPTIONS diff --git a/engine/thread_manager.cpp b/engine/thread_manager.cpp index 1bbea1c7..ab1100ed 100644 --- a/engine/thread_manager.cpp +++ b/engine/thread_manager.cpp @@ -41,11 +41,13 @@ Status ThreadManager::MaybeInitThread(Thread& t) { return Status::Ok; } -void ThreadManager::Release(const Thread& t) { +void ThreadManager::Release(Thread& t) { if (t.id >= 0) { assert(static_cast(t.id) < max_threads_); std::lock_guard lg(spin_); usable_id_.insert(t.id); + // Prevent double-release. 
+ t.id = -1; } } diff --git a/engine/thread_manager.hpp b/engine/thread_manager.hpp index 748987dc..526a4d39 100644 --- a/engine/thread_manager.hpp +++ b/engine/thread_manager.hpp @@ -29,7 +29,7 @@ class ThreadManager : public std::enable_shared_from_this { ThreadManager(uint32_t max_threads) : max_threads_(max_threads), ids_(0) {} Status MaybeInitThread(Thread& t); - void Release(const Thread& t); + void Release(Thread& t); private: uint32_t max_threads_; diff --git a/include/kvdk/engine.hpp b/include/kvdk/engine.hpp index 93d861a0..9ff98269 100644 --- a/include/kvdk/engine.hpp +++ b/include/kvdk/engine.hpp @@ -337,7 +337,7 @@ class Engine { /// Volatile Hash APIs ////////////////////////////////////////////////////// - virtual Status VHashCreate(StringView key) = 0; + virtual Status VHashCreate(StringView key, size_t capacity = (1UL << 20)) = 0; virtual Status VHashDestroy(StringView key) = 0; virtual Status VHashSize(StringView key, size_t* len) = 0; virtual Status VHashGet(StringView key, StringView field, diff --git a/scripts/benchmark_impl.py b/scripts/benchmark_impl.py index 30a009ee..a2111477 100644 --- a/scripts/benchmark_impl.py +++ b/scripts/benchmark_impl.py @@ -9,6 +9,8 @@ def __fill(exec, shared_para, data_type, report_path): + if data_type == 'vhash' + return new_para = shared_para + " -fill=1 -type={}".format(data_type) report = report_path + "_fill" print("Fill {}".format(data_type)) diff --git a/scripts/run_benchmark.py b/scripts/run_benchmark.py index ce8990f2..9439a894 100644 --- a/scripts/run_benchmark.py +++ b/scripts/run_benchmark.py @@ -44,8 +44,10 @@ data_types = ['list'] elif sys.argv[1] == 'blackhole': data_types = ['blackhole'] + elif sys.argv[1] == 'vhash': + data_types = ['vhash'] elif sys.argv[1] == 'all': - data_types = ['blackhole', 'string', 'sorted', 'hash', 'list'] + data_types = ['blackhole', 'string', 'sorted', 'hash', 'list', 'vhash'] else: print(usage) exit(1) From cd6a7926e745ffd9ccf8dd01198ef097e0117690 Mon Sep 17 00:00:00 
2001 From: ZiyanShi Date: Thu, 15 Sep 2022 22:32:03 -0400 Subject: [PATCH 05/23] rename Signed-off-by: ZiyanShi --- engine/kv_engine_vhash.cpp | 32 +++++++++++++------------- engine/version/old_records_cleaner.cpp | 4 ++-- engine/version/old_records_cleaner.hpp | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/engine/kv_engine_vhash.cpp b/engine/kv_engine_vhash.cpp index 352eb3a2..733cf6bf 100644 --- a/engine/kv_engine_vhash.cpp +++ b/engine/kv_engine_vhash.cpp @@ -4,29 +4,29 @@ namespace KVDK_NAMESPACE { Status KVEngine::VHashCreate(StringView key, size_t capacity) KVDK_TRY { - Status s = MaybeInitAccessThread(); + Status s = maybeInitAccessThread(); defer(ReleaseAccessThread()); if (s != Status::Ok) return s; - if (!CheckKeySize(key)) return Status::InvalidDataSize; + if (!checkKeySize(key)) return Status::InvalidDataSize; return vhashes_.Create(key, capacity) ? Status::Ok : Status::Existed; } KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashDestroy(StringView key) KVDK_TRY { - Status s = MaybeInitAccessThread(); + Status s = maybeInitAccessThread(); defer(ReleaseAccessThread()); if (s != Status::Ok) return s; - if (!CheckKeySize(key)) return Status::InvalidDataSize; + if (!checkKeySize(key)) return Status::InvalidDataSize; return vhashes_.Destroy(key) ? 
Status::Ok : Status::NotFound; } KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashSize(StringView key, size_t* len) KVDK_TRY { - Status s = MaybeInitAccessThread(); + Status s = maybeInitAccessThread(); if (s != Status::Ok) return s; - if (!CheckKeySize(key)) return Status::InvalidDataSize; + if (!checkKeySize(key)) return Status::InvalidDataSize; auto token = version_controller_.GetLocalSnapshotHolder(); VHash* vhash = vhashes_.Get(key); @@ -39,9 +39,9 @@ KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashGet(StringView key, StringView field, std::string* value) KVDK_TRY { - Status s = MaybeInitAccessThread(); + Status s = maybeInitAccessThread(); if (s != Status::Ok) return s; - if (!CheckKeySize(key) || !CheckKeySize(field)) + if (!checkKeySize(key) || !checkKeySize(field)) return Status::InvalidDataSize; auto token = version_controller_.GetLocalSnapshotHolder(); @@ -59,9 +59,9 @@ KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashPut(StringView key, StringView field, StringView value) KVDK_TRY { - Status s = MaybeInitAccessThread(); + Status s = maybeInitAccessThread(); if (s != Status::Ok) return s; - if (!CheckKeySize(key) || !CheckKeySize(field) || !CheckValueSize(value)) + if (!checkKeySize(key) || !checkKeySize(field) || !checkValueSize(value)) return Status::InvalidDataSize; auto token = version_controller_.GetLocalSnapshotHolder(); @@ -74,9 +74,9 @@ Status KVEngine::VHashPut(StringView key, StringView field, KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashDelete(StringView key, StringView field) KVDK_TRY { - Status s = MaybeInitAccessThread(); + Status s = maybeInitAccessThread(); if (s != Status::Ok) return s; - if (!CheckKeySize(key) || !CheckKeySize(field)) + if (!checkKeySize(key) || !checkKeySize(field)) return Status::InvalidDataSize; auto token = version_controller_.GetLocalSnapshotHolder(); @@ -90,9 +90,9 @@ KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashModify(StringView key, StringView field, ModifyFunc modify_func, void* cb_args) KVDK_TRY { - Status s = 
MaybeInitAccessThread(); + Status s = maybeInitAccessThread(); if (s != Status::Ok) return s; - if (!CheckKeySize(key) || !CheckKeySize(field)) + if (!checkKeySize(key) || !checkKeySize(field)) return Status::InvalidDataSize; std::string old_value; @@ -122,9 +122,9 @@ std::unique_ptr KVEngine::VHashIteratorCreate(StringView key, Status sink; s = (s != nullptr) ? s : &sink; - *s = MaybeInitAccessThread(); + *s = maybeInitAccessThread(); if (*s != Status::Ok) return nullptr; - if (!CheckKeySize(key)) return nullptr; + if (!checkKeySize(key)) return nullptr; /// TODO: iterator should hold an access token to keep VHash valid. auto token = version_controller_.GetLocalSnapshotHolder(); diff --git a/engine/version/old_records_cleaner.cpp b/engine/version/old_records_cleaner.cpp index a6ba5a1f..467b1eea 100644 --- a/engine/version/old_records_cleaner.cpp +++ b/engine/version/old_records_cleaner.cpp @@ -22,7 +22,7 @@ void OldRecordsCleaner::DelayDelete(IDeleter& deleter, void* obj) { auto& tc = cleaner_thread_cache_[access_thread.id]; std::lock_guard guard{tc.old_records_lock}; - TimeStampType ts = kv_engine_->version_controller_.GetCurrentTimestamp(); + TimestampType ts = kv_engine_->version_controller_.GetCurrentTimestamp(); size_t idx = delay_deleters_.at(&deleter); tc.local_queues_[idx].emplace_back(ts, obj); @@ -34,7 +34,7 @@ void OldRecordsCleaner::DelayDelete(IDeleter& deleter, void* obj) { void OldRecordsCleaner::tryPurge(IDeleter& deleter, PendingQueue& pending_kvs, size_t lim) { maybeUpdateOldestSnapshot(); - TimeStampType acc_ts = + TimestampType acc_ts = kv_engine_->version_controller_.LocalOldestSnapshotTS(); for (size_t i = 0; i < lim && !pending_kvs.empty(); i++) { auto const& pair = pending_kvs.front(); diff --git a/engine/version/old_records_cleaner.hpp b/engine/version/old_records_cleaner.hpp index bfb34419..d4aaeda3 100644 --- a/engine/version/old_records_cleaner.hpp +++ b/engine/version/old_records_cleaner.hpp @@ -48,7 +48,7 @@ class OldRecordsCleaner { 
void TryGlobalClean(); private: - using PendingQueue = std::deque>; + using PendingQueue = std::deque>; struct CleanerThreadCache { std::deque pending_free_space_entries{}; From bf2d10eec97c9fa65d9b032249e0d79067547857 Mon Sep 17 00:00:00 2001 From: ZiyanShi Date: Fri, 16 Sep 2022 01:57:04 -0400 Subject: [PATCH 06/23] Add cmake option Signed-off-by: ZiyanShi --- CMakeLists.txt | 19 ++++++++++++++----- benchmark/bench.cpp | 15 +++++++++++++++ engine/kv_engine.hpp | 9 ++++++++- include/kvdk/engine.hpp | 2 ++ include/kvdk/iterator.hpp | 2 ++ 5 files changed, 41 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 249dcd2a..caa8b7b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,8 +11,9 @@ include(GNUInstallDirs) set(CMAKE_CXX_STANDARD 11) option(COVERAGE "code coverage" OFF) +option(KVDK_ENABLE_VHASH "Enable experimental VHash in KVDK" ON) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mrdseed -mrdrnd -mclwb -mclflushopt -mlzcnt -mbmi -mavx512bw -mavx512vl") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mrdseed -mrdrnd -mclwb -mclflushopt") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") if (CMAKE_BUILD_TYPE STREQUAL "Release") @@ -52,9 +53,6 @@ set(SOURCES engine/c/kvdk_list.cpp engine/c/kvdk_sorted.cpp engine/c/kvdk_string.cpp - engine/experimental/vhash_kv.cpp - engine/experimental/vhash.cpp - engine/experimental/vhash_group.cpp engine/utils/utils.cpp engine/utils/sync_point.cpp engine/engine.cpp @@ -64,7 +62,6 @@ set(SOURCES engine/kv_engine_list.cpp engine/kv_engine_sorted.cpp engine/kv_engine_string.cpp - engine/kv_engine_vhash.cpp engine/logger.cpp engine/hash_table.cpp engine/sorted_collection/skiplist.cpp @@ -79,7 +76,19 @@ set(SOURCES engine/data_record.cpp engine/dl_list.cpp engine/version/old_records_cleaner.cpp + ) + +if (KVDK_ENABLE_VHASH) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlzcnt -mbmi -mavx512bw -mavx512vl") + add_compile_definitions(KVDK_ENABLE_VHASH) + set(SOURCES + ${SOURCES} + engine/kv_engine_vhash.cpp 
+ engine/experimental/vhash_kv.cpp + engine/experimental/vhash.cpp + engine/experimental/vhash_group.cpp ) +endif() # .so library add_library(engine SHARED ${SOURCES}) diff --git a/benchmark/bench.cpp b/benchmark/bench.cpp index b13fba3c..54c4c313 100644 --- a/benchmark/bench.cpp +++ b/benchmark/bench.cpp @@ -180,6 +180,7 @@ size_t generate_value_size(size_t tid) { } } +#ifdef KVDK_ENABLE_VHASH void FillVHash(size_t tid) { std::string key(8, ' '); for (size_t i = 0; i < FLAGS_num_kv / FLAGS_num_collection; ++i) { @@ -195,6 +196,7 @@ void FillVHash(size_t tid) { } } } +#endif void DBWrite(int tid) { std::string key(8, ' '); @@ -263,7 +265,11 @@ void DBWrite(int tid) { break; } case DataType::VHash: { +#ifdef KVDK_ENABLE_VHASH s = engine->VHashPut(collections[cid], key, value); +#else + s = Status::NotSupported; +#endif break; } case DataType::Blackhole: { @@ -403,7 +409,11 @@ void DBRead(int tid) { break; } case DataType::VHash: { +#ifdef KVDK_ENABLE_VHASH s = engine->VHashGet(collections[cid], key, &value_sink); +#else + s = Status::NotSupported; +#endif break; } case DataType::Blackhole: { @@ -606,6 +616,7 @@ int main(int argc, char** argv) { break; } case DataType::VHash: { +#ifdef KVDK_ENABLE_VHASH for (auto col : collections) { Status s = engine->VHashCreate(col, FLAGS_num_kv / FLAGS_num_collection); @@ -614,6 +625,10 @@ int main(int argc, char** argv) { } } LaunchNThreads(FLAGS_threads, FillVHash); +#else + throw std::runtime_error{"VHash not supported!"}; +#endif + break; } default: { diff --git a/engine/kv_engine.hpp b/engine/kv_engine.hpp index 9f0cdba7..3a930c2e 100644 --- a/engine/kv_engine.hpp +++ b/engine/kv_engine.hpp @@ -22,7 +22,6 @@ #include "alias.hpp" #include "data_record.hpp" #include "dram_allocator.hpp" -#include "experimental/vhash_group.hpp" #include "hash_collection/hash_list.hpp" #include "hash_collection/rebuilder.hpp" #include "hash_table.hpp" @@ -41,6 +40,10 @@ #include "version/version_controller.hpp" #include "write_batch_impl.hpp" 
+#ifdef KVDK_ENABLE_VHASH +#include "experimental/vhash_group.hpp" +#endif + namespace KVDK_NAMESPACE { class KVEngine : public Engine { friend class SortedCollectionRebuilder; @@ -157,6 +160,7 @@ class KVEngine : public Engine { Status* s) final; void HashIteratorRelease(HashIterator*) final; +#ifdef KVDK_ENABLE_VHASH // Volatile Hash Status VHashCreate(StringView key, size_t capacity) final; Status VHashDestroy(StringView key) final; @@ -168,6 +172,7 @@ class KVEngine : public Engine { void* cb_args) final; std::unique_ptr VHashIteratorCreate(StringView key, Status* s) final; +#endif // BatchWrite // It takes 3 stages @@ -613,8 +618,10 @@ class KVEngine : public Engine { OldRecordsCleaner old_records_cleaner_; Cleaner cleaner_; +#ifdef KVDK_ENABLE_VHASH CharAllocator char_alloc_; VHashGroup vhashes_{char_alloc_, old_records_cleaner_}; +#endif ComparatorTable comparators_; diff --git a/include/kvdk/engine.hpp b/include/kvdk/engine.hpp index dd173687..e076ed4e 100644 --- a/include/kvdk/engine.hpp +++ b/include/kvdk/engine.hpp @@ -337,6 +337,7 @@ class Engine { /// Volatile Hash APIs ////////////////////////////////////////////////////// +#ifdef KVDK_ENABLE_VHASH virtual Status VHashCreate(StringView key, size_t capacity = (1UL << 20)) = 0; virtual Status VHashDestroy(StringView key) = 0; virtual Status VHashSize(StringView key, size_t* len) = 0; @@ -349,6 +350,7 @@ class Engine { ModifyFunc modify_func, void* cb_args) = 0; virtual std::unique_ptr VHashIteratorCreate(StringView key, Status* s) = 0; +#endif /// Other /////////////////////////////////////////////////////////////////// diff --git a/include/kvdk/iterator.hpp b/include/kvdk/iterator.hpp index 3fd1c76b..2eeab302 100644 --- a/include/kvdk/iterator.hpp +++ b/include/kvdk/iterator.hpp @@ -79,6 +79,7 @@ class HashIterator { virtual ~HashIterator() = default; }; +#ifdef KVDK_ENABLE_VHASH class VHashIterator { public: virtual void SeekToFirst() = 0; @@ -93,5 +94,6 @@ class VHashIterator { virtual 
~VHashIterator() = default; }; +#endif } // namespace KVDK_NAMESPACE \ No newline at end of file From 81bc3981aa7481446bd40bae4e86a6d1a3fe2fc8 Mon Sep 17 00:00:00 2001 From: ZiyanShi Date: Fri, 16 Sep 2022 02:55:08 -0400 Subject: [PATCH 07/23] bug fix Signed-off-by: ZiyanShi --- CMakeLists.txt | 3 ++- benchmark/bench.cpp | 21 +++++++++++++++++++++ engine/experimental/hashptr_map.hpp | 2 ++ engine/experimental/vhash.cpp | 1 - engine/experimental/vhash.hpp | 6 ++++-- include/kvdk/engine.hpp | 4 ++-- scripts/benchmark_impl.py | 3 ++- scripts/run_benchmark.py | 9 +++++---- 8 files changed, 38 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index caa8b7b2..dbb50f02 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,8 @@ set(KVDK_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}) include(${KVDK_ROOT_DIR}/cmake/functions.cmake) include(GNUInstallDirs) -set(CMAKE_CXX_STANDARD 11) +# set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) option(COVERAGE "code coverage" OFF) option(KVDK_ENABLE_VHASH "Enable experimental VHash in KVDK" ON) diff --git a/benchmark/bench.cpp b/benchmark/bench.cpp index 54c4c313..186cbb13 100644 --- a/benchmark/bench.cpp +++ b/benchmark/bench.cpp @@ -355,6 +355,19 @@ void DBScan(int tid) { engine->HashIteratorRelease(iter); break; } + case DataType::VHash: { + auto iter = engine->VHashIteratorCreate(collections[cid]); + if (!iter) throw std::runtime_error{"Fail creating VHashIterator"}; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + key = iter->Key(); + value_sink = iter->Value(); + ++operations; + if (operations > operations_counted + 1000) { + read_ops += (operations - operations_counted); + operations_counted = operations; + } + } + } case DataType::Blackhole: { operations += 1024; read_ops.fetch_add(1024); @@ -532,6 +545,14 @@ void ProcessBenchmarkConfigs() { throw std::invalid_argument{"Invalid key distribution"}; } } + if (bench_data_type == DataType::VHash) { + // Vhash needs fill for read and update 
benchmarks + operations_per_thread = FLAGS_num_kv / FLAGS_max_access_threads + 1; + for (int i = 0; i < FLAGS_max_access_threads; i++) { + ranges.emplace_back(i * operations_per_thread, + (i + 1) * operations_per_thread); + } + } if (FLAGS_value_size_distribution == "constant") { vsz_dist = ValueSizeDistribution::Constant; diff --git a/engine/experimental/hashptr_map.hpp b/engine/experimental/hashptr_map.hpp index 80463306..db659bb8 100644 --- a/engine/experimental/hashptr_map.hpp +++ b/engine/experimental/hashptr_map.hpp @@ -26,6 +26,8 @@ #include "../macros.hpp" #define dynamic_assert kvdk_assert +/// TODO: allocator for aligned allocation to work under C++11 + namespace KVDK_NAMESPACE { // Internal helpers diff --git a/engine/experimental/vhash.cpp b/engine/experimental/vhash.cpp index fe482169..24c4fa70 100644 --- a/engine/experimental/vhash.cpp +++ b/engine/experimental/vhash.cpp @@ -102,7 +102,6 @@ std::string VHash::Iterator::Value() const { } std::unique_ptr VHash::MakeIterator() { - ref_cnt.fetch_add(1UL); // Initialized to end() iterator without acquiring lock. 
return std::unique_ptr{new Iterator{*this, hpmap.end()}}; } diff --git a/engine/experimental/vhash.hpp b/engine/experimental/vhash.hpp index 8adde27e..c50d84ea 100644 --- a/engine/experimental/vhash.hpp +++ b/engine/experimental/vhash.hpp @@ -63,14 +63,16 @@ class VHash { bool Valid() const final; std::string Key() const final; std::string Value() const final; - virtual ~Iterator() = default; + virtual ~Iterator() { owner.ref_cnt--; } private: friend VHash; using rep = typename decltype(hpmap)::iterator; VHash& owner; rep pos; - Iterator(VHash& o, rep&& p) : owner{o}, pos{std::move(p)} {} + Iterator(VHash& o, rep&& p) : owner{o}, pos{std::move(p)} { + owner.ref_cnt++; + } }; std::unique_ptr MakeIterator(); diff --git a/include/kvdk/engine.hpp b/include/kvdk/engine.hpp index e076ed4e..e64f79b0 100644 --- a/include/kvdk/engine.hpp +++ b/include/kvdk/engine.hpp @@ -348,8 +348,8 @@ class Engine { virtual Status VHashDelete(StringView key, StringView field) = 0; virtual Status VHashModify(StringView key, StringView field, ModifyFunc modify_func, void* cb_args) = 0; - virtual std::unique_ptr VHashIteratorCreate(StringView key, - Status* s) = 0; + virtual std::unique_ptr VHashIteratorCreate( + StringView key, Status* s = nullptr) = 0; #endif /// Other /////////////////////////////////////////////////////////////////// diff --git a/scripts/benchmark_impl.py b/scripts/benchmark_impl.py index a2111477..2a5e6933 100644 --- a/scripts/benchmark_impl.py +++ b/scripts/benchmark_impl.py @@ -9,7 +9,7 @@ def __fill(exec, shared_para, data_type, report_path): - if data_type == 'vhash' + if data_type == "vhash": return new_para = shared_para + " -fill=1 -type={}".format(data_type) report = report_path + "_fill" @@ -143,6 +143,7 @@ def run_benchmark( timestamp) os.system("mkdir -p {}".format(report_path)) + populate_on_fill = populate_on_fill if (data_type != "vhash") else 0 # run benchmarks print("Run benchmarks for data type :{}, value size distribution: {}".format( data_type, 
value_size_distribution)) diff --git a/scripts/run_benchmark.py b/scripts/run_benchmark.py index 9439a894..18bce9b8 100644 --- a/scripts/run_benchmark.py +++ b/scripts/run_benchmark.py @@ -7,14 +7,15 @@ bin = "../build/bench" exec = "numactl --cpunodebind={0} --membind={0} {1}".format(numanode, bin) -num_thread = 64 -value_sizes = [120] +num_thread = 48 +value_sizes = [24] # constant: value size always be "value_size", # random: value size uniformly distributed in [1, value_size] value_size_distributions = ['constant'] timeout = 30 # For operations other than fill -populate_on_fill = 1 # For fill only -pmem_size = 384 * 1024 * 1024 * 1024 # we need enough space to test insert +populate_on_fill = 0 # For fill only +# pmem_size = 384 * 1024 * 1024 * 1024 # we need enough space to test insert +pmem_size = 64 * 1024 * 1024 * 1024 num_collection = 16 benchmarks = [ From 8648bff7c30c35029aeebc06f61a74433ce5c188 Mon Sep 17 00:00:00 2001 From: iyupeng Date: Thu, 22 Sep 2022 10:46:34 +0800 Subject: [PATCH 08/23] Switch CI environment to ubuntu-20.04 since 18.04 is deprecated (#337) Signed-off-by: Yu, Peng --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2420c0d6..31e3948f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,7 +4,7 @@ on: [push, pull_request] jobs: build: - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 From 63f706110d4fb690f41edda73867901907eaf823 Mon Sep 17 00:00:00 2001 From: Jiayu Wu Date: Fri, 23 Sep 2022 15:59:41 +0800 Subject: [PATCH 09/23] Return not found while delete keys from a non-existed sorted collection (#334) Signed-off-by: Jiayu Wu --- engine/kv_engine_sorted.cpp | 5 +++-- include/kvdk/engine.hpp | 15 +++++++++------ java/src/test/java/io/pmem/kvdk/EngineTest.java | 9 +++++++-- tests/tests.cpp | 2 +- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git 
a/engine/kv_engine_sorted.cpp b/engine/kv_engine_sorted.cpp index ab2d573d..99488719 100644 --- a/engine/kv_engine_sorted.cpp +++ b/engine/kv_engine_sorted.cpp @@ -198,8 +198,9 @@ Status KVEngine::SortedDelete(const StringView collection, Skiplist* skiplist = nullptr; auto ret = lookupKey(collection, RecordType::SortedHeader); if (ret.s != Status::Ok) { - return (ret.s == Status::Outdated || ret.s == Status::NotFound) ? Status::Ok - : ret.s; + return (ret.s == Status::Outdated || ret.s == Status::NotFound) + ? Status::NotFound + : ret.s; } kvdk_assert(ret.entry.GetIndexType() == PointerType::Skiplist, diff --git a/include/kvdk/engine.hpp b/include/kvdk/engine.hpp index fcc1880b..002b06d8 100644 --- a/include/kvdk/engine.hpp +++ b/include/kvdk/engine.hpp @@ -140,8 +140,11 @@ class Engine { std::string* value) = 0; // Remove SORTED-type KV of "key" in the sorted collection "collection". - // Return Ok on success or the "collection"/"key" did not exist, return non-Ok - // on any error. + // Return: + // Status::Ok on success or key not existed in collection + // Status::NotFound if collection not exist + // Status::WrongType if collection exists but is not a sorted collection + // Status::PMemOverflow if PMem exhausted. virtual Status SortedDelete(const StringView collection, const StringView key) = 0; @@ -149,7 +152,7 @@ class Engine { // Create an empty List. // Return: - // Status::WrongType if list name is not a List. + // Status::WrongType if list name existed but is not a List. // Status::Existed if a List named list already exists. // Status::PMemOverflow if PMem exhausted. // Status::Ok if successfully created the List. @@ -157,9 +160,9 @@ class Engine { // Destroy a List associated with key // Return: - // Status::WrongType if list name is not a List. - // Status::Ok if successfully destroyed the List. - // Status::NotFound if list does not exist. + // Status::WrongType if list name is not a List. 
+ // Status::Ok if successfully destroyed the List or the List not existed + // Status::PMemOverflow if PMem exhausted virtual Status ListDestroy(StringView list) = 0; // Total elements in List. diff --git a/java/src/test/java/io/pmem/kvdk/EngineTest.java b/java/src/test/java/io/pmem/kvdk/EngineTest.java index ea6cab42..73683ee4 100644 --- a/java/src/test/java/io/pmem/kvdk/EngineTest.java +++ b/java/src/test/java/io/pmem/kvdk/EngineTest.java @@ -90,8 +90,13 @@ public void testSortedCollection() throws KVDKException { // destroy destroyed sorted collection: OK kvdkEngine.sortedDestroy(nameHandle); - // delete on destroyed sorted collection: OK - kvdkEngine.sortedDelete(nameHandle, key.getBytes()); + // delete on destroyed sorted collection: Not OK + try { + kvdkEngine.sortedDelete(nameHandle, key.getBytes()); + } catch (KVDKException ex) { + // should be NotFound + assertEquals(ex.getStatus().getCode(), Code.NotFound); + } // put on destroyed sorted collection: Not OK try { diff --git a/tests/tests.cpp b/tests/tests.cpp index f9be5ae5..f3ec1ce7 100644 --- a/tests/tests.cpp +++ b/tests/tests.cpp @@ -315,7 +315,7 @@ class EngineBasicTest : public testing::Test { ASSERT_EQ(DestroyFunc(collection), Status::Ok); ASSERT_EQ(PutFunc(collection, key, val), Status::NotFound); ASSERT_EQ(GetFunc(collection, key, &got_val), Status::NotFound); - ASSERT_EQ(DeleteFunc(collection, key), Status::Ok); + ASSERT_EQ(DeleteFunc(collection, key), Status::NotFound); } void createBasicOperationTest(const std::string& collection, From 7f01a4fad54820736cd647b53e719dffe174d6f2 Mon Sep 17 00:00:00 2001 From: iyupeng Date: Wed, 28 Sep 2022 09:15:42 +0800 Subject: [PATCH 10/23] Check Java code style in Github CI (#342) Signed-off-by: Yu, Peng --- .github/workflows/ci.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 31e3948f..33909e10 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,6 +9,11 @@ jobs: steps: 
- uses: actions/checkout@v2 + - name: Check Java codestyle + run: | + cd java + mvn spotless:check + - name: Get cmake uses: lukka/get-cmake@v3.19.2 From 627b9266b952fb1d68cc93f6ccb30dfa2b1807dd Mon Sep 17 00:00:00 2001 From: iyupeng Date: Wed, 28 Sep 2022 09:49:33 +0800 Subject: [PATCH 11/23] Fix compare_string_view with unsigned char comparison (#340) Signed-off-by: Yu, Peng --- .github/workflows/ci.yml | 4 +++- engine/utils/utils.hpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 33909e10..36850fa0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,7 +18,9 @@ jobs: uses: lukka/get-cmake@v3.19.2 - name: Install packages - run: sudo apt install make clang-format-9 pkg-config g++ autoconf libtool asciidoctor libkmod-dev libudev-dev uuid-dev libjson-c-dev libkeyutils-dev pandoc libhwloc-dev libgflags-dev libtext-diff-perl bash-completion systemd wget git + run: | + sudo apt update + sudo apt install make clang-format-9 pkg-config g++ autoconf libtool asciidoctor libkmod-dev libudev-dev uuid-dev libjson-c-dev libkeyutils-dev pandoc libhwloc-dev libgflags-dev libtext-diff-perl bash-completion systemd wget git - name: Install ndctl run: | diff --git a/engine/utils/utils.hpp b/engine/utils/utils.hpp index 39cd6062..3926727a 100644 --- a/engine/utils/utils.hpp +++ b/engine/utils/utils.hpp @@ -155,7 +155,7 @@ inline int compare_string_view(const StringView& src, auto size = std::min(src.size(), target.size()); for (uint32_t i = 0; i < size; i++) { if (src[i] != target[i]) { - return src[i] - target[i]; + return (unsigned char)src[i] - (unsigned char)target[i]; } } return src.size() - target.size(); From 78f1afd8d713246e6d8b432b29d2aa5e8d9272ad Mon Sep 17 00:00:00 2001 From: Jiayu Wu Date: Wed, 28 Sep 2022 16:49:54 +0800 Subject: [PATCH 12/23] Refactor access thread, remove limitation on foreground thread and instance number (#341) Signed-off-by: Jiayu Wu --- 
CMakeLists.txt | 2 + benchmark/bench.cpp | 9 +- engine/c/kvdk_basic_op.cpp | 4 - engine/dram_allocator.cpp | 19 ++-- engine/hash_collection/rebuilder.hpp | 30 +++--- engine/hash_table.cpp | 8 +- engine/kv_engine.cpp | 61 +++++------- engine/kv_engine.hpp | 53 ++++++++-- engine/kv_engine_cleaner.cpp | 10 +- engine/kv_engine_hash.cpp | 48 +++------ engine/kv_engine_list.cpp | 97 ++++++------------- engine/kv_engine_sorted.cpp | 36 ++----- engine/kv_engine_string.cpp | 22 +---- engine/list_collection/rebuilder.hpp | 26 ++--- engine/pmem_allocator/free_list.cpp | 9 +- engine/pmem_allocator/pmem_allocator.cpp | 15 +-- engine/pmem_allocator/pmem_allocator.hpp | 12 ++- engine/sorted_collection/rebuilder.cpp | 24 ++--- engine/sorted_collection/rebuilder.hpp | 13 ++- engine/thread_manager.cpp | 47 +++++---- engine/thread_manager.hpp | 31 +++--- engine/version/version_controller.hpp | 43 ++++---- include/kvdk/configs.hpp | 10 +- include/kvdk/engine.h | 1 - include/kvdk/engine.hpp | 4 - include/kvdk/types.h | 1 - .../io/pmem/kvdk/benchmark/KVDKBenchmark.java | 1 - java/kvdkjni/engine.cc | 11 --- java/kvdkjni/kvdkjni.h | 14 ++- java/src/main/java/io/pmem/kvdk/Engine.java | 6 -- java/src/main/java/io/pmem/kvdk/Status.java | 13 ++- .../java/io/pmem/kvdk/EngineTestBase.java | 1 - tests/allocator.hpp | 7 -- tests/pmem_allocator_bench.cpp | 3 - tests/test_pmem_allocator.cpp | 23 +---- tests/tests.cpp | 72 +------------- 36 files changed, 305 insertions(+), 481 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3b5a73ab..9b24bbfb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,8 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mrdseed -mrdrnd -mclwb -mclfl set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") if (CMAKE_BUILD_TYPE STREQUAL "Release") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2") elseif (CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") elseif (CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") elseif (CMAKE_BUILD_TYPE 
STREQUAL "Debug") diff --git a/benchmark/bench.cpp b/benchmark/bench.cpp index 93cf6031..6593bdb6 100644 --- a/benchmark/bench.cpp +++ b/benchmark/bench.cpp @@ -78,7 +78,7 @@ DEFINE_bool( "Populate pmem space while creating a new instance. This can improve write " "performance in runtime, but will take long time to init the instance"); -DEFINE_int32(max_access_threads, 32, "Max access threads of the instance"); +DEFINE_uint64(max_access_threads, 64, "Max access threads of the instance"); DEFINE_uint64(space, (256ULL << 30), "Max usable PMem space of the instance"); @@ -460,8 +460,8 @@ void ProcessBenchmarkConfigs() { if (FLAGS_fill) { assert(FLAGS_read_ratio == 0); key_dist = KeyDistribution::Range; - operations_per_thread = FLAGS_num_kv / FLAGS_max_access_threads + 1; - for (int i = 0; i < FLAGS_max_access_threads; i++) { + operations_per_thread = FLAGS_num_kv / FLAGS_threads + 1; + for (size_t i = 0; i < FLAGS_threads; i++) { ranges.emplace_back(i * operations_per_thread, (i + 1) * operations_per_thread); } @@ -535,7 +535,6 @@ int main(int argc, char** argv) { throw std::runtime_error{"Fail to create Sorted collection"}; } } - engine->ReleaseAccessThread(); break; } case DataType::Hashes: { @@ -545,7 +544,6 @@ int main(int argc, char** argv) { throw std::runtime_error{"Fail to create Hashset"}; } } - engine->ReleaseAccessThread(); break; } case DataType::List: { @@ -555,7 +553,6 @@ int main(int argc, char** argv) { throw std::runtime_error{"Fail to create List"}; } } - engine->ReleaseAccessThread(); break; } default: { diff --git a/engine/c/kvdk_basic_op.cpp b/engine/c/kvdk_basic_op.cpp index 81b2bfdb..03ec0586 100644 --- a/engine/c/kvdk_basic_op.cpp +++ b/engine/c/kvdk_basic_op.cpp @@ -96,10 +96,6 @@ KVDKStatus KVDKRestore(const char* name, const char* backup_log, return s; } -void KVDKReleaseAccessThread(KVDKEngine* engine) { - engine->rep->ReleaseAccessThread(); -} - KVDKSnapshot* KVDKGetSnapshot(KVDKEngine* engine, int make_checkpoint) { KVDKSnapshot* 
snapshot = new KVDKSnapshot; snapshot->rep = engine->rep->GetSnapshot(make_checkpoint); diff --git a/engine/dram_allocator.cpp b/engine/dram_allocator.cpp index 67ea8e75..87fdcfdf 100644 --- a/engine/dram_allocator.cpp +++ b/engine/dram_allocator.cpp @@ -13,31 +13,34 @@ void ChunkBasedAllocator::Free(const SpaceEntry&) { } SpaceEntry ChunkBasedAllocator::Allocate(uint64_t size) { + kvdk_assert(ThreadManager::ThreadID() >= 0, ""); SpaceEntry entry; + auto& tc = dalloc_thread_cache_[ThreadManager::ThreadID() % + dalloc_thread_cache_.size()]; if (size > chunk_size_) { void* addr = aligned_alloc(64, size); if (addr != nullptr) { entry.size = chunk_size_; entry.offset = addr2offset(addr); - dalloc_thread_cache_[access_thread.id].allocated_chunks.push_back(addr); + tc.allocated_chunks.push_back(addr); } return entry; } - if (dalloc_thread_cache_[access_thread.id].usable_bytes < size) { + if (tc.usable_bytes < size) { void* addr = aligned_alloc(64, chunk_size_); if (addr == nullptr) { return entry; } - dalloc_thread_cache_[access_thread.id].chunk_addr = (char*)addr; - dalloc_thread_cache_[access_thread.id].usable_bytes = chunk_size_; - dalloc_thread_cache_[access_thread.id].allocated_chunks.push_back(addr); + tc.chunk_addr = (char*)addr; + tc.usable_bytes = chunk_size_; + tc.allocated_chunks.push_back(addr); } entry.size = size; - entry.offset = addr2offset(dalloc_thread_cache_[access_thread.id].chunk_addr); - dalloc_thread_cache_[access_thread.id].chunk_addr += size; - dalloc_thread_cache_[access_thread.id].usable_bytes -= size; + entry.offset = addr2offset(tc.chunk_addr); + tc.chunk_addr += size; + tc.usable_bytes -= size; return entry; } } // namespace KVDK_NAMESPACE \ No newline at end of file diff --git a/engine/hash_collection/rebuilder.hpp b/engine/hash_collection/rebuilder.hpp index 8388478f..b8b114fa 100644 --- a/engine/hash_collection/rebuilder.hpp +++ b/engine/hash_collection/rebuilder.hpp @@ -23,14 +23,13 @@ class HashListRebuilder { }; 
HashListRebuilder(PMEMAllocator* pmem_allocator, HashTable* hash_table, - LockTable* lock_table, ThreadManager* thread_manager, - uint64_t num_rebuild_threads, const CheckPoint& checkpoint) + LockTable* lock_table, uint64_t num_rebuild_threads, + const CheckPoint& checkpoint) : recovery_utils_(pmem_allocator), rebuilder_thread_cache_(num_rebuild_threads), pmem_allocator_(pmem_allocator), hash_table_(hash_table), lock_table_(lock_table), - thread_manager_(thread_manager), num_rebuild_threads_(num_rebuild_threads), checkpoint_(checkpoint) {} @@ -122,11 +121,6 @@ class HashListRebuilder { bool recoverToCheckPoint() { return checkpoint_.Valid(); } Status initRebuildLists() { - Status s = thread_manager_->MaybeInitThread(access_thread); - if (s != Status::Ok) { - return s; - } - // Keep headers with same id together for recognize outdated ones auto cmp = [](const DLRecord* header1, const DLRecord* header2) { auto id1 = HashList::FetchID(header1); @@ -258,11 +252,8 @@ class HashListRebuilder { } Status rebuildIndex(HashList* hlist) { - Status s = thread_manager_->MaybeInitThread(access_thread); - if (s != Status::Ok) { - return s; - } - defer(thread_manager_->Release(access_thread)); + this_thread.id = next_tid_.fetch_add(1); + size_t num_elems = 0; auto iter = hlist->GetDLList()->GetRecordIterator(); @@ -314,9 +305,10 @@ class HashListRebuilder { } void addUnlinkedRecord(DLRecord* pmem_record) { - assert(access_thread.id >= 0); - rebuilder_thread_cache_[access_thread.id].unlinked_records.push_back( - pmem_record); + kvdk_assert(ThreadManager::ThreadID() >= 0, ""); + rebuilder_thread_cache_[ThreadManager::ThreadID() % + rebuilder_thread_cache_.size()] + .unlinked_records.push_back(pmem_record); } void cleanInvalidRecords() { @@ -354,7 +346,6 @@ class HashListRebuilder { PMEMAllocator* pmem_allocator_; HashTable* hash_table_; LockTable* lock_table_; - ThreadManager* thread_manager_; const size_t num_rebuild_threads_; CheckPoint checkpoint_; SpinMutex lock_; @@ -363,5 
+354,10 @@ class HashListRebuilder { std::unordered_map> rebuild_hlists_; CollectionIDType max_recovered_id_; + + // We manually allocate recovery thread id for no conflict in multi-thread + // recovering + // Todo: do not hard code + std::atomic next_tid_{0}; }; } // namespace KVDK_NAMESPACE \ No newline at end of file diff --git a/engine/hash_table.cpp b/engine/hash_table.cpp index e6ec2a0e..5cc5986f 100644 --- a/engine/hash_table.cpp +++ b/engine/hash_table.cpp @@ -174,10 +174,10 @@ HashTable::LookupResult HashTable::Insert(const StringView& key, } Status HashTable::allocateEntry(HashBucketIterator& bucket_iter) { - kvdk_assert(bucket_iter.hash_table_ == this && - bucket_iter.entry_idx_ == - hash_bucket_entries_[bucket_iter.bucket_idx_], - "Only allocate new hash entry at end of hash bucket"); + kvdk_assert(bucket_iter.hash_table_ == this, ""); + kvdk_assert( + bucket_iter.entry_idx_ == hash_bucket_entries_[bucket_iter.bucket_idx_], + "Only allocate new hash entry at end of hash bucket"); assert(bucket_iter.bucket_ptr_ != nullptr); if (hash_bucket_entries_[bucket_iter.bucket_idx_] > 0 && hash_bucket_entries_[bucket_iter.bucket_idx_] % kNumEntryPerBucket == 0) { diff --git a/engine/kv_engine.cpp b/engine/kv_engine.cpp index 4f3af3ab..e177dbe6 100644 --- a/engine/kv_engine.cpp +++ b/engine/kv_engine.cpp @@ -189,14 +189,12 @@ Status KVEngine::init(const std::string& name, const Configs& configs) { configs_.pmem_block_size, configs_.max_access_threads, configs_.populate_pmem_space, configs_.use_devdax_mode, &version_controller_)); - thread_manager_.reset(new (std::nothrow) - ThreadManager(configs_.max_access_threads)); hash_table_.reset(HashTable::NewHashTable( configs_.hash_bucket_num, configs_.num_buckets_per_slot, pmem_allocator_.get(), configs_.max_access_threads)); dllist_locks_.reset(new LockTable{1UL << 20}); if (pmem_allocator_ == nullptr || hash_table_ == nullptr || - thread_manager_ == nullptr || dllist_locks_ == nullptr) { + dllist_locks_ == nullptr) { 
GlobalLogger.Error("Init kvdk basic components error\n"); return Status::Abort; } @@ -208,16 +206,16 @@ Status KVEngine::init(const std::string& name, const Configs& configs) { } Status KVEngine::restoreData() { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } + this_thread.id = next_recovery_tid_.fetch_add(1); + EngineThreadCache& engine_thread_cache = - engine_thread_cache_[access_thread.id]; + engine_thread_cache_[ThreadManager::ThreadID() % + configs_.max_access_threads]; SpaceEntry segment_recovering; DataEntry data_entry_cached; uint64_t cnt = 0; + Status s; while (true) { if (segment_recovering.size == 0) { if (!pmem_allocator_->FetchSegment(&segment_recovering)) { @@ -342,7 +340,6 @@ Status KVEngine::restoreData() { } } restored_.fetch_add(cnt); - ReleaseAccessThread(); return s; } @@ -548,13 +545,8 @@ Status KVEngine::initOrRestoreCheckpoint() { Status KVEngine::restoreDataFromBackup(const std::string& backup_log) { // TODO: make this multi-thread - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } - defer(ReleaseAccessThread()); BackupLog backup; - s = backup.Open(backup_log); + Status s = backup.Open(backup_log); if (s != Status::Ok) { return s; } @@ -715,20 +707,15 @@ Status KVEngine::restoreDataFromBackup(const std::string& backup_log) { } Status KVEngine::restoreExistingData() { - access_thread.id = 0; - defer(access_thread.id = -1); - sorted_rebuilder_.reset(new SortedCollectionRebuilder( this, configs_.opt_large_sorted_collection_recovery, configs_.max_access_threads, *persist_checkpoint_)); - hash_rebuilder_.reset( - new HashListRebuilder(pmem_allocator_.get(), hash_table_.get(), - dllist_locks_.get(), thread_manager_.get(), - configs_.max_access_threads, *persist_checkpoint_)); - list_rebuilder_.reset( - new ListRebuilder(pmem_allocator_.get(), hash_table_.get(), - dllist_locks_.get(), thread_manager_.get(), - configs_.max_access_threads, *persist_checkpoint_)); + hash_rebuilder_.reset(new 
HashListRebuilder( + pmem_allocator_.get(), hash_table_.get(), dllist_locks_.get(), + configs_.max_access_threads, *persist_checkpoint_)); + list_rebuilder_.reset(new ListRebuilder( + pmem_allocator_.get(), hash_table_.get(), dllist_locks_.get(), + configs_.max_access_threads, *persist_checkpoint_)); Status s = batchWriteRollbackLogs(); if (s != Status::Ok) { @@ -883,12 +870,13 @@ Status KVEngine::checkConfigs(const Configs& configs) { } Status KVEngine::maybeInitBatchLogFile() { - auto& tc = engine_thread_cache_[access_thread.id]; + kvdk_assert(ThreadManager::ThreadID() >= 0, ""); + auto work_id = ThreadManager::ThreadID() % configs_.max_access_threads; + auto& tc = engine_thread_cache_[work_id]; if (tc.batch_log == nullptr) { int is_pmem; size_t mapped_len; - std::string log_file_name = - batch_log_dir_ + std::to_string(access_thread.id); + std::string log_file_name = batch_log_dir_ + std::to_string(work_id); void* addr = pmem_map_file(log_file_name.c_str(), BatchWriteLog::MaxBytes(), PMEM_FILE_CREATE, 0666, &mapped_len, &is_pmem); if (addr == NULL) { @@ -916,12 +904,9 @@ Status KVEngine::batchWriteImpl(WriteBatchImpl const& batch) { return Status::InvalidBatchSize; } - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); - s = maybeInitBatchLogFile(); + Status s = maybeInitBatchLogFile(); if (s != Status::Ok) { return s; } @@ -1045,7 +1030,8 @@ Status KVEngine::batchWriteImpl(WriteBatchImpl const& batch) { // Preparation done. Persist BatchLog for rollback. 
BatchWriteLog log; log.SetTimestamp(bw_token.Timestamp()); - auto& tc = engine_thread_cache_[access_thread.id]; + auto& tc = engine_thread_cache_[ThreadManager::ThreadID() % + configs_.max_access_threads]; for (auto& args : string_args) { if (args.space.size == 0) { continue; @@ -1291,10 +1277,7 @@ Status KVEngine::TypeOf(StringView key, ValueType* type) { } Status KVEngine::Expire(const StringView str, TTLType ttl_time) { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); int64_t base_time = TimeUtils::millisecond_time(); if (!TimeUtils::CheckTTL(ttl_time, base_time)) { diff --git a/engine/kv_engine.hpp b/engine/kv_engine.hpp index 8b48be29..afdbb8d6 100644 --- a/engine/kv_engine.hpp +++ b/engine/kv_engine.hpp @@ -173,8 +173,6 @@ class KVEngine : public Engine { return std::unique_ptr{new WriteBatchImpl{}}; } - void ReleaseAccessThread() override { access_thread.Release(); } - // For test cases const std::unordered_map>& GetSkiplists() { @@ -189,7 +187,8 @@ class KVEngine : public Engine { friend Cleaner; KVEngine(const Configs& configs) - : engine_thread_cache_(configs.max_access_threads), + : access_thread_cv_(configs.max_access_threads), + engine_thread_cache_(configs.max_access_threads), cleaner_thread_cache_(configs.max_access_threads), version_controller_(configs.max_access_threads), old_records_cleaner_(this, configs.max_access_threads), @@ -222,6 +221,42 @@ class KVEngine : public Engine { SpinMutex mtx; }; + struct AccessThreadCV { + public: + struct Holder { + public: + Holder(AccessThreadCV* cv) : cv_(cv) { cv->Acquire(); } + ~Holder() { cv_->Release(); } + + private: + AccessThreadCV* cv_; + }; + + private: + void Acquire() { + std::unique_lock ul(spin_); + while (holder_id_ != ThreadManager::ThreadID() && holder_id_ != -1) { + cv_.wait(ul); + } + holder_id_ = ThreadManager::ThreadID(); + } + + void Release() { + std::unique_lock ul(spin_); + holder_id_ = -1; + cv_.notify_one(); 
+ } + + int64_t holder_id_ = -1; + SpinMutex spin_; + std::condition_variable_any cv_; + }; + + AccessThreadCV::Holder AcquireAccessThread() { + return AccessThreadCV::Holder(&access_thread_cv_[ThreadManager::ThreadID() % + access_thread_cv_.size()]); + } + bool checkKeySize(const StringView& key) { return key.size() <= UINT16_MAX; } bool checkValueSize(const StringView& value) { @@ -234,10 +269,6 @@ class KVEngine : public Engine { Status hashGetImpl(const StringView& key, std::string* value, uint16_t type_mask); - inline Status maybeInitAccessThread() { - return thread_manager_->MaybeInitThread(access_thread); - } - bool registerComparator(const StringView& collection_name, Comparator comp_func) { return comparators_.RegisterComparator(collection_name, comp_func); @@ -560,6 +591,8 @@ class KVEngine : public Engine { void terminateBackgroundWorks(); + Array access_thread_cv_; + Array engine_thread_cache_; Array cleaner_thread_cache_; @@ -586,7 +619,6 @@ class KVEngine : public Engine { std::string dir_; std::string batch_log_dir_; std::string db_file_; - std::shared_ptr thread_manager_; std::unique_ptr pmem_allocator_; Configs configs_; bool closing_{false}; @@ -619,6 +651,11 @@ class KVEngine : public Engine { BackgroundWorkSignals bg_work_signals_; std::atomic round_robin_id_{0}; + + // We manually allocate recovery thread id for no conflict in multi-thread + // recovering + // Todo: do not hard code + std::atomic next_recovery_tid_{0}; }; } // namespace KVDK_NAMESPACE diff --git a/engine/kv_engine_cleaner.cpp b/engine/kv_engine_cleaner.cpp index ce53a6a8..c240c4c5 100644 --- a/engine/kv_engine_cleaner.cpp +++ b/engine/kv_engine_cleaner.cpp @@ -44,8 +44,9 @@ template void KVEngine::removeAndCacheOutdatedVersion(T* record) { static_assert(std::is_same::value || std::is_same::value); - kvdk_assert(access_thread.id >= 0, ""); - auto& tc = cleaner_thread_cache_[access_thread.id]; + kvdk_assert(ThreadManager::ThreadID() >= 0, ""); + auto& tc = 
cleaner_thread_cache_[ThreadManager::ThreadID() % + configs_.max_access_threads]; if (std::is_same::value) { StringRecord* old_record = removeOutDatedVersion( (StringRecord*)record, version_controller_.GlobalOldestSnapshotTs()); @@ -87,8 +88,9 @@ void KVEngine::cleanOutdatedRecordImpl(T* old_record) { } void KVEngine::tryCleanCachedOutdatedRecord() { - kvdk_assert(access_thread.id >= 0, ""); - auto& tc = cleaner_thread_cache_[access_thread.id]; + kvdk_assert(ThreadManager::ThreadID() >= 0, ""); + auto& tc = cleaner_thread_cache_[ThreadManager::ThreadID() % + configs_.max_access_threads]; // Regularly update local oldest snapshot thread_local uint64_t round = 0; if (++round % kForegroundUpdateSnapshotInterval == 0) { diff --git a/engine/kv_engine_hash.cpp b/engine/kv_engine_hash.cpp index 705b4de6..8523bcc6 100644 --- a/engine/kv_engine_hash.cpp +++ b/engine/kv_engine_hash.cpp @@ -7,10 +7,7 @@ namespace KVDK_NAMESPACE { Status KVEngine::HashCreate(StringView collection) { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); if (!checkKeySize(collection)) { return Status::InvalidDataSize; @@ -60,11 +57,7 @@ Status KVEngine::buildHashlist(const StringView& collection, } Status KVEngine::HashDestroy(StringView collection) { - auto s = maybeInitAccessThread(); - defer(ReleaseAccessThread()); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); if (!checkKeySize(collection)) { return Status::InvalidDataSize; @@ -74,7 +67,7 @@ Status KVEngine::HashDestroy(StringView collection) { auto snapshot_holder = version_controller_.GetLocalSnapshotHolder(); auto new_ts = snapshot_holder.Timestamp(); HashList* hlist; - s = hashListFind(collection, &hlist); + Status s = hashListFind(collection, &hlist); if (s == Status::Ok) { DLRecord* header = hlist->HeaderRecord(); kvdk_assert(header->GetRecordType() == RecordType::HashHeader, ""); @@ -105,9 +98,7 @@ Status 
KVEngine::HashSize(StringView collection, size_t* len) { if (!checkKeySize(collection)) { return Status::InvalidDataSize; } - if (maybeInitAccessThread() != Status::Ok) { - return Status::TooManyAccessThreads; - } + auto thread_holder = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); HashList* hlist; @@ -121,17 +112,13 @@ Status KVEngine::HashSize(StringView collection, size_t* len) { Status KVEngine::HashGet(StringView collection, StringView key, std::string* value) { - Status s = maybeInitAccessThread(); - - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); // Hold current snapshot in this thread auto holder = version_controller_.GetLocalSnapshotHolder(); HashList* hlist; - s = hashListFind(collection, &hlist); + Status s = hashListFind(collection, &hlist); if (s == Status::Ok) { s = hlist->Get(key, value); } @@ -140,17 +127,13 @@ Status KVEngine::HashGet(StringView collection, StringView key, Status KVEngine::HashPut(StringView collection, StringView key, StringView value) { - Status s = maybeInitAccessThread(); - - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); // Hold current snapshot in this thread auto holder = version_controller_.GetLocalSnapshotHolder(); HashList* hlist; - s = hashListFind(collection, &hlist); + Status s = hashListFind(collection, &hlist); if (s == Status::Ok) { std::string collection_key(hlist->InternalKey(key)); if (!checkKeySize(collection_key) || !checkValueSize(value)) { @@ -172,17 +155,13 @@ Status KVEngine::HashPut(StringView collection, StringView key, } Status KVEngine::HashDelete(StringView collection, StringView key) { - Status s = maybeInitAccessThread(); - - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); // Hold current snapshot in this thread auto holder = version_controller_.GetLocalSnapshotHolder(); HashList* hlist; - s = hashListFind(collection, &hlist); + Status s = 
hashListFind(collection, &hlist); if (s == Status::Ok) { std::string collection_key(hlist->InternalKey(key)); if (!checkKeySize(collection_key)) { @@ -204,15 +183,12 @@ Status KVEngine::HashDelete(StringView collection, StringView key) { Status KVEngine::HashModify(StringView collection, StringView key, ModifyFunc modify_func, void* cb_args) { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); // Hold current snapshot in this thread auto holder = version_controller_.GetLocalSnapshotHolder(); HashList* hlist; - s = hashListFind(collection, &hlist); + Status s = hashListFind(collection, &hlist); if (s == Status::Ok) { std::string internal_key(hlist->InternalKey(key)); auto ul = hash_table_->AcquireLock(internal_key); diff --git a/engine/kv_engine_list.cpp b/engine/kv_engine_list.cpp index 0427f525..9b931b5a 100644 --- a/engine/kv_engine_list.cpp +++ b/engine/kv_engine_list.cpp @@ -4,10 +4,7 @@ namespace KVDK_NAMESPACE { Status KVEngine::ListCreate(StringView list_name) { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); if (!checkKeySize(list_name)) { return Status::InvalidDataSize; @@ -59,9 +56,7 @@ Status KVEngine::ListDestroy(StringView collection) { if (!checkKeySize(collection)) { return Status::InvalidDataSize; } - if (maybeInitAccessThread() != Status::Ok) { - return Status::TooManyAccessThreads; - } + auto thread_holder = AcquireAccessThread(); auto ul = hash_table_->AcquireLock(collection); auto snapshot_holder = version_controller_.GetLocalSnapshotHolder(); @@ -98,9 +93,7 @@ Status KVEngine::ListSize(StringView list_name, size_t* sz) { if (!checkKeySize(list_name)) { return Status::InvalidDataSize; } - if (maybeInitAccessThread() != Status::Ok) { - return Status::TooManyAccessThreads; - } + auto thread_holder = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); @@ -118,9 +111,7 @@ 
Status KVEngine::ListPushFront(StringView collection, StringView elem) { if (!checkKeySize(collection) || !checkValueSize(elem)) { return Status::InvalidDataSize; } - if (maybeInitAccessThread() != Status::Ok) { - return Status::TooManyAccessThreads; - } + auto thread_holder = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); List* list; @@ -137,9 +128,7 @@ Status KVEngine::ListPushBack(StringView list_name, StringView elem) { if (!checkKeySize(list_name) || !checkValueSize(elem)) { return Status::InvalidDataSize; } - if (maybeInitAccessThread() != Status::Ok) { - return Status::TooManyAccessThreads; - } + auto thread_holder = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); List* list; @@ -156,9 +145,7 @@ Status KVEngine::ListPopFront(StringView list_name, std::string* elem) { if (!checkKeySize(list_name)) { return Status::InvalidDataSize; } - if (maybeInitAccessThread() != Status::Ok) { - return Status::TooManyAccessThreads; - } + auto thread_holder = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); List* list; @@ -189,9 +176,7 @@ Status KVEngine::ListPopBack(StringView list_name, std::string* elem) { if (!checkKeySize(list_name)) { return Status::InvalidDataSize; } - if (maybeInitAccessThread() != Status::Ok) { - return Status::TooManyAccessThreads; - } + auto thread_holder = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); List* list; @@ -242,11 +227,9 @@ Status KVEngine::ListBatchPushFront(StringView list_name, return Status::InvalidDataSize; } } - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } - s = maybeInitBatchLogFile(); + auto thread_holder = AcquireAccessThread(); + + Status s = maybeInitBatchLogFile(); if (s != Status::Ok) { return s; } @@ -272,11 +255,9 @@ Status KVEngine::ListBatchPushBack(StringView list_name, return Status::InvalidDataSize; } } - Status s = maybeInitAccessThread(); - if (s 
!= Status::Ok) { - return s; - } - s = maybeInitBatchLogFile(); + auto thread_holder = AcquireAccessThread(); + + Status s = maybeInitBatchLogFile(); if (s != Status::Ok) { return s; } @@ -288,11 +269,9 @@ Status KVEngine::ListBatchPopFront(StringView list_name, size_t n, if (!checkKeySize(list_name)) { return Status::InvalidDataSize; } - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } - s = maybeInitBatchLogFile(); + auto thread_holder = AcquireAccessThread(); + + Status s = maybeInitBatchLogFile(); if (s != Status::Ok) { return s; } @@ -304,11 +283,8 @@ Status KVEngine::ListBatchPopBack(StringView list_name, size_t n, if (!checkKeySize(list_name)) { return Status::InvalidDataSize; } - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } - s = maybeInitBatchLogFile(); + auto thread_holder = AcquireAccessThread(); + Status s = maybeInitBatchLogFile(); if (s != Status::Ok) { return s; } @@ -321,12 +297,9 @@ Status KVEngine::ListMove(StringView src, ListPos src_pos, StringView dst, if (!checkKeySize(src) || !checkKeySize(dst)) { return Status::InvalidDataSize; } - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); - s = maybeInitBatchLogFile(); + Status s = maybeInitBatchLogFile(); if (s != Status::Ok) { return s; } @@ -390,7 +363,8 @@ Status KVEngine::ListMove(StringView src, ListPos src_pos, StringView dst, log.ListDelete(pop_args.spaces[0].offset); log.ListEmplace(push_args.spaces[0].offset); - auto& tc = engine_thread_cache_[access_thread.id]; + auto& tc = engine_thread_cache_[ThreadManager::ThreadID() % + configs_.max_access_threads]; log.EncodeTo(tc.batch_log); BatchWriteLog::MarkProcessing(tc.batch_log); @@ -410,9 +384,8 @@ Status KVEngine::ListInsertAt(StringView list_name, StringView elem, if (!checkValueSize(elem)) { return Status::InvalidDataSize; } - if (maybeInitAccessThread() != Status::Ok) { - return Status::TooManyAccessThreads; 
- } + auto thread_holder = AcquireAccessThread(); + auto token = version_controller_.GetLocalSnapshotHolder(); List* list; Status s = listFind(list_name, &list); @@ -429,9 +402,7 @@ Status KVEngine::ListInsertBefore(StringView list_name, StringView elem, if (!checkValueSize(elem)) { return Status::InvalidDataSize; } - if (maybeInitAccessThread() != Status::Ok) { - return Status::TooManyAccessThreads; - } + auto thread_holder = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); List* list; @@ -451,9 +422,7 @@ Status KVEngine::ListInsertAfter(StringView collection, StringView elem, if (!checkValueSize(elem)) { return Status::InvalidDataSize; } - if (maybeInitAccessThread() != Status::Ok) { - return Status::TooManyAccessThreads; - } + auto thread_holder = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); List* list; @@ -469,9 +438,7 @@ Status KVEngine::ListInsertAfter(StringView collection, StringView elem, Status KVEngine::ListErase(StringView list_name, long index, std::string* elem) { - if (maybeInitAccessThread() != Status::Ok) { - return Status::TooManyAccessThreads; - } + auto thread_holder = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); List* list; @@ -499,9 +466,7 @@ Status KVEngine::ListErase(StringView list_name, long index, // Replace the element at pos Status KVEngine::ListReplace(StringView collection, long index, StringView elem) { - if (maybeInitAccessThread() != Status::Ok) { - return Status::TooManyAccessThreads; - } + auto thread_holder = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); List* list; @@ -597,7 +562,8 @@ Status KVEngine::listBatchPushImpl(StringView list_name, ListPos pos, log.ListEmplace(space.offset); } - auto& tc = engine_thread_cache_[access_thread.id]; + auto& tc = engine_thread_cache_[ThreadManager::ThreadID() % + configs_.max_access_threads]; log.EncodeTo(tc.batch_log); 
BatchWriteLog::MarkProcessing(tc.batch_log); @@ -630,7 +596,8 @@ Status KVEngine::listBatchPopImpl(StringView list_name, ListPos pos, size_t n, log.ListDelete(space.offset); } - auto& tc = engine_thread_cache_[access_thread.id]; + auto& tc = engine_thread_cache_[ThreadManager::ThreadID() % + configs_.max_access_threads]; log.EncodeTo(tc.batch_log); BatchWriteLog::MarkProcessing(tc.batch_log); diff --git a/engine/kv_engine_sorted.cpp b/engine/kv_engine_sorted.cpp index 99488719..dfff0fbd 100644 --- a/engine/kv_engine_sorted.cpp +++ b/engine/kv_engine_sorted.cpp @@ -8,11 +8,7 @@ namespace KVDK_NAMESPACE { Status KVEngine::SortedCreate(const StringView collection_name, const SortedCollectionConfigs& s_configs) { - Status s = maybeInitAccessThread(); - defer(ReleaseAccessThread()); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); if (!checkKeySize(collection_name)) { return Status::InvalidDataSize; @@ -74,11 +70,8 @@ Status KVEngine::buildSkiplist(const StringView& collection_name, } Status KVEngine::SortedDestroy(const StringView collection_name) { - auto s = maybeInitAccessThread(); - defer(ReleaseAccessThread()); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); + auto ul = hash_table_->AcquireLock(collection_name); auto snapshot_holder = version_controller_.GetLocalSnapshotHolder(); auto new_ts = snapshot_holder.Timestamp(); @@ -118,11 +111,7 @@ Status KVEngine::SortedDestroy(const StringView collection_name) { } Status KVEngine::SortedSize(const StringView collection, size_t* size) { - Status s = maybeInitAccessThread(); - - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); auto holder = version_controller_.GetLocalSnapshotHolder(); @@ -141,11 +130,7 @@ Status KVEngine::SortedSize(const StringView collection, size_t* size) { Status KVEngine::SortedGet(const StringView collection, const StringView user_key, std::string* value) { - Status s = 
maybeInitAccessThread(); - - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); // Hold current snapshot in this thread auto holder = version_controller_.GetLocalSnapshotHolder(); @@ -166,10 +151,7 @@ Status KVEngine::SortedGet(const StringView collection, Status KVEngine::SortedPut(const StringView collection, const StringView user_key, const StringView value) { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); auto snapshot_holder = version_controller_.GetLocalSnapshotHolder(); @@ -188,10 +170,8 @@ Status KVEngine::SortedPut(const StringView collection, Status KVEngine::SortedDelete(const StringView collection, const StringView user_key) { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); + // Hold current snapshot in this thread auto holder = version_controller_.GetLocalSnapshotHolder(); diff --git a/engine/kv_engine_string.cpp b/engine/kv_engine_string.cpp index bc2b4b87..ef372e58 100644 --- a/engine/kv_engine_string.cpp +++ b/engine/kv_engine_string.cpp @@ -14,10 +14,7 @@ Status KVEngine::Modify(const StringView key, ModifyFunc modify_func, return Status::InvalidArgument; } - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); auto ul = hash_table_->AcquireLock(key); auto holder = version_controller_.GetLocalSnapshotHolder(); @@ -106,10 +103,7 @@ Status KVEngine::Modify(const StringView key, ModifyFunc modify_func, Status KVEngine::Put(const StringView key, const StringView value, const WriteOptions& options) { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); if (!checkKeySize(key) || !checkValueSize(value)) { return Status::InvalidDataSize; @@ -119,11 +113,7 @@ Status KVEngine::Put(const StringView key, const StringView value, } 
Status KVEngine::Get(const StringView key, std::string* value) { - Status s = maybeInitAccessThread(); - - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); if (!checkKeySize(key)) { return Status::InvalidDataSize; @@ -144,11 +134,7 @@ Status KVEngine::Get(const StringView key, std::string* value) { } Status KVEngine::Delete(const StringView key) { - Status s = maybeInitAccessThread(); - - if (s != Status::Ok) { - return s; - } + auto thread_holder = AcquireAccessThread(); if (!checkKeySize(key)) { return Status::InvalidDataSize; diff --git a/engine/list_collection/rebuilder.hpp b/engine/list_collection/rebuilder.hpp index 3f0b49f2..c2cb694c 100644 --- a/engine/list_collection/rebuilder.hpp +++ b/engine/list_collection/rebuilder.hpp @@ -22,14 +22,13 @@ class ListRebuilder { }; ListRebuilder(PMEMAllocator* pmem_allocator, HashTable* hash_table, - LockTable* lock_table, ThreadManager* thread_manager, - uint64_t num_rebuild_threads, const CheckPoint& checkpoint) + LockTable* lock_table, uint64_t num_rebuild_threads, + const CheckPoint& checkpoint) : recovery_utils_(pmem_allocator), rebuilder_thread_cache_(num_rebuild_threads), pmem_allocator_(pmem_allocator), hash_table_(hash_table), lock_table_(lock_table), - thread_manager_(thread_manager), num_rebuild_threads_(num_rebuild_threads), checkpoint_(checkpoint) {} @@ -222,12 +221,10 @@ class ListRebuilder { } Status rebuildIndex(List* list) { + this_thread.id = next_tid_.fetch_add(1); + auto ul = list->AcquireLock(); - Status s = thread_manager_->MaybeInitThread(access_thread); - if (s != Status::Ok) { - return s; - } - defer(thread_manager_->Release(access_thread)); + auto iter = list->GetDLList()->GetRecordIterator(); iter->SeekToFirst(); while (iter->Valid()) { @@ -253,9 +250,10 @@ class ListRebuilder { } void addUnlinkedRecord(DLRecord* pmem_record) { - assert(access_thread.id >= 0); - rebuilder_thread_cache_[access_thread.id].unlinked_records.push_back( - pmem_record); + 
kvdk_assert(ThreadManager::ThreadID() >= 0, ""); + rebuilder_thread_cache_[ThreadManager::ThreadID() % + rebuilder_thread_cache_.size()] + .unlinked_records.push_back(pmem_record); } void cleanInvalidRecords() { @@ -296,12 +294,16 @@ class ListRebuilder { PMEMAllocator* pmem_allocator_; HashTable* hash_table_; LockTable* lock_table_; - ThreadManager* thread_manager_; const size_t num_rebuild_threads_; CheckPoint checkpoint_; SpinMutex lock_; std::unordered_map> invalid_lists_; std::unordered_map> rebuild_lists_; CollectionIDType max_recovered_id_; + + // We manually allocate recovery thread id for no conflict in multi-thread + // recovering + // Todo: do not hard code + std::atomic next_tid_{0}; }; } // namespace KVDK_NAMESPACE \ No newline at end of file diff --git a/engine/pmem_allocator/free_list.cpp b/engine/pmem_allocator/free_list.cpp index e1aba2a5..88c16db6 100644 --- a/engine/pmem_allocator/free_list.cpp +++ b/engine/pmem_allocator/free_list.cpp @@ -280,7 +280,8 @@ void Freelist::Push(const SpaceEntry& entry) { auto b_offset = entry.offset / block_size_; space_map_.Set(b_offset, b_size); - auto& flist_thread_cache = flist_thread_cache_[access_thread.id]; + auto& flist_thread_cache = flist_thread_cache_[ThreadManager::ThreadID() % + flist_thread_cache_.size()]; if (b_size >= flist_thread_cache.small_entry_offsets_.size()) { size_t size_index = blockSizeIndex(b_size); std::lock_guard lg( @@ -387,7 +388,8 @@ void Freelist::MoveCachedEntriesToPool() { bool Freelist::getSmallEntry(uint32_t size, SpaceEntry* space_entry) { auto b_size = size / block_size_; - auto& flist_thread_cache = flist_thread_cache_[access_thread.id]; + auto& flist_thread_cache = flist_thread_cache_[ThreadManager::ThreadID() % + flist_thread_cache_.size()]; for (uint32_t i = b_size; i < flist_thread_cache.small_entry_offsets_.size(); i++) { search_entry: @@ -418,7 +420,8 @@ bool Freelist::getSmallEntry(uint32_t size, SpaceEntry* space_entry) { bool Freelist::getLargeEntry(uint32_t size, 
SpaceEntry* space_entry) { auto b_size = size / block_size_; - auto& flist_thread_cache = flist_thread_cache_[access_thread.id]; + auto& flist_thread_cache = flist_thread_cache_[ThreadManager::ThreadID() % + flist_thread_cache_.size()]; auto size_index = blockSizeIndex(b_size); for (size_t i = size_index; i < flist_thread_cache.large_entries_.size(); diff --git a/engine/pmem_allocator/pmem_allocator.cpp b/engine/pmem_allocator/pmem_allocator.cpp index 119c482f..f7307c97 100644 --- a/engine/pmem_allocator/pmem_allocator.cpp +++ b/engine/pmem_allocator/pmem_allocator.cpp @@ -35,7 +35,7 @@ void PMEMAllocator::Free(const SpaceEntry& space_entry) { if (space_entry.size > 0) { assert(space_entry.size % block_size_ == 0); free_list_.Push(space_entry); - LogDeallocation(access_thread.id, space_entry.size); + LogDeallocation(ThreadManager::ThreadID(), space_entry.size); } } @@ -237,7 +237,8 @@ SpaceEntry PMEMAllocator::Allocate(uint64_t size) { if (aligned_size > segment_size_) { return space_entry; } - auto& palloc_thread_cache = palloc_thread_cache_[access_thread.id]; + auto& palloc_thread_cache = palloc_thread_cache_[ThreadManager::ThreadID() % + palloc_thread_cache_.size()]; while (palloc_thread_cache.segment_entry.size < aligned_size) { // allocate from free list space if (palloc_thread_cache.free_entry.size >= aligned_size) { @@ -260,13 +261,14 @@ SpaceEntry PMEMAllocator::Allocate(uint64_t size) { } palloc_thread_cache.free_entry.size -= aligned_size; palloc_thread_cache.free_entry.offset += aligned_size; - LogAllocation(access_thread.id, aligned_size); + LogAllocation(ThreadManager::ThreadID(), aligned_size); return space_entry; } if (palloc_thread_cache.free_entry.size > 0) { // Not a true free - LogAllocation(access_thread.id, palloc_thread_cache.free_entry.size); + LogAllocation(ThreadManager::ThreadID(), + palloc_thread_cache.free_entry.size); Free(palloc_thread_cache.free_entry); palloc_thread_cache.free_entry.size = 0; } @@ -276,7 +278,8 @@ SpaceEntry 
PMEMAllocator::Allocate(uint64_t size) { continue; } - LogAllocation(access_thread.id, palloc_thread_cache.segment_entry.size); + LogAllocation(ThreadManager::ThreadID(), + palloc_thread_cache.segment_entry.size); Free(palloc_thread_cache.segment_entry); // allocate a new segment, add remainning space of the old one // to the free list @@ -293,7 +296,7 @@ SpaceEntry PMEMAllocator::Allocate(uint64_t size) { persistSpaceEntry(space_entry.offset, space_entry.size); palloc_thread_cache.segment_entry.offset += space_entry.size; palloc_thread_cache.segment_entry.size -= space_entry.size; - LogAllocation(access_thread.id, space_entry.size); + LogAllocation(ThreadManager::ThreadID(), space_entry.size); return space_entry; } diff --git a/engine/pmem_allocator/pmem_allocator.hpp b/engine/pmem_allocator/pmem_allocator.hpp index a60cbb69..4a608ae3 100644 --- a/engine/pmem_allocator/pmem_allocator.hpp +++ b/engine/pmem_allocator/pmem_allocator.hpp @@ -112,25 +112,27 @@ class PMEMAllocator : public Allocator { void BatchFree(const std::vector& entries) { if (entries.size() > 0) { uint64_t freed = free_list_.BatchPush(entries); - LogDeallocation(access_thread.id, freed); + LogDeallocation(ThreadManager::ThreadID(), freed); } } - void LogAllocation(int tid, size_t sz) { + void LogAllocation(int64_t tid, size_t sz) { if (tid == -1) { global_allocated_size_.fetch_add(sz); } else { assert(tid >= 0); - palloc_thread_cache_[tid].allocated_sz += sz; + palloc_thread_cache_[tid % palloc_thread_cache_.size()].allocated_sz += + sz; } } - void LogDeallocation(int tid, size_t sz) { + void LogDeallocation(int64_t tid, size_t sz) { if (tid == -1) { global_allocated_size_.fetch_sub(sz); } else { assert(tid >= 0); - palloc_thread_cache_[tid].allocated_sz -= sz; + palloc_thread_cache_[tid % palloc_thread_cache_.size()].allocated_sz -= + sz; } } diff --git a/engine/sorted_collection/rebuilder.cpp b/engine/sorted_collection/rebuilder.cpp index 2c868bf3..e5666dd7 100644 --- 
a/engine/sorted_collection/rebuilder.cpp +++ b/engine/sorted_collection/rebuilder.cpp @@ -74,7 +74,8 @@ Status SortedCollectionRebuilder::AddElement(DLRecord* record) { } } else { if (segment_based_rebuild_ && - ++rebuilder_thread_cache_[access_thread.id] + ++rebuilder_thread_cache_[ThreadManager::ThreadID() % + rebuilder_thread_cache_.size()] .visited_skiplists[Skiplist::FetchID(record)] % kRestoreSkiplistStride == 0 && @@ -117,10 +118,6 @@ Status SortedCollectionRebuilder::Rollback( Status SortedCollectionRebuilder::initRebuildLists() { PMEMAllocator* pmem_allocator = kv_engine_->pmem_allocator_.get(); - Status s = kv_engine_->maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } // Keep headers with same id together for recognize outdated ones auto cmp = [](const DLRecord* header1, const DLRecord* header2) { @@ -245,7 +242,7 @@ Status SortedCollectionRebuilder::initRebuildLists() { } } linked_headers_.clear(); - return s; + return Status::Ok; } Status SortedCollectionRebuilder::segmentBasedIndexRebuild() { @@ -253,11 +250,7 @@ Status SortedCollectionRebuilder::segmentBasedIndexRebuild() { std::vector> fs; auto rebuild_segments_index = [&]() -> Status { - Status s = this->kv_engine_->maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } - defer(this->kv_engine_->ReleaseAccessThread()); + this_thread.id = next_tid_.fetch_add(1); for (auto iter = this->recovery_segments_.begin(); iter != this->recovery_segments_.end(); iter++) { if (!iter->second.visited) { @@ -467,6 +460,8 @@ void SortedCollectionRebuilder::linkSegmentDramNodes(SkiplistNode* start_node, } Status SortedCollectionRebuilder::linkHighDramNodes(Skiplist* skiplist) { + this_thread.id = next_tid_.fetch_add(1); + Splice splice(skiplist); for (uint8_t i = 1; i <= kMaxHeight; i++) { splice.prevs[i] = skiplist->HeaderNode(); @@ -492,12 +487,7 @@ Status SortedCollectionRebuilder::linkHighDramNodes(Skiplist* skiplist) { } Status SortedCollectionRebuilder::rebuildSkiplistIndex(Skiplist* 
skiplist) { - Status s = kv_engine_->maybeInitAccessThread(); - if (s != Status::Ok) { - return s; - } - defer(kv_engine_->ReleaseAccessThread()); - + this_thread.id = next_tid_.fetch_add(1); size_t num_elems = 0; Splice splice(skiplist); diff --git a/engine/sorted_collection/rebuilder.hpp b/engine/sorted_collection/rebuilder.hpp index feafba84..278ed7a1 100644 --- a/engine/sorted_collection/rebuilder.hpp +++ b/engine/sorted_collection/rebuilder.hpp @@ -108,9 +108,10 @@ class SortedCollectionRebuilder { PointerType index_type); void addUnlinkedRecord(DLRecord* pmem_record) { - assert(access_thread.id >= 0); - rebuilder_thread_cache_[access_thread.id].unlinked_records.push_back( - pmem_record); + assert(ThreadManager::ThreadID() >= 0); + rebuilder_thread_cache_[ThreadManager::ThreadID() % + rebuilder_thread_cache_.size()] + .unlinked_records.push_back(pmem_record); } struct ThreadCache { @@ -140,6 +141,12 @@ class SortedCollectionRebuilder { CollectionIDType max_recovered_id_ = 0; // Select elements as a segment start point for segment based rebuild every // kRestoreSkiplistStride elements per skiplist + + // We manually allocate recovery thread id for no conflict in multi-thread + // recovering + // Todo: do not hard code + std::atomic next_tid_{0}; + const uint64_t kRestoreSkiplistStride = 10000; }; } // namespace KVDK_NAMESPACE \ No newline at end of file diff --git a/engine/thread_manager.cpp b/engine/thread_manager.cpp index 1bbea1c7..a6510fbc 100644 --- a/engine/thread_manager.cpp +++ b/engine/thread_manager.cpp @@ -8,47 +8,44 @@ namespace KVDK_NAMESPACE { -void Thread::Release() { - assert(id == -1 || thread_manager != nullptr); - if (thread_manager) { - thread_manager->Release(*this); - thread_manager = nullptr; +constexpr size_t kMaxRecycleID = 1024; + +std::shared_ptr ThreadManager::manager_(new ThreadManager); + +Thread::~Thread() { + if (manager != nullptr) { + manager->Release(*this); } - id = -1; } -Thread::~Thread() { Release(); } - -Status 
ThreadManager::MaybeInitThread(Thread& t) { +void ThreadManager::MaybeInitThread(Thread& t) { if (t.id < 0) { - if (!usable_id_.empty()) { + if (!recycle_id_.empty()) { std::lock_guard lg(spin_); - if (!usable_id_.empty()) { - auto it = usable_id_.begin(); + if (!recycle_id_.empty()) { + auto it = recycle_id_.begin(); + t.manager = shared_from_this(); t.id = *it; - usable_id_.erase(it); - t.thread_manager = shared_from_this(); - return Status::Ok; + recycle_id_.erase(it); + return; } } int id = ids_.fetch_add(1, std::memory_order_relaxed); - if (static_cast(id) >= max_threads_) { - return Status::TooManyAccessThreads; - } + t.manager = shared_from_this(); t.id = id; - t.thread_manager = shared_from_this(); } - return Status::Ok; } -void ThreadManager::Release(const Thread& t) { - if (t.id >= 0) { - assert(static_cast(t.id) < max_threads_); +void ThreadManager::Release(Thread& t) { + if (t.manager.get() == this && t.id >= 0 && + recycle_id_.size() < kMaxRecycleID) { std::lock_guard lg(spin_); - usable_id_.insert(t.id); + recycle_id_.insert(t.id); } + t.id = -1; + t.manager = nullptr; } -thread_local Thread access_thread; +thread_local Thread this_thread; } // namespace KVDK_NAMESPACE diff --git a/engine/thread_manager.hpp b/engine/thread_manager.hpp index 748987dc..741f4122 100644 --- a/engine/thread_manager.hpp +++ b/engine/thread_manager.hpp @@ -17,27 +17,32 @@ class ThreadManager; struct Thread { public: - Thread() : id(-1), thread_manager(nullptr) {} + Thread() : id(-1), manager(nullptr) {} + int64_t id; + std::shared_ptr manager; + ~Thread(); - void Release(); - int id; - std::shared_ptr thread_manager; }; +extern thread_local Thread this_thread; + class ThreadManager : public std::enable_shared_from_this { public: - ThreadManager(uint32_t max_threads) : max_threads_(max_threads), ids_(0) {} - Status MaybeInitThread(Thread& t); - - void Release(const Thread& t); + static ThreadManager* Get() { return manager_.get(); } + static int64_t ThreadID() { + 
Get()->MaybeInitThread(this_thread); + return this_thread.id; + } + void MaybeInitThread(Thread& t); + void Release(Thread& t); private: - uint32_t max_threads_; - std::atomic ids_; - std::unordered_set usable_id_; + ThreadManager() : ids_(0), recycle_id_(), spin_() {} + + static std::shared_ptr manager_; + std::atomic ids_; + std::unordered_set recycle_id_; SpinMutex spin_; }; -extern thread_local Thread access_thread; - } // namespace KVDK_NAMESPACE \ No newline at end of file diff --git a/engine/version/version_controller.hpp b/engine/version/version_controller.hpp index 7c714d32..97f1b50f 100644 --- a/engine/version/version_controller.hpp +++ b/engine/version/version_controller.hpp @@ -82,7 +82,9 @@ class VersionController { public: LocalSnapshotHolder(VersionController* o) : owner_{o} { owner_->HoldLocalSnapshot(); - ts_ = owner_->version_thread_cache_[access_thread.id] + ts_ = owner_ + ->version_thread_cache_[ThreadManager::ThreadID() % + owner_->version_thread_cache_.size()] .holding_snapshot.timestamp; }; LocalSnapshotHolder(LocalSnapshotHolder const&) = delete; @@ -145,7 +147,9 @@ class VersionController { BatchWriteToken(VersionController* o) : owner_{o}, ts_{owner_->GetCurrentTimestamp()} { ts_ = owner_->GetCurrentTimestamp(); - auto& tc = owner_->version_thread_cache_[access_thread.id]; + auto& tc = + owner_->version_thread_cache_[ThreadManager::ThreadID() % + owner_->version_thread_cache_.size()]; kvdk_assert(tc.batch_write_ts == kMaxTimestamp, ""); tc.batch_write_ts = ts_; } @@ -158,7 +162,9 @@ class VersionController { } ~BatchWriteToken() { if (owner_ != nullptr) { - auto& tc = owner_->version_thread_cache_[access_thread.id]; + auto& tc = + owner_->version_thread_cache_[ThreadManager::ThreadID() % + owner_->version_thread_cache_.size()]; tc.batch_write_ts = kMaxTimestamp; } } @@ -196,23 +202,23 @@ class VersionController { } inline void HoldLocalSnapshot() { - kvdk_assert(access_thread.id >= 0 && static_cast(access_thread.id) < - 
version_thread_cache_.size(), + kvdk_assert(ThreadManager::ThreadID() >= 0, "Uninitialized thread in NewLocalSnapshot"); - kvdk_assert( - version_thread_cache_[access_thread.id].holding_snapshot.timestamp == - kMaxTimestamp, - "Previous LocalSnapshot not released yet!"); - version_thread_cache_[access_thread.id].holding_snapshot.timestamp = - GetCurrentTimestamp(); + kvdk_assert(version_thread_cache_[ThreadManager::ThreadID() % + version_thread_cache_.size()] + .holding_snapshot.timestamp == kMaxTimestamp, + "Previous LocalSnapshot not released yet!"); + version_thread_cache_[ThreadManager::ThreadID() % + version_thread_cache_.size()] + .holding_snapshot.timestamp = GetCurrentTimestamp(); } inline void ReleaseLocalSnapshot() { - kvdk_assert(access_thread.id >= 0 && static_cast(access_thread.id) < - version_thread_cache_.size(), + kvdk_assert(ThreadManager::ThreadID() >= 0, "Uninitialized thread in ReleaseLocalSnapshot"); - version_thread_cache_[access_thread.id].holding_snapshot.timestamp = - kMaxTimestamp; + version_thread_cache_[ThreadManager::ThreadID() % + version_thread_cache_.size()] + .holding_snapshot.timestamp = kMaxTimestamp; } inline const SnapshotImpl& GetLocalSnapshot(size_t thread_num) { @@ -222,10 +228,11 @@ class VersionController { } inline const SnapshotImpl& GetLocalSnapshot() { - kvdk_assert(access_thread.id >= 0 && static_cast(access_thread.id) < - version_thread_cache_.size(), + kvdk_assert(ThreadManager::ThreadID() >= 0, "Uninitialized thread in GetLocalSnapshot"); - return version_thread_cache_[access_thread.id].holding_snapshot; + return version_thread_cache_[ThreadManager::ThreadID() % + version_thread_cache_.size()] + .holding_snapshot; } // Create a new global snapshot diff --git a/include/kvdk/configs.hpp b/include/kvdk/configs.hpp index 1bf89b0a..3d010591 100644 --- a/include/kvdk/configs.hpp +++ b/include/kvdk/configs.hpp @@ -31,11 +31,13 @@ struct SortedCollectionConfigs { }; struct Configs { - // Max number of access threads to 
read/write data to kvdk instance. + // Max number of concurrent threads read/write the kvdk instance internally. + // Set it >= your CPU core number to get best performance // - // Notice that the allocated resources of a access thread would be released - // only if the thread exited or call KVEngine::ReleaseAccessThread(). - uint64_t max_access_threads = 48; + // Notice: you can call KVDK API with any number of threads, but if your + // parallel threads more than max_access_threads, the performance will be + // damaged due to synchronization cost + uint64_t max_access_threads = 64; // Size of PMem space to store KV data, this is not scalable in current // edition. diff --git a/include/kvdk/engine.h b/include/kvdk/engine.h index e4879732..8bd12871 100644 --- a/include/kvdk/engine.h +++ b/include/kvdk/engine.h @@ -63,7 +63,6 @@ extern KVDKStatus KVDKBackup(KVDKEngine* engine, const char* backup_path, extern KVDKStatus KVDKRestore(const char* name, const char* backup_log, const KVDKConfigs* config, FILE* log_file, KVDKEngine** engine); -extern void KVDKReleaseAccessThread(KVDKEngine* engine); extern void KVDKCloseEngine(KVDKEngine* engine); extern void KVDKRemovePMemContents(const char* name); extern KVDKSnapshot* KVDKGetSnapshot(KVDKEngine* engine, int make_checkpoint); diff --git a/include/kvdk/engine.hpp b/include/kvdk/engine.hpp index 002b06d8..44ba87e2 100644 --- a/include/kvdk/engine.hpp +++ b/include/kvdk/engine.hpp @@ -374,10 +374,6 @@ class Engine { // Release a sorted iterator virtual void SortedIteratorRelease(SortedIterator*) = 0; - // Release resources occupied by this access thread so new thread can take - // part. New write requests of this thread need to re-request write resources. 
- virtual void ReleaseAccessThread() = 0; - // Register a customized comparator to the engine on runtime // // Return true on success, return false if a comparator of comparator_name diff --git a/include/kvdk/types.h b/include/kvdk/types.h index 8150154d..10554404 100644 --- a/include/kvdk/types.h +++ b/include/kvdk/types.h @@ -65,7 +65,6 @@ __attribute__((unused)) static char const* KVDKValueTypeString[] = { GEN(NotSupported) \ GEN(PMemMapFileError) \ GEN(InvalidBatchSize) \ - GEN(TooManyAccessThreads) \ GEN(InvalidDataSize) \ GEN(InvalidArgument) \ GEN(IOError) \ diff --git a/java/benchmark/src/main/java/io/pmem/kvdk/benchmark/KVDKBenchmark.java b/java/benchmark/src/main/java/io/pmem/kvdk/benchmark/KVDKBenchmark.java index 0ee0d558..b672f6f6 100644 --- a/java/benchmark/src/main/java/io/pmem/kvdk/benchmark/KVDKBenchmark.java +++ b/java/benchmark/src/main/java/io/pmem/kvdk/benchmark/KVDKBenchmark.java @@ -275,7 +275,6 @@ private void createSortedCollections() throws KVDKException { closeableObjects.add(nameHandle); kvdkEngine.sortedCreate(nameHandle); } - kvdkEngine.releaseAccessThread(); } private void startTasks() { diff --git a/java/kvdkjni/engine.cc b/java/kvdkjni/engine.cc index b3bf7871..324319bc 100644 --- a/java/kvdkjni/engine.cc +++ b/java/kvdkjni/engine.cc @@ -436,14 +436,3 @@ void Java_io_pmem_kvdk_Engine_batchWrite(JNIEnv* env, jobject, KVDK_NAMESPACE::KVDKExceptionJni::ThrowNew(env, s); } } - -/* - * Class: io_pmem_kvdk_Engine - * Method: releaseAccessThread - * Signature: (JJ)V - */ -void Java_io_pmem_kvdk_Engine_releaseAccessThread(JNIEnv*, jobject, - jlong engine_handle) { - auto* engine = reinterpret_cast(engine_handle); - engine->ReleaseAccessThread(); -} diff --git a/java/kvdkjni/kvdkjni.h b/java/kvdkjni/kvdkjni.h index 3c32ed57..fa6b9ee7 100644 --- a/java/kvdkjni/kvdkjni.h +++ b/java/kvdkjni/kvdkjni.h @@ -116,20 +116,18 @@ class StatusJni : public JavaClass { return 0xA; case KVDK_NAMESPACE::Status::InvalidBatchSize: return 0xB; - case 
KVDK_NAMESPACE::Status::TooManyAccessThreads: - return 0xC; case KVDK_NAMESPACE::Status::InvalidDataSize: - return 0xD; + return 0xC; case KVDK_NAMESPACE::Status::InvalidArgument: - return 0xE; + return 0xD; case KVDK_NAMESPACE::Status::IOError: - return 0xF; + return 0xE; case KVDK_NAMESPACE::Status::InvalidConfiguration: - return 0x10; + return 0xF; case KVDK_NAMESPACE::Status::Fail: - return 0x11; + return 0x10; case KVDK_NAMESPACE::Status::Abort: - return 0x12; + return 0x11; default: return 0x7F; // undefined } diff --git a/java/src/main/java/io/pmem/kvdk/Engine.java b/java/src/main/java/io/pmem/kvdk/Engine.java index 38e3893c..ec952aa9 100644 --- a/java/src/main/java/io/pmem/kvdk/Engine.java +++ b/java/src/main/java/io/pmem/kvdk/Engine.java @@ -278,10 +278,6 @@ public void batchWrite(WriteBatch batch) throws KVDKException { batchWrite(nativeHandle_, batch.getNativeHandle()); } - public void releaseAccessThread() { - releaseAccessThread(nativeHandle_); - } - // Native methods @Override protected final native void closeInternal(long handle); @@ -347,8 +343,6 @@ private native void sortedDelete( private native void batchWrite(long engineHandle, long batchHandle); - private native void releaseAccessThread(long engineHandle); - private enum LibraryState { NOT_LOADED, LOADING, diff --git a/java/src/main/java/io/pmem/kvdk/Status.java b/java/src/main/java/io/pmem/kvdk/Status.java index b6fec1b7..728a2e48 100644 --- a/java/src/main/java/io/pmem/kvdk/Status.java +++ b/java/src/main/java/io/pmem/kvdk/Status.java @@ -61,13 +61,12 @@ public enum Code { NotSupported((byte) 0x9), PMemMapFileError((byte) 0xA), InvalidBatchSize((byte) 0xB), - TooManyAccessThreads((byte) 0xC), - InvalidDataSize((byte) 0xD), - InvalidArgument((byte) 0xE), - IOError((byte) 0xF), - InvalidConfiguration((byte) 0x10), - Fail((byte) 0x11), - Abort((byte) 0x12), + InvalidDataSize((byte) 0xC), + InvalidArgument((byte) 0xD), + IOError((byte) 0xE), + InvalidConfiguration((byte) 0xF), + Fail((byte) 
0x10), + Abort((byte) 0x11), Undefined((byte) 0x7F); private final byte value; diff --git a/java/src/test/java/io/pmem/kvdk/EngineTestBase.java b/java/src/test/java/io/pmem/kvdk/EngineTestBase.java index 2cbef86c..76ef6326 100644 --- a/java/src/test/java/io/pmem/kvdk/EngineTestBase.java +++ b/java/src/test/java/io/pmem/kvdk/EngineTestBase.java @@ -34,7 +34,6 @@ public void init() throws KVDKException, IOException { @After public void teardown() { - kvdkEngine.releaseAccessThread(); kvdkEngine.close(); } } diff --git a/tests/allocator.hpp b/tests/allocator.hpp index fca45946..a1d02ff7 100644 --- a/tests/allocator.hpp +++ b/tests/allocator.hpp @@ -22,7 +22,6 @@ class AllocatorAdaptor { public: virtual op_alloc_info wrapped_malloc(uint64_t size) = 0; virtual op_alloc_info wrapped_free(op_alloc_info* data) = 0; - virtual void InitThread() {} virtual ~AllocatorAdaptor(void) {} }; @@ -45,7 +44,6 @@ class PMemAllocatorWrapper : public AllocatorAdaptor { void InitPMemAllocator(const std::string& pmem_path, uint64_t pmem_size, uint64_t num_segment_blocks, uint32_t block_size, uint32_t num_write_threads) { - thread_manager_.reset(new ThreadManager(num_write_threads)); pmem_alloc_ = PMEMAllocator::NewPMEMAllocator( pmem_path, pmem_size, num_segment_blocks, block_size, num_write_threads, true, false, nullptr); @@ -71,10 +69,6 @@ class PMemAllocatorWrapper : public AllocatorAdaptor { } } - void InitThread() override { - thread_manager_->MaybeInitThread(access_thread); - } - ~PMemAllocatorWrapper(void) { closing_ = true; // background thread exit; @@ -88,5 +82,4 @@ class PMemAllocatorWrapper : public AllocatorAdaptor { PMEMAllocator* pmem_alloc_; bool closing_ = false; std::vector background; - std::shared_ptr thread_manager_; }; diff --git a/tests/pmem_allocator_bench.cpp b/tests/pmem_allocator_bench.cpp index f767a250..4457eecd 100644 --- a/tests/pmem_allocator_bench.cpp +++ b/tests/pmem_allocator_bench.cpp @@ -98,7 +98,6 @@ class AllocatorBench { std::vector> 
records(num_thread); double elapesd_time = 0; auto RandomBench = [&](uint64_t id) { - allocator->InitThread(); std::vector work_sets(iter_num); std::chrono::system_clock::time_point to_begin = std::chrono::high_resolution_clock::now(); @@ -125,8 +124,6 @@ class AllocatorBench { std::cout << "Total execute time: " << std::fixed << std::setprecision(5) << elapesd_time << " seconds\n"; - // Clear all memory to avoid memory leak - allocator->InitThread(); for (auto record : records) { for (auto r : record) { allocator->wrapped_free(&r); diff --git a/tests/test_pmem_allocator.cpp b/tests/test_pmem_allocator.cpp index dbcf10b5..4f644c09 100644 --- a/tests/test_pmem_allocator.cpp +++ b/tests/test_pmem_allocator.cpp @@ -27,7 +27,6 @@ class EnginePMemAllocatorTest : public testing::Test { protected: Engine* engine = nullptr; Configs configs; - std::shared_ptr thread_manager_; std::string pmem_path; virtual void SetUp() override { @@ -38,16 +37,14 @@ class EnginePMemAllocatorTest : public testing::Test { int res __attribute__((unused)) = system(cmd); } - void RemovePathAndReleaseThread() { - // Release thread. - access_thread.Release(); + void RemovePath() { // delete db_path. char cmd[1024]; sprintf(cmd, "rm -rf %s\n", pmem_path.c_str()); int res __attribute__((unused)) = system(cmd); } - virtual void TearDown() { RemovePathAndReleaseThread(); } + virtual void TearDown() { RemovePath(); } }; TEST_F(EnginePMemAllocatorTest, TestBasicAlloc) { @@ -61,8 +58,6 @@ TEST_F(EnginePMemAllocatorTest, TestBasicAlloc) { for (auto num_segment_block : num_segment_blocks) { for (auto block_size : block_sizes) { for (auto num_thread : num_threads) { - // init pmem allocator and thread_manager. 
- thread_manager_.reset(new ThreadManager(num_thread)); PMEMAllocator* pmem_alloc = PMEMAllocator::NewPMEMAllocator( pmem_path, pmem_size, num_segment_block, block_size, num_thread, true, false, nullptr); @@ -76,7 +71,6 @@ TEST_F(EnginePMemAllocatorTest, TestBasicAlloc) { // Test function: allocate all pmem, and free all under multi-threaded // scenario. auto TestPmemAlloc = [&](size_t) { - thread_manager_->MaybeInitThread(access_thread); std::vector records; for (uint64_t j = 0; j < num_segment_block; ++j) { auto space_entry = pmem_alloc->Allocate(alloc_size); @@ -92,10 +86,7 @@ TEST_F(EnginePMemAllocatorTest, TestBasicAlloc) { Freelist* free_list = pmem_alloc->GetFreeList(); free_list->MoveCachedEntriesToPool(); free_list->MergeSpaceInPool(); - access_thread.Release(); - // Then allocate all pmem. - thread_manager_->MaybeInitThread(access_thread); int alloc_cnt = 0; while (true) { SpaceEntry space_entry = pmem_alloc->Allocate(alloc_size); @@ -105,7 +96,7 @@ TEST_F(EnginePMemAllocatorTest, TestBasicAlloc) { } ASSERT_EQ(pmem_size / block_size, alloc_cnt); delete pmem_alloc; - RemovePathAndReleaseThread(); + RemovePath(); } } } @@ -117,7 +108,6 @@ TEST_F(EnginePMemAllocatorTest, TestPMemFragmentation) { uint64_t num_segment_block = 1024; uint64_t block_size = 64; std::vector alloc_size{8 * 64, 8 * 64, 16 * 64, 32 * 64}; - thread_manager_.reset(new ThreadManager(num_thread)); PMEMAllocator* pmem_alloc = PMEMAllocator::NewPMEMAllocator( pmem_path, pmem_size, num_segment_block, block_size, num_thread, true, false, nullptr); @@ -127,13 +117,11 @@ TEST_F(EnginePMemAllocatorTest, TestPMemFragmentation) { * | 8 | 8 | 16 | 32 | 8 | 8 | 16 | 32 | 8 | 8 | 16 | 32 | 8 | 8 | 16 | 32 | */ std::vector records(num_thread); - thread_manager_->MaybeInitThread(access_thread); for (uint32_t i = 0; i < records.size(); ++i) { SpaceEntry space_entry = pmem_alloc->Allocate(alloc_size[i % 4]); records[i] = space_entry; ASSERT_NE(space_entry.size, 0); } - access_thread.Release(); /* 
Allocated pmem status: * | null | null | null | 32 | null | null | null | 32 | null | null | null @@ -141,19 +129,16 @@ TEST_F(EnginePMemAllocatorTest, TestPMemFragmentation) { */ // Notice threads (more than one) may share the same list of space pool. auto TestPmemFree = [&](uint64_t id) { - thread_manager_->MaybeInitThread(access_thread); if ((id + 1) % 4 != 0) { pmem_alloc->Free(records[id]); } }; - access_thread.Release(); LaunchNThreads(num_thread, TestPmemFree); Freelist* free_list = pmem_alloc->GetFreeList(); free_list->MoveCachedEntriesToPool(); free_list->MergeSpaceInPool(); // Test merge free memory - thread_manager_->MaybeInitThread(access_thread); for (uint32_t id = 0; id < num_thread / 4; ++id) { SpaceEntry space_entry = pmem_alloc->Allocate(alloc_size[3]); ASSERT_NE(space_entry.size, 0); @@ -168,13 +153,11 @@ TEST_F(EnginePMemAllocatorTest, TestPMemAllocFreeList) { uint64_t block_size = 64; uint64_t pmem_size = num_segment_block * block_size * num_thread; std::deque records; - thread_manager_.reset(new ThreadManager(num_thread)); PMEMAllocator* pmem_alloc = PMEMAllocator::NewPMEMAllocator( pmem_path, pmem_size, num_segment_block, block_size, num_thread, true, false, nullptr); ASSERT_NE(pmem_alloc, nullptr); - thread_manager_->MaybeInitThread(access_thread); // allocate 1024 bytes records.push_back(pmem_alloc->Allocate(1024ULL)); ASSERT_EQ(pmem_alloc->PMemUsageInBytes(), 1024LL); diff --git a/tests/tests.cpp b/tests/tests.cpp index f3ec1ce7..62c4da8d 100644 --- a/tests/tests.cpp +++ b/tests/tests.cpp @@ -56,7 +56,7 @@ class EngineBasicTest : public testing::Test { configs.pmem_segment_blocks = 8 * 1024; // For faster test, no interval so it would not block engine closing configs.background_work_interval = 0.1; - configs.max_access_threads = 1; + configs.max_access_threads = 8; db_path = FLAGS_path; backup_path = FLAGS_path + "_backup"; backup_log = FLAGS_path + ".backup"; @@ -119,9 +119,6 @@ class EngineBasicTest : public testing::Test { // Return the 
current configuration. Configs CurrentConfigs() { switch (config_option_) { - case MultiThread: - configs.max_access_threads = 16; - break; case OptRestore: configs.opt_large_sorted_collection_recovery = true; break; @@ -300,7 +297,6 @@ class EngineBasicTest : public testing::Test { ASSERT_EQ(val, got_val); ASSERT_EQ(DeleteFunc(collection, key), Status::Ok); ASSERT_EQ(GetFunc(collection, key, &got_val), Status::NotFound); - engine->ReleaseAccessThread(); } void testDestroy(const std::string& collection, DestroyFunc DestroyFunc, @@ -505,33 +501,10 @@ TEST_F(EngineBasicTest, TypeOfKey) { delete engine; } -TEST_F(EngineBasicTest, TestThreadManager) { - int max_access_threads = 1; - configs.max_access_threads = max_access_threads; - ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), - Status::Ok); - std::string key("k"); - std::string val("value"); - ASSERT_EQ(engine->Put(key, val, WriteOptions()), Status::Ok); - - // Reach max access threads - auto s = std::async(&Engine::Put, engine, key, val, WriteOptions()); - ASSERT_EQ(s.get(), Status::TooManyAccessThreads); - // Manually release access thread - engine->ReleaseAccessThread(); - s = std::async(&Engine::Put, engine, key, val, WriteOptions()); - ASSERT_EQ(s.get(), Status::Ok); - // Release access thread on thread exits - s = std::async(&Engine::Put, engine, key, val, WriteOptions()); - ASSERT_EQ(s.get(), Status::Ok); - delete engine; -} - // Test iterator/backup/checkpoint on a snapshot TEST_F(EngineBasicTest, TestBasicSnapshot) { uint32_t num_threads = 16; int count = 100; - configs.max_access_threads = num_threads; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); @@ -545,7 +518,6 @@ TEST_F(EngineBasicTest, TestBasicSnapshot) { ASSERT_EQ(engine->SortedCreate(sorted_collection), Status::Ok); ASSERT_EQ(engine->HashCreate(hash_collection), Status::Ok); ASSERT_EQ(engine->ListCreate(list), Status::Ok); - engine->ReleaseAccessThread(); bool snapshot_done(false); 
std::atomic_uint64_t set_finished_threads(0); @@ -572,7 +544,6 @@ TEST_F(EngineBasicTest, TestBasicSnapshot) { } // Wait snapshot done set_finished_threads.fetch_add(1); - engine->ReleaseAccessThread(); { std::unique_lock ul(spin); while (!snapshot_done) { @@ -625,7 +596,6 @@ TEST_F(EngineBasicTest, TestBasicSnapshot) { ASSERT_EQ(engine->SortedCreate(sorted_collection_after_snapshot), Status::Ok); ASSERT_EQ(engine->HashCreate(hash_collection_after_snapshot), Status::Ok); ASSERT_EQ(engine->ListCreate(list_after_snapshot), Status::Ok); - engine->ReleaseAccessThread(); { std::lock_guard ul(spin); snapshot_done = true; @@ -865,7 +835,6 @@ TEST_F(EngineBasicTest, TestStringModify) { int num_threads = 16; int ops_per_thread = 1000; uint64_t incr_by = 5; - configs.max_access_threads = num_threads; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); @@ -873,7 +842,6 @@ TEST_F(EngineBasicTest, TestStringModify) { std::string wrong_value_key = "wrong_value"; ASSERT_EQ(engine->Put(wrong_value_key, std::string(10, 'a')), Status::Ok); - engine->ReleaseAccessThread(); auto TestModify = [&](int) { IncNArgs args{5, 0}; @@ -897,7 +865,6 @@ TEST_F(EngineBasicTest, TestStringModify) { TEST_F(BatchWriteTest, Sorted) { size_t num_threads = 1; - configs.max_access_threads = num_threads + 1; for (int index_with_hashtable : {0, 1}) { ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); @@ -979,7 +946,6 @@ TEST_F(BatchWriteTest, Sorted) { TEST_F(BatchWriteTest, String) { size_t num_threads = 16; - configs.max_access_threads = num_threads; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); size_t batch_size = 100; @@ -1050,7 +1016,6 @@ TEST_F(BatchWriteTest, String) { TEST_F(BatchWriteTest, Hash) { size_t num_threads = 16; - configs.max_access_threads = num_threads + 1; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); size_t batch_size = 100; @@ -1198,7 +1163,6 @@ 
TEST_F(EngineBasicTest, TestSeek) { TEST_F(EngineBasicTest, TestStringRestore) { size_t num_threads = 16; - configs.max_access_threads = num_threads; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); // insert and delete some keys, then re-insert some deleted keys @@ -1278,7 +1242,6 @@ TEST_F(EngineBasicTest, TestStringLargeValue) { TEST_F(EngineBasicTest, TestSortedRestore) { size_t num_threads = 16; - configs.max_access_threads = num_threads; for (int opt_large_sorted_collection_recovery : {0, 1}) { for (int index_with_hashtable : {0, 1}) { SortedCollectionConfigs s_configs; @@ -1337,7 +1300,6 @@ TEST_F(EngineBasicTest, TestSortedRestore) { GlobalLogger.Debug( "Restore with opt_large_sorted_collection_restore: %d\n", opt_large_sorted_collection_recovery); - configs.max_access_threads = num_threads; configs.opt_large_sorted_collection_recovery = opt_large_sorted_collection_recovery; // reopen and restore engine and try gets @@ -1454,7 +1416,6 @@ TEST_F(EngineBasicTest, TestSortedRestore) { TEST_F(EngineBasicTest, TestMultiThreadSortedRestore) { size_t num_threads = 16; size_t num_collections = 16; - configs.max_access_threads = num_threads; configs.opt_large_sorted_collection_recovery = true; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); @@ -1521,7 +1482,6 @@ TEST_F(EngineBasicTest, TestMultiThreadSortedRestore) { TEST_F(EngineBasicTest, TestList) { size_t num_threads = 1; size_t count = 1000; - configs.max_access_threads = num_threads + 1; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); std::vector> elems_vec(num_threads); @@ -1792,7 +1752,6 @@ TEST_F(EngineBasicTest, TestList) { TEST_F(EngineBasicTest, TestHash) { size_t num_threads = 1; size_t count = 1000; - configs.max_access_threads = num_threads + 1; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); std::string key{"Hash"}; @@ -1938,7 +1897,6 @@ TEST_F(EngineBasicTest, TestHash) { 
TEST_F(EngineBasicTest, TestStringHotspot) { size_t n_thread_reading = 16; size_t n_thread_writing = 16; - configs.max_access_threads = n_thread_writing + n_thread_reading; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); @@ -1948,7 +1906,6 @@ TEST_F(EngineBasicTest, TestStringHotspot) { std::string val2(1023, 'b'); ASSERT_EQ(engine->Put(key, val1), Status::Ok); - engine->ReleaseAccessThread(); auto EvenWriteOddRead = [&](uint32_t id) { for (size_t i = 0; i < count; i++) { @@ -1990,7 +1947,6 @@ TEST_F(EngineBasicTest, TestStringHotspot) { TEST_F(EngineBasicTest, TestSortedHotspot) { size_t n_thread_reading = 16; size_t n_thread_writing = 16; - configs.max_access_threads = n_thread_writing + n_thread_reading; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); @@ -2004,7 +1960,6 @@ TEST_F(EngineBasicTest, TestSortedHotspot) { for (const std::string& key : keys) { ASSERT_EQ(engine->SortedPut(collection_name, key, val1), Status::Ok); - engine->ReleaseAccessThread(); auto EvenWriteOddRead = [&](uint32_t id) { for (size_t i = 0; i < count; i++) { @@ -2050,7 +2005,6 @@ TEST_F(EngineBasicTest, TestSortedHotspot) { TEST_F(EngineBasicTest, TestSortedCustomCompareFunction) { using kvpair = std::pair; size_t num_threads = 16; - configs.max_access_threads = num_threads; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); @@ -2156,8 +2110,7 @@ TEST_F(EngineBasicTest, TestSortedCustomCompareFunction) { } TEST_F(EngineBasicTest, TestHashTableIterator) { - size_t threads = 32; - configs.max_access_threads = threads; + size_t num_threads = 32; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); std::string collection_name = "sortedcollection"; @@ -2173,7 +2126,7 @@ TEST_F(EngineBasicTest, TestHashTableIterator) { Status::Ok); } }; - LaunchNThreads(threads, MixedPut); + LaunchNThreads(num_threads, MixedPut); auto test_kvengine = static_cast(engine); auto hash_table = 
test_kvengine->GetHashTable(); @@ -2224,15 +2177,12 @@ TEST_F(EngineBasicTest, TestHashTableIterator) { } hashtable_iter.Next(); } - ASSERT_EQ(total_entry_num, threads + 1); + ASSERT_EQ(total_entry_num, num_threads + 1); } delete engine; } TEST_F(EngineBasicTest, TestExpireAPI) { - size_t n_thread_reading = 1; - size_t n_thread_writing = 1; - configs.max_access_threads = n_thread_writing + n_thread_reading; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); @@ -2390,7 +2340,6 @@ TEST_F(EngineBasicTest, TestExpireAPI) { TEST_F(EngineBasicTest, TestbackgroundDestroyCollections) { size_t n_thread_writing = 16; - configs.max_access_threads = n_thread_writing; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); TTLType ttl = 1000; // 1s @@ -2464,7 +2413,6 @@ TEST_F(EngineBasicTest, TestbackgroundDestroyCollections) { TEST_F(BatchWriteTest, SortedRollback) { size_t num_threads = 1; - configs.max_access_threads = num_threads + 1; for (int index_with_hashtable : {0, 1}) { // Test crash before commit SyncPoint::GetInstance()->EnableCrashPoint( @@ -2587,7 +2535,6 @@ TEST_F(BatchWriteTest, StringRollBack) { // another thread may reuse this id and the old batch log file // is overwritten. 
size_t num_threads = 1; - configs.max_access_threads = num_threads; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); size_t batch_size = 100; @@ -2662,7 +2609,6 @@ TEST_F(BatchWriteTest, StringRollBack) { TEST_F(BatchWriteTest, HashRollback) { size_t num_threads = 1; - configs.max_access_threads = num_threads + 1; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); size_t batch_size = 100; @@ -2751,7 +2697,6 @@ TEST_F(BatchWriteTest, ListBatchOperationRollback) { *((std::atomic_bool*)close_reclaimer) = true; return; }); - configs.max_access_threads = 1; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); size_t count = 100; @@ -2864,7 +2809,6 @@ TEST_F(BatchWriteTest, ListBatchOperationRollback) { // Then Repair TEST_F(EngineBasicTest, TestSortedRecoverySyncPointCaseOne) { Configs test_config = configs; - test_config.max_access_threads = 16; std::atomic update_num(1); int cnt = 20; @@ -2955,7 +2899,6 @@ TEST_F(EngineBasicTest, TestSortedRecoverySyncPointCaseTwo) { }); SyncPoint::GetInstance()->EnableProcessing(); - test_config.max_access_threads = 16; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, test_config, stdout), Status::Ok); @@ -3017,7 +2960,6 @@ TEST_F(EngineBasicTest, TestSortedRecoverySyncPointCaseTwo) { // thread2: iter TEST_F(EngineBasicTest, TestSortedSyncPoint) { Configs test_config = configs; - test_config.max_access_threads = 16; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, test_config, stdout), Status::Ok); std::vector ths; @@ -3072,8 +3014,6 @@ TEST_F(EngineBasicTest, TestSortedSyncPoint) { } TEST_F(EngineBasicTest, TestHashTableRangeIter) { - uint64_t threads = 16; - configs.max_access_threads = threads; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); std::string key = "stringkey"; @@ -3130,7 +3070,6 @@ TEST_F(EngineBasicTest, TestBackGroundCleaner) { }); SyncPoint::GetInstance()->EnableProcessing(); - 
configs.max_access_threads = 16; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); @@ -3342,8 +3281,6 @@ TEST_F(EngineBasicTest, TestBackGroundIterNoHashIndexSkiplist) { {{"KVEngine::BackgroundCleaner::IterSkiplist::UnlinkDeleteRecord", "KVEngine::SkiplistNoHashIndex::Put"}}); SyncPoint::GetInstance()->EnableProcessing(); - uint64_t threads = 16; - configs.max_access_threads = threads; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); std::string collection_name = "Skiplist_with_hash_index"; @@ -3432,7 +3369,6 @@ TEST_F(EngineBasicTest, TestDynamicCleaner) { return; }); SyncPoint::GetInstance()->EnableProcessing(); - configs.max_access_threads = 32; configs.hash_bucket_num = 256; configs.clean_threads = 8; ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), From ad7f5667860ca9ac5c6bd7d2c85191e2272e0506 Mon Sep 17 00:00:00 2001 From: Jiayu Wu Date: Thu, 29 Sep 2022 12:26:22 +0800 Subject: [PATCH 13/23] Basic transaction implementation (#336) Implement basic transaction for string, sorted and hash types. 
Signed-off-by: Jiayu Wu --- CMakeLists.txt | 2 + engine/c/kvdk_c.hpp | 6 + engine/c/kvdk_transaction.cpp | 63 ++++ engine/data_record.hpp | 20 +- engine/hash_collection/hash_list.cpp | 4 +- engine/hash_collection/hash_list.hpp | 2 +- engine/hash_collection/rebuilder.hpp | 2 +- engine/hash_table.hpp | 2 + engine/kv_engine.cpp | 77 ++--- engine/kv_engine.hpp | 72 +++-- engine/kv_engine_cleaner.cpp | 12 +- engine/kv_engine_hash.cpp | 16 +- engine/kv_engine_list.cpp | 16 +- engine/kv_engine_sorted.cpp | 26 +- engine/list_collection/list.cpp | 2 +- engine/list_collection/list.hpp | 4 +- engine/list_collection/rebuilder.hpp | 2 +- engine/sorted_collection/rebuilder.cpp | 6 +- engine/sorted_collection/skiplist.cpp | 6 +- engine/sorted_collection/skiplist.hpp | 4 +- engine/transaction_impl.cpp | 270 +++++++++++++++++ engine/transaction_impl.hpp | 153 ++++++++++ engine/utils/utils.hpp | 2 + engine/write_batch_impl.hpp | 53 +++- examples/tutorial/c_api_tutorial.c | 55 +++- include/kvdk/engine.h | 34 ++- include/kvdk/engine.hpp | 3 + include/kvdk/transaction.hpp | 38 +++ include/kvdk/types.h | 1 + include/kvdk/write_batch.hpp | 11 +- tests/tests.cpp | 394 +++++++++++++++++++++++++ 31 files changed, 1220 insertions(+), 138 deletions(-) create mode 100644 engine/c/kvdk_transaction.cpp create mode 100644 engine/transaction_impl.cpp create mode 100644 engine/transaction_impl.hpp create mode 100644 include/kvdk/transaction.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b24bbfb..b8f194c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,7 @@ endif() set(SOURCES engine/c/kvdk_basic_op.cpp engine/c/kvdk_batch.cpp + engine/c/kvdk_transaction.cpp engine/c/kvdk_hash.cpp engine/c/kvdk_list.cpp engine/c/kvdk_sorted.cpp @@ -70,6 +71,7 @@ set(SOURCES engine/hash_collection/hash_list.cpp engine/list_collection/list.cpp engine/write_batch_impl.cpp + engine/transaction_impl.cpp engine/dram_allocator.cpp engine/pmem_allocator/pmem_allocator.cpp engine/thread_manager.cpp 
diff --git a/engine/c/kvdk_c.hpp b/engine/c/kvdk_c.hpp index 01aba7cf..318cb623 100644 --- a/engine/c/kvdk_c.hpp +++ b/engine/c/kvdk_c.hpp @@ -12,6 +12,7 @@ #include "kvdk/engine.h" #include "kvdk/engine.hpp" #include "kvdk/iterator.hpp" +#include "kvdk/transaction.hpp" #include "kvdk/write_batch.hpp" using kvdk::StringView; @@ -23,6 +24,7 @@ using kvdk::ListIterator; using kvdk::Snapshot; using kvdk::SortedCollectionConfigs; using kvdk::SortedIterator; +using kvdk::Transaction; using kvdk::WriteBatch; using kvdk::WriteOptions; @@ -40,6 +42,10 @@ struct KVDKWriteBatch { std::unique_ptr rep; }; +struct KVDKTransaction { + std::unique_ptr rep; +}; + struct KVDKSortedIterator { SortedIterator* rep; }; diff --git a/engine/c/kvdk_transaction.cpp b/engine/c/kvdk_transaction.cpp new file mode 100644 index 00000000..9d5d3816 --- /dev/null +++ b/engine/c/kvdk_transaction.cpp @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2021-2022 Intel Corporation + */ + +#include "kvdk_c.hpp" + +extern "C" { +KVDKTransaction* KVDKTransactionCreate(KVDKEngine* engine) { + KVDKTransaction* txn = new KVDKTransaction{}; + txn->rep = engine->rep->TransactionCreate(); + return txn; +} +void KVDKTransactionDestory(KVDKTransaction* txn) { delete txn; } +KVDKStatus KVDKTransactionStringPut(KVDKTransaction* txn, char const* key_data, + size_t key_len, char const* val_data, + size_t val_len) { + return txn->rep->StringPut(std::string(key_data, key_len), + std::string(val_data, val_len)); +} +KVDKStatus KVDKTransactionStringDelete(KVDKTransaction* txn, + char const* key_data, size_t key_len) { + return txn->rep->StringDelete(std::string(key_data, key_len)); +} +KVDKStatus KVDKTransactionSortedPut(KVDKTransaction* txn, + char const* collection, + size_t collection_len, char const* key_data, + size_t key_len, char const* val_data, + size_t val_len) { + return txn->rep->SortedPut(std::string(collection, collection_len), + std::string(key_data, key_len), + 
std::string(val_data, val_len)); +} + +KVDKStatus KVDKTransactionSortedDelete(KVDKTransaction* txn, + char const* collection, + size_t collection_len, + char const* key_data, size_t key_len) { + return txn->rep->SortedDelete(std::string(collection, collection_len), + std::string(key_data, key_len)); +} +KVDKStatus KVDKTransactionHashPut(KVDKTransaction* txn, char const* collection, + size_t collection_len, char const* key_data, + size_t key_len, char const* val_data, + size_t val_len) { + return txn->rep->HashPut(std::string(collection, collection_len), + std::string(key_data, key_len), + std::string(val_data, val_len)); +} +KVDKStatus KVDKTransactionHashDelete(KVDKTransaction* txn, + char const* collection, + size_t collection_len, + char const* key_data, size_t key_len) { + return txn->rep->HashDelete(std::string(collection, collection_len), + std::string(key_data, key_len)); +} +KVDKStatus KVDKTransactionCommit(KVDKTransaction* txn) { + return txn->rep->Commit(); +} +void KVDKTransactionRollback(KVDKTransaction* txn) { + return txn->rep->Rollback(); +} + +} // extern "C" diff --git a/engine/data_record.hpp b/engine/data_record.hpp index 3ca296ff..b8ea2283 100644 --- a/engine/data_record.hpp +++ b/engine/data_record.hpp @@ -17,11 +17,11 @@ namespace KVDK_NAMESPACE { enum RecordType : uint8_t { Empty = 0, String = (1 << 0), - SortedHeader = (1 << 1), + SortedRecord = (1 << 1), SortedElem = (1 << 2), - HashHeader = (1 << 3), + HashRecord = (1 << 3), HashElem = (1 << 4), - ListHeader = (1 << 5), + ListRecord = (1 << 5), ListElem = (1 << 6), }; @@ -35,8 +35,8 @@ enum class RecordStatus : uint8_t { }; const uint8_t ExpirableRecordType = - (RecordType::String | RecordType::SortedHeader | RecordType::HashHeader | - RecordType::ListHeader); + (RecordType::String | RecordType::SortedRecord | RecordType::HashRecord | + RecordType::ListRecord); const uint8_t PrimaryRecordType = ExpirableRecordType; @@ -44,8 +44,8 @@ const uint8_t ElemType = (RecordType::SortedElem | 
RecordType::HashElem | RecordType::ListElem); const uint8_t CollectionType = - (RecordType::SortedHeader | RecordType::HashHeader | - RecordType::ListHeader); + (RecordType::SortedRecord | RecordType::HashRecord | + RecordType::ListRecord); struct DataHeader { DataHeader() = default; @@ -365,9 +365,9 @@ struct DLRecord { prev(_prev), next(_next), expired_time(_expired_time) { - kvdk_assert(_type & (RecordType::SortedElem | RecordType::SortedHeader | - RecordType::HashElem | RecordType::HashHeader | - RecordType::ListElem | RecordType::ListHeader), + kvdk_assert(_type & (RecordType::SortedElem | RecordType::SortedRecord | + RecordType::HashElem | RecordType::HashRecord | + RecordType::ListElem | RecordType::ListRecord), ""); memcpy(data, _key.data(), _key.size()); memcpy(data + _key.size(), _value.data(), _value.size()); diff --git a/engine/hash_collection/hash_list.cpp b/engine/hash_collection/hash_list.cpp index e10bd1e6..fa65ab41 100644 --- a/engine/hash_collection/hash_list.cpp +++ b/engine/hash_collection/hash_list.cpp @@ -218,7 +218,7 @@ HashList::WriteResult HashList::SetExpireTime(ExpireTimeType expired_time, } DLRecord* pmem_record = DLRecord::PersistDLRecord( pmem_allocator_->offset2addr_checked(space.offset), space.size, timestamp, - RecordType::HashHeader, RecordStatus::Normal, + RecordType::HashRecord, RecordStatus::Normal, pmem_allocator_->addr2offset_checked(header), header->prev, header->next, header->Key(), header->Value(), expired_time); bool success = dl_list_.Replace(header, pmem_record); @@ -265,7 +265,7 @@ CollectionIDType HashList::FetchID(const DLRecord* record) { switch (record->GetRecordType()) { case RecordType::HashElem: return ExtractID(record->Key()); - case RecordType::HashHeader: + case RecordType::HashRecord: return DecodeID(record->Value()); default: GlobalLogger.Error("Wrong record type %u in HashListID", diff --git a/engine/hash_collection/hash_list.hpp b/engine/hash_collection/hash_list.hpp index 86425417..f035ccae 100644 --- 
a/engine/hash_collection/hash_list.hpp +++ b/engine/hash_collection/hash_list.hpp @@ -162,7 +162,7 @@ class HashList : public Collection { static bool MatchType(const DLRecord* record) { RecordType type = record->GetRecordType(); - return type == RecordType::HashElem || type == RecordType::HashHeader; + return type == RecordType::HashElem || type == RecordType::HashRecord; } private: diff --git a/engine/hash_collection/rebuilder.hpp b/engine/hash_collection/rebuilder.hpp index b8b114fa..1211f895 100644 --- a/engine/hash_collection/rebuilder.hpp +++ b/engine/hash_collection/rebuilder.hpp @@ -203,7 +203,7 @@ class HashListRebuilder { if (!outdated) { auto lookup_result = hash_table_->Insert( - collection_name, RecordType::HashHeader, RecordStatus::Normal, + collection_name, RecordType::HashRecord, RecordStatus::Normal, hlist.get(), PointerType::HashList); switch (lookup_result.s) { case Status::Ok: { diff --git a/engine/hash_table.hpp b/engine/hash_table.hpp index 6dff0666..a0d6d6fd 100644 --- a/engine/hash_table.hpp +++ b/engine/hash_table.hpp @@ -184,6 +184,8 @@ class HashTable { return std::unique_lock{*getHint(key).spin}; } + SpinMutex* GetLock(StringView const& key) { return getHint(key).spin; } + HashTableIterator GetIterator(uint64_t start_slot_idx, uint64_t end_slot_idx); size_t GetSlotsNum() { return slots_.size(); } diff --git a/engine/kv_engine.cpp b/engine/kv_engine.cpp index e177dbe6..1e22b78e 100644 --- a/engine/kv_engine.cpp +++ b/engine/kv_engine.cpp @@ -246,11 +246,11 @@ Status KVEngine::restoreData() { switch (data_entry_cached.meta.type) { case RecordType::SortedElem: - case RecordType::SortedHeader: + case RecordType::SortedRecord: case RecordType::String: - case RecordType::HashHeader: + case RecordType::HashRecord: case RecordType::HashElem: - case RecordType::ListHeader: + case RecordType::ListRecord: case RecordType::ListElem: { if (data_entry_cached.meta.status == RecordStatus::Dirty) { data_entry_cached.meta.type = RecordType::Empty; @@ 
-301,7 +301,7 @@ Status KVEngine::restoreData() { s = restoreSortedElem(static_cast(recovering_pmem_record)); break; } - case RecordType::SortedHeader: { + case RecordType::SortedRecord: { s = restoreSortedHeader(static_cast(recovering_pmem_record)); break; } @@ -311,7 +311,7 @@ Status KVEngine::restoreData() { data_entry_cached); break; } - case RecordType::ListHeader: { + case RecordType::ListRecord: { s = listRestoreList(static_cast(recovering_pmem_record)); break; } @@ -319,7 +319,7 @@ Status KVEngine::restoreData() { s = listRestoreElem(static_cast(recovering_pmem_record)); break; } - case RecordType::HashHeader: { + case RecordType::HashRecord: { s = restoreHashHeader(static_cast(recovering_pmem_record)); break; } @@ -350,11 +350,11 @@ bool KVEngine::validateRecord(void* data_record) { case RecordType::String: { return static_cast(data_record)->Validate(); } - case RecordType::SortedHeader: + case RecordType::SortedRecord: case RecordType::SortedElem: - case RecordType::HashHeader: + case RecordType::HashRecord: case RecordType::HashElem: - case RecordType::ListHeader: + case RecordType::ListRecord: case RecordType::ListElem: { return static_cast(data_record)->Validate(); } @@ -422,7 +422,7 @@ Status KVEngine::Backup(const pmem::obj::string_view backup_log, } break; } - case RecordType::SortedHeader: { + case RecordType::SortedRecord: { DLRecord* header = slot_iter->GetIndex().skiplist->HeaderRecord(); while (header != nullptr && header->GetTimestamp() > backup_ts) { header = @@ -430,7 +430,7 @@ Status KVEngine::Backup(const pmem::obj::string_view backup_log, } if (header && header->GetRecordStatus() == RecordStatus::Normal && !header->HasExpired()) { - s = backup.Append(RecordType::SortedHeader, header->Key(), + s = backup.Append(RecordType::SortedRecord, header->Key(), header->Value(), header->GetExpireTime()); if (s == Status::Ok) { // Append skiplist elems following the header @@ -452,7 +452,7 @@ Status KVEngine::Backup(const pmem::obj::string_view 
backup_log, } break; } - case RecordType::HashHeader: { + case RecordType::HashRecord: { DLRecord* header = slot_iter->GetIndex().hlist->HeaderRecord(); while (header != nullptr && header->GetTimestamp() > backup_ts) { header = @@ -460,7 +460,7 @@ Status KVEngine::Backup(const pmem::obj::string_view backup_log, } if (header && header->GetRecordStatus() == RecordStatus::Normal && !header->HasExpired()) { - s = backup.Append(RecordType::HashHeader, header->Key(), + s = backup.Append(RecordType::HashRecord, header->Key(), header->Value(), header->GetExpireTime()); if (s == Status::Ok) { // Append hlist elems following the header @@ -481,7 +481,7 @@ Status KVEngine::Backup(const pmem::obj::string_view backup_log, } break; } - case RecordType::ListHeader: { + case RecordType::ListRecord: { DLRecord* header = slot_iter->GetIndex().list->HeaderRecord(); while (header != nullptr && header->GetTimestamp() > backup_ts) { header = @@ -489,7 +489,7 @@ Status KVEngine::Backup(const pmem::obj::string_view backup_log, } if (header && header->GetRecordStatus() == RecordStatus::Normal && !header->HasExpired()) { - s = backup.Append(RecordType::ListHeader, header->Key(), + s = backup.Append(RecordType::ListRecord, header->Key(), header->Value(), header->GetExpireTime()); if (s == Status::Ok) { // Append hlist elems following the header @@ -572,7 +572,7 @@ Status KVEngine::restoreDataFromBackup(const std::string& backup_log) { iter->Next(); break; } - case RecordType::SortedHeader: { + case RecordType::SortedRecord: { // Maybe reuse id? 
std::shared_ptr skiplist = nullptr; if (!expired) { @@ -612,7 +612,7 @@ Status KVEngine::restoreDataFromBackup(const std::string& backup_log) { } break; } - case RecordType::HashHeader: { + case RecordType::HashRecord: { std::shared_ptr hlist = nullptr; if (!expired) { s = buildHashlist(record.key, hlist); @@ -645,7 +645,7 @@ Status KVEngine::restoreDataFromBackup(const std::string& backup_log) { } break; } - case RecordType::ListHeader: { + case RecordType::ListRecord: { std::shared_ptr list = nullptr; if (!expired) { s = buildList(record.key, list); @@ -890,16 +890,23 @@ Status KVEngine::maybeInitBatchLogFile() { } Status KVEngine::BatchWrite(std::unique_ptr const& batch) { - WriteBatchImpl const* batch_impl = - dynamic_cast(batch.get()); + const WriteBatchImpl* batch_impl = + dynamic_cast(batch.get()); if (batch_impl == nullptr) { return Status::InvalidArgument; } - return batchWriteImpl(*batch_impl); + return batchWriteImpl(*batch_impl, true); } -Status KVEngine::batchWriteImpl(WriteBatchImpl const& batch) { +Status KVEngine::CommitTransaction(TransactionImpl* txn) { + const WriteBatchImpl* batch = txn->GetBatch(); + kvdk_assert(batch != nullptr, ""); + return batchWriteImpl(*batch, + false /* key should already been locked by txn */); +} + +Status KVEngine::batchWriteImpl(WriteBatchImpl const& batch, bool lock_key) { if (batch.Size() > BatchWriteLog::Capacity()) { return Status::InvalidBatchSize; } @@ -928,7 +935,7 @@ Status KVEngine::batchWriteImpl(WriteBatchImpl const& batch) { // Lookup Skiplists and Hashes for further operations for (auto const& sorted_op : batch.SortedOps()) { - auto res = lookupKey(sorted_op.collection, RecordType::SortedHeader); + auto res = lookupKey(sorted_op.collection, RecordType::SortedRecord); /// TODO: this is a temporary work-around /// We cannot lock both key and field, which may trigger deadlock. 
/// However, if a collection is created and a field is inserted, @@ -957,14 +964,16 @@ Status KVEngine::batchWriteImpl(WriteBatchImpl const& batch) { // Keys/internal keys to be locked on HashTable std::vector keys_to_lock; - for (auto const& string_op : batch.StringOps()) { - keys_to_lock.push_back(string_op.key); - } - for (auto const& arg : sorted_args) { - keys_to_lock.push_back(arg.skiplist->InternalKey(arg.key)); - } - for (auto const& arg : hash_args) { - keys_to_lock.push_back(arg.hlist->InternalKey(arg.key)); + if (lock_key) { + for (auto const& string_op : batch.StringOps()) { + keys_to_lock.push_back(string_op.key); + } + for (auto const& arg : sorted_args) { + keys_to_lock.push_back(arg.skiplist->InternalKey(arg.key)); + } + for (auto const& arg : hash_args) { + keys_to_lock.push_back(arg.hlist->InternalKey(arg.key)); + } } auto guard = hash_table_->RangeLock(keys_to_lock); @@ -1383,17 +1392,17 @@ HashTable::LookupResult KVEngine::lookupKey(StringView key, uint8_t type_mask) { : Status::Ok; break; } - case RecordType::SortedHeader: + case RecordType::SortedRecord: result.s = result.entry.GetIndex().skiplist->HasExpired() ? Status::Outdated : Status::Ok; break; - case RecordType::ListHeader: + case RecordType::ListRecord: result.s = result.entry.GetIndex().list->HasExpired() ? Status::Outdated : Status::Ok; break; - case RecordType::HashHeader: { + case RecordType::HashRecord: { result.s = result.entry.GetIndex().hlist->HasExpired() ? 
Status::Outdated : Status::Ok; diff --git a/engine/kv_engine.hpp b/engine/kv_engine.hpp index afdbb8d6..61647120 100644 --- a/engine/kv_engine.hpp +++ b/engine/kv_engine.hpp @@ -35,6 +35,7 @@ #include "sorted_collection/skiplist.hpp" #include "structures.hpp" #include "thread_manager.hpp" +#include "transaction_impl.hpp" #include "utils/utils.hpp" #include "version/old_records_cleaner.hpp" #include "version/version_controller.hpp" @@ -54,12 +55,12 @@ class KVEngine : public Engine { const std::string& backup_log, Engine** engine_ptr, const Configs& configs); - Snapshot* GetSnapshot(bool make_checkpoint) override; + Snapshot* GetSnapshot(bool make_checkpoint) final; Status Backup(const pmem::obj::string_view backup_log, - const Snapshot* snapshot) override; + const Snapshot* snapshot) final; - void ReleaseSnapshot(const Snapshot* snapshot) override { + void ReleaseSnapshot(const Snapshot* snapshot) final { { std::lock_guard lg(checkpoint_lock_); persist_checkpoint_->MaybeRelease( @@ -76,38 +77,38 @@ class KVEngine : public Engine { // 1. Expire assumes that str is not duplicated among all types, which is not // implemented yet // 2. 
Expire is not compatible with checkpoint for now - Status Expire(const StringView str, TTLType ttl_time) override; + Status Expire(const StringView str, TTLType ttl_time) final; // Get time to expire of str // // Notice: // Expire assumes that str is not duplicated among all types, which is not // implemented yet - Status GetTTL(const StringView str, TTLType* ttl_time) override; + Status GetTTL(const StringView str, TTLType* ttl_time) final; Status TypeOf(StringView key, ValueType* type) final; // String - Status Get(const StringView key, std::string* value) override; + Status Get(const StringView key, std::string* value) final; Status Put(const StringView key, const StringView value, - const WriteOptions& write_options) override; - Status Delete(const StringView key) override; + const WriteOptions& write_options) final; + Status Delete(const StringView key) final; Status Modify(const StringView key, ModifyFunc modify_func, void* modify_args, - const WriteOptions& options) override; + const WriteOptions& options) final; // Sorted Status SortedCreate(const StringView collection_name, - const SortedCollectionConfigs& configs) override; - Status SortedDestroy(const StringView collection_name) override; - Status SortedSize(const StringView collection, size_t* size) override; + const SortedCollectionConfigs& configs) final; + Status SortedDestroy(const StringView collection_name) final; + Status SortedSize(const StringView collection, size_t* size) final; Status SortedGet(const StringView collection, const StringView user_key, - std::string* value) override; + std::string* value) final; Status SortedPut(const StringView collection, const StringView user_key, - const StringView value) override; + const StringView value) final; Status SortedDelete(const StringView collection, - const StringView user_key) override; + const StringView user_key) final; SortedIterator* SortedIteratorCreate(const StringView collection, - Snapshot* snapshot, Status* s) override; - void 
SortedIteratorRelease(SortedIterator* sorted_iterator) override; + Snapshot* snapshot, Status* s) final; + void SortedIteratorRelease(SortedIterator* sorted_iterator) final; // List Status ListCreate(StringView key) final; @@ -173,6 +174,20 @@ class KVEngine : public Engine { return std::unique_ptr{new WriteBatchImpl{}}; } + std::unique_ptr TransactionCreate() final { + return std::unique_ptr(new TransactionImpl(this)); + } + + // Call this function before doing collection related transaction to avoid + // collection be destroyed during transaction + std::unique_ptr + AcquireCollectionTransactionLock() { + return std::unique_ptr( + new CollectionTransactionCV::TransactionToken(&ct_cv_)); + } + + Status CommitTransaction(TransactionImpl* txn); + // For test cases const std::unordered_map>& GetSkiplists() { @@ -325,6 +340,14 @@ class KVEngine : public Engine { hash_table_->Insert(ret, type, status, addr, pointerType(type)); } + // Call this function before create or destroy a collection to avoid + // collection be destroyed during a related transaction + std::unique_ptr + acquireCollectionCreateOrDestroyLock() { + return std::unique_ptr( + new CollectionTransactionCV::CollectionToken(&ct_cv_)); + } + template static constexpr RecordType collectionType() { static_assert(std::is_same::value || @@ -332,11 +355,11 @@ class KVEngine : public Engine { std::is_same::value, "Invalid type!"); return std::is_same::value - ? RecordType::SortedHeader + ? RecordType::SortedRecord : std::is_same::value - ? RecordType::ListHeader + ? RecordType::ListRecord : std::is_same::value - ? RecordType::HashHeader + ? 
RecordType::HashRecord : RecordType::Empty; } @@ -352,13 +375,13 @@ class KVEngine : public Engine { kvdk_assert(false, "Not supported!"); return PointerType::Invalid; } - case RecordType::SortedHeader: { + case RecordType::SortedRecord: { return PointerType::Skiplist; } - case RecordType::ListHeader: { + case RecordType::ListRecord: { return PointerType::List; } - case RecordType::HashHeader: { + case RecordType::HashRecord: { return PointerType::HashList; } case RecordType::HashElem: { @@ -429,7 +452,7 @@ class KVEngine : public Engine { Status persistOrRecoverImmutableConfigs(); - Status batchWriteImpl(WriteBatchImpl const& batch); + Status batchWriteImpl(WriteBatchImpl const& batch, bool lock_key); Status batchWriteRollbackLogs(); @@ -652,6 +675,7 @@ class KVEngine : public Engine { std::atomic round_robin_id_{0}; + CollectionTransactionCV ct_cv_; // We manually allocate recovery thread id for no conflict in multi-thread // recovering // Todo: do not hard code diff --git a/engine/kv_engine_cleaner.cpp b/engine/kv_engine_cleaner.cpp index c240c4c5..b5f17b5f 100644 --- a/engine/kv_engine_cleaner.cpp +++ b/engine/kv_engine_cleaner.cpp @@ -160,7 +160,7 @@ void KVEngine::purgeAndFreeDLRecords( } break; } - case RecordType::SortedHeader: { + case RecordType::SortedRecord: { if (record_status != RecordStatus::Outdated && !pmem_record->HasExpired()) { entries.emplace_back( @@ -183,7 +183,7 @@ void KVEngine::purgeAndFreeDLRecords( } break; } - case RecordType::HashHeader: { + case RecordType::HashRecord: { if (record_status != RecordStatus::Outdated && !pmem_record->HasExpired()) { entries.emplace_back( @@ -206,7 +206,7 @@ void KVEngine::purgeAndFreeDLRecords( } break; } - case RecordType::ListHeader: { + case RecordType::ListRecord: { if (record_status != RecordStatus::Outdated && !pmem_record->HasExpired()) { entries.emplace_back( @@ -760,7 +760,7 @@ void Cleaner::FetchOutdatedCollections( auto skiplist_iter = outdated_collections_.skiplists.begin(); if 
(skiplist_iter->second < min_snapshot_ts) { outdated_collection = skiplist_iter->first; - record_type = RecordType::SortedHeader; + record_type = RecordType::SortedRecord; outdated_collections_.skiplists.erase(skiplist_iter); } } @@ -769,7 +769,7 @@ void Cleaner::FetchOutdatedCollections( auto list_iter = outdated_collections_.lists.begin(); if (list_iter->second < min_snapshot_ts) { outdated_collection = list_iter->first; - record_type = RecordType::ListHeader; + record_type = RecordType::ListRecord; outdated_collections_.lists.erase(list_iter); } } @@ -778,7 +778,7 @@ void Cleaner::FetchOutdatedCollections( auto hash_list_iter = outdated_collections_.hashlists.begin(); if (hash_list_iter->second < min_snapshot_ts) { outdated_collection = hash_list_iter->first; - record_type = RecordType::HashHeader; + record_type = RecordType::HashRecord; outdated_collections_.hashlists.erase(hash_list_iter); } } diff --git a/engine/kv_engine_hash.cpp b/engine/kv_engine_hash.cpp index 8523bcc6..ca144cd9 100644 --- a/engine/kv_engine_hash.cpp +++ b/engine/kv_engine_hash.cpp @@ -22,9 +22,10 @@ Status KVEngine::buildHashlist(const StringView& collection, auto ul = hash_table_->AcquireLock(collection); auto holder = version_controller_.GetLocalSnapshotHolder(); TimestampType new_ts = holder.Timestamp(); - auto lookup_result = lookupKey(collection, RecordType::HashHeader); + auto lookup_result = lookupKey(collection, RecordType::HashRecord); if (lookup_result.s == Status::NotFound || lookup_result.s == Status::Outdated) { + auto create_token = acquireCollectionCreateOrDestroyLock(); DLRecord* existing_header = lookup_result.s == Outdated ? 
lookup_result.entry.GetIndex().hlist->HeaderRecord() @@ -40,7 +41,7 @@ Status KVEngine::buildHashlist(const StringView& collection, // header point to itself DLRecord* pmem_record = DLRecord::PersistDLRecord( pmem_allocator_->offset2addr_checked(space.offset), space.size, new_ts, - RecordType::HashHeader, RecordStatus::Normal, + RecordType::HashRecord, RecordStatus::Normal, pmem_allocator_->addr2offset(existing_header), space.offset, space.offset, collection, value_str); hlist = std::make_shared(pmem_record, collection, id, @@ -48,7 +49,7 @@ Status KVEngine::buildHashlist(const StringView& collection, dllist_locks_.get()); kvdk_assert(hlist != nullptr, ""); addHashlistToMap(hlist); - insertKeyOrElem(lookup_result, RecordType::HashHeader, RecordStatus::Normal, + insertKeyOrElem(lookup_result, RecordType::HashRecord, RecordStatus::Normal, hlist.get()); return Status::Ok; } else { @@ -69,8 +70,9 @@ Status KVEngine::HashDestroy(StringView collection) { HashList* hlist; Status s = hashListFind(collection, &hlist); if (s == Status::Ok) { + auto destroy = acquireCollectionCreateOrDestroyLock(); DLRecord* header = hlist->HeaderRecord(); - kvdk_assert(header->GetRecordType() == RecordType::HashHeader, ""); + kvdk_assert(header->GetRecordType() == RecordType::HashRecord, ""); StringView value = header->Value(); auto request_size = DLRecord::RecordSize(collection, value); SpaceEntry space = pmem_allocator_->Allocate(request_size); @@ -79,12 +81,12 @@ Status KVEngine::HashDestroy(StringView collection) { } DLRecord* pmem_record = DLRecord::PersistDLRecord( pmem_allocator_->offset2addr_checked(space.offset), space.size, new_ts, - RecordType::HashHeader, RecordStatus::Outdated, + RecordType::HashRecord, RecordStatus::Outdated, pmem_allocator_->addr2offset_checked(header), header->prev, header->next, collection, value); bool success = hlist->Replace(header, pmem_record); kvdk_assert(success, "existing header should be linked on its hlist"); - hash_table_->Insert(collection, 
RecordType::HashHeader, + hash_table_->Insert(collection, RecordType::HashRecord, RecordStatus::Outdated, hlist, PointerType::HashList); { std::unique_lock hlist_lock(hlists_mu_); @@ -248,7 +250,7 @@ void KVEngine::HashIteratorRelease(HashIterator* hash_iter) { Status KVEngine::hashListFind(StringView collection, HashList** hlist) { // Callers should acquire the access token or snapshot. // Lockless lookup for the collection - auto result = lookupKey(collection, RecordType::HashHeader); + auto result = lookupKey(collection, RecordType::HashRecord); if (result.s == Status::Outdated) { return Status::NotFound; } diff --git a/engine/kv_engine_list.cpp b/engine/kv_engine_list.cpp index 9b931b5a..695a7779 100644 --- a/engine/kv_engine_list.cpp +++ b/engine/kv_engine_list.cpp @@ -19,9 +19,10 @@ Status KVEngine::buildList(const StringView& list_name, auto ul = hash_table_->AcquireLock(list_name); auto holder = version_controller_.GetLocalSnapshotHolder(); TimestampType new_ts = holder.Timestamp(); - auto lookup_result = lookupKey(list_name, RecordType::ListHeader); + auto lookup_result = lookupKey(list_name, RecordType::ListRecord); if (lookup_result.s == Status::NotFound || lookup_result.s == Status::Outdated) { + auto create_token = acquireCollectionCreateOrDestroyLock(); DLRecord* existing_header = lookup_result.s == Outdated ? 
lookup_result.entry.GetIndex().hlist->HeaderRecord() @@ -37,14 +38,14 @@ Status KVEngine::buildList(const StringView& list_name, // header point to itself DLRecord* pmem_record = DLRecord::PersistDLRecord( pmem_allocator_->offset2addr_checked(space.offset), space.size, new_ts, - RecordType::ListHeader, RecordStatus::Normal, + RecordType::ListRecord, RecordStatus::Normal, pmem_allocator_->addr2offset(existing_header), space.offset, space.offset, list_name, value_str); list = std::make_shared(pmem_record, list_name, id, pmem_allocator_.get(), dllist_locks_.get()); kvdk_assert(list != nullptr, ""); addListToMap(list); - insertKeyOrElem(lookup_result, RecordType::ListHeader, RecordStatus::Normal, + insertKeyOrElem(lookup_result, RecordType::ListRecord, RecordStatus::Normal, list.get()); return Status::Ok; } else { @@ -64,8 +65,9 @@ Status KVEngine::ListDestroy(StringView collection) { List* list; Status s = listFind(collection, &list); if (s == Status::Ok) { + auto destroy_token = acquireCollectionCreateOrDestroyLock(); DLRecord* header = list->HeaderRecord(); - kvdk_assert(header->GetRecordType() == RecordType::ListHeader, ""); + kvdk_assert(header->GetRecordType() == RecordType::ListRecord, ""); StringView value = header->Value(); auto request_size = DLRecord::RecordSize(collection, value); SpaceEntry space = pmem_allocator_->Allocate(request_size); @@ -74,12 +76,12 @@ Status KVEngine::ListDestroy(StringView collection) { } DLRecord* pmem_record = DLRecord::PersistDLRecord( pmem_allocator_->offset2addr_checked(space.offset), space.size, new_ts, - RecordType::ListHeader, RecordStatus::Outdated, + RecordType::ListRecord, RecordStatus::Outdated, pmem_allocator_->addr2offset_checked(header), header->prev, header->next, collection, value); bool success = list->Replace(header, pmem_record); kvdk_assert(success, "existing header should be linked on its list"); - hash_table_->Insert(collection, RecordType::ListHeader, + hash_table_->Insert(collection, RecordType::ListRecord, 
RecordStatus::Outdated, list, PointerType::List); { std::unique_lock list_lock(lists_mu_); @@ -528,7 +530,7 @@ Status KVEngine::listRestoreList(DLRecord* pmp_record) { } Status KVEngine::listFind(StringView list_name, List** list) { - auto result = lookupKey(list_name, RecordType::ListHeader); + auto result = lookupKey(list_name, RecordType::ListRecord); if (result.s == Status::Outdated) { return Status::NotFound; } diff --git a/engine/kv_engine_sorted.cpp b/engine/kv_engine_sorted.cpp index dfff0fbd..9dc801ed 100644 --- a/engine/kv_engine_sorted.cpp +++ b/engine/kv_engine_sorted.cpp @@ -26,8 +26,9 @@ Status KVEngine::buildSkiplist(const StringView& collection_name, auto holder = version_controller_.GetLocalSnapshotHolder(); TimestampType new_ts = holder.Timestamp(); auto lookup_result = - lookupKey(collection_name, RecordType::SortedHeader); + lookupKey(collection_name, RecordType::SortedRecord); if (lookup_result.s == NotFound || lookup_result.s == Outdated) { + auto create_token = acquireCollectionCreateOrDestroyLock(); DLRecord* existing_header = lookup_result.s == Outdated ? 
lookup_result.entry.GetIndex().skiplist->HeaderRecord() @@ -52,7 +53,7 @@ Status KVEngine::buildSkiplist(const StringView& collection_name, // header point to itself DLRecord* pmem_record = DLRecord::PersistDLRecord( pmem_allocator_->offset2addr(space_entry.offset), space_entry.size, - new_ts, RecordType::SortedHeader, RecordStatus::Normal, + new_ts, RecordType::SortedRecord, RecordStatus::Normal, pmem_allocator_->addr2offset(existing_header), space_entry.offset, space_entry.offset, collection_name, value_str); @@ -61,7 +62,7 @@ Status KVEngine::buildSkiplist(const StringView& collection_name, pmem_allocator_.get(), hash_table_.get(), dllist_locks_.get(), s_configs.index_with_hashtable); addSkiplistToMap(skiplist); - insertKeyOrElem(lookup_result, RecordType::SortedHeader, + insertKeyOrElem(lookup_result, RecordType::SortedRecord, RecordStatus::Normal, skiplist.get()); } else { return lookup_result.s == Status::Ok ? Status::Existed : lookup_result.s; @@ -76,11 +77,12 @@ Status KVEngine::SortedDestroy(const StringView collection_name) { auto snapshot_holder = version_controller_.GetLocalSnapshotHolder(); auto new_ts = snapshot_holder.Timestamp(); auto lookup_result = - lookupKey(collection_name, RecordType::SortedHeader); + lookupKey(collection_name, RecordType::SortedRecord); if (lookup_result.s == Status::Ok) { + auto destroy_token = acquireCollectionCreateOrDestroyLock(); Skiplist* skiplist = lookup_result.entry.GetIndex().skiplist; DLRecord* header = skiplist->HeaderRecord(); - assert(header->GetRecordType() == RecordType::SortedHeader); + assert(header->GetRecordType() == RecordType::SortedRecord); StringView value = header->Value(); auto request_size = sizeof(DLRecord) + collection_name.size() + value.size(); @@ -90,14 +92,14 @@ Status KVEngine::SortedDestroy(const StringView collection_name) { } DLRecord* pmem_record = DLRecord::PersistDLRecord( pmem_allocator_->offset2addr_checked(space_entry.offset), - space_entry.size, new_ts, RecordType::SortedHeader, + 
space_entry.size, new_ts, RecordType::SortedRecord, RecordStatus::Outdated, pmem_allocator_->addr2offset_checked(header), header->prev, header->next, collection_name, value, 0); bool success = Skiplist::Replace(header, pmem_record, skiplist->HeaderNode(), pmem_allocator_.get(), dllist_locks_.get()); kvdk_assert(success, "existing header should be linked on its skiplist"); - insertKeyOrElem(lookup_result, RecordType::SortedHeader, + insertKeyOrElem(lookup_result, RecordType::SortedRecord, RecordStatus::Outdated, skiplist); { std::unique_lock skiplist_lock(skiplists_mu_); @@ -116,7 +118,7 @@ Status KVEngine::SortedSize(const StringView collection, size_t* size) { auto holder = version_controller_.GetLocalSnapshotHolder(); Skiplist* skiplist = nullptr; - auto ret = lookupKey(collection, RecordType::SortedHeader); + auto ret = lookupKey(collection, RecordType::SortedRecord); if (ret.s != Status::Ok) { return ret.s == Status::Outdated ? Status::NotFound : ret.s; } @@ -136,7 +138,7 @@ Status KVEngine::SortedGet(const StringView collection, auto holder = version_controller_.GetLocalSnapshotHolder(); Skiplist* skiplist = nullptr; - auto ret = lookupKey(collection, RecordType::SortedHeader); + auto ret = lookupKey(collection, RecordType::SortedRecord); if (ret.s != Status::Ok) { return ret.s == Status::Outdated ? Status::NotFound : ret.s; } @@ -157,7 +159,7 @@ Status KVEngine::SortedPut(const StringView collection, Skiplist* skiplist = nullptr; - auto ret = lookupKey(collection, RecordType::SortedHeader); + auto ret = lookupKey(collection, RecordType::SortedRecord); if (ret.s != Status::Ok) { return ret.s == Status::Outdated ? 
Status::NotFound : ret.s; } @@ -176,7 +178,7 @@ Status KVEngine::SortedDelete(const StringView collection, auto holder = version_controller_.GetLocalSnapshotHolder(); Skiplist* skiplist = nullptr; - auto ret = lookupKey(collection, RecordType::SortedHeader); + auto ret = lookupKey(collection, RecordType::SortedRecord); if (ret.s != Status::Ok) { return (ret.s == Status::Outdated || ret.s == Status::NotFound) ? Status::NotFound @@ -198,7 +200,7 @@ SortedIterator* KVEngine::SortedIteratorCreate(const StringView collection, snapshot = GetSnapshot(false); } // find collection - auto res = lookupKey(collection, RecordType::SortedHeader); + auto res = lookupKey(collection, RecordType::SortedRecord); if (s != nullptr) { *s = (res.s == Status::Outdated) ? Status::NotFound : res.s; } diff --git a/engine/list_collection/list.cpp b/engine/list_collection/list.cpp index cbe98fad..1098a839 100644 --- a/engine/list_collection/list.cpp +++ b/engine/list_collection/list.cpp @@ -17,7 +17,7 @@ List::WriteResult List::SetExpireTime(ExpireTimeType expired_time, } DLRecord* pmem_record = DLRecord::PersistDLRecord( pmem_allocator_->offset2addr_checked(space.offset), space.size, timestamp, - RecordType::ListHeader, RecordStatus::Normal, + RecordType::ListRecord, RecordStatus::Normal, pmem_allocator_->addr2offset_checked(header), header->prev, header->next, header->Key(), header->Value(), expired_time); bool success = dl_list_.Replace(header, pmem_record); diff --git a/engine/list_collection/list.hpp b/engine/list_collection/list.hpp index 16613c0b..999c660b 100644 --- a/engine/list_collection/list.hpp +++ b/engine/list_collection/list.hpp @@ -128,7 +128,7 @@ class List : public Collection { switch (record->GetRecordType()) { case RecordType::ListElem: return ExtractID(record->Key()); - case RecordType::ListHeader: + case RecordType::ListRecord: return DecodeID(record->Value()); default: GlobalLogger.Error("Wrong record type %u in ListID", @@ -140,7 +140,7 @@ class List : public 
Collection { static bool MatchType(const DLRecord* record) { RecordType type = record->GetRecordType(); - return type == RecordType::ListElem || type == RecordType::ListHeader; + return type == RecordType::ListElem || type == RecordType::ListRecord; } private: diff --git a/engine/list_collection/rebuilder.hpp b/engine/list_collection/rebuilder.hpp index c2cb694c..5f93dfa6 100644 --- a/engine/list_collection/rebuilder.hpp +++ b/engine/list_collection/rebuilder.hpp @@ -172,7 +172,7 @@ class ListRebuilder { if (!outdated) { auto lookup_result = hash_table_->Insert( - collection_name, RecordType::ListHeader, RecordStatus::Normal, + collection_name, RecordType::ListRecord, RecordStatus::Normal, list.get(), PointerType::List); switch (lookup_result.s) { case Status::Ok: { diff --git a/engine/sorted_collection/rebuilder.cpp b/engine/sorted_collection/rebuilder.cpp index e5666dd7..4b27b691 100644 --- a/engine/sorted_collection/rebuilder.cpp +++ b/engine/sorted_collection/rebuilder.cpp @@ -40,7 +40,7 @@ SortedCollectionRebuilder::RebuildResult SortedCollectionRebuilder::Rebuild() { } Status SortedCollectionRebuilder::AddHeader(DLRecord* header_record) { - assert(header_record->GetRecordType() == RecordType::SortedHeader); + assert(header_record->GetRecordType() == RecordType::SortedRecord); bool linked_record = recovery_utils_.CheckAndRepairLinkage(header_record); if (!linked_record) { @@ -639,12 +639,12 @@ Status SortedCollectionRebuilder::insertHashIndex(const StringView& key, RecordType::SortedElem, ""); } else if (index_type == PointerType::Skiplist) { - record_type = RecordType::SortedHeader; + record_type = RecordType::SortedRecord; record_status = static_cast(index_ptr)->HeaderRecord()->GetRecordStatus(); kvdk_assert( static_cast(index_ptr)->HeaderRecord()->GetRecordType() == - RecordType::SortedHeader, + RecordType::SortedRecord, ""); } else { kvdk_assert(false, "Wrong type in sorted collection rebuilder"); diff --git a/engine/sorted_collection/skiplist.cpp 
b/engine/sorted_collection/skiplist.cpp index 90d01d12..67c0f961 100644 --- a/engine/sorted_collection/skiplist.cpp +++ b/engine/sorted_collection/skiplist.cpp @@ -65,7 +65,7 @@ Skiplist::WriteResult Skiplist::SetExpireTime(ExpireTimeType expired_time, } DLRecord* pmem_record = DLRecord::PersistDLRecord( pmem_allocator_->offset2addr_checked(space_entry.offset), - space_entry.size, timestamp, RecordType::SortedHeader, + space_entry.size, timestamp, RecordType::SortedRecord, RecordStatus::Normal, pmem_allocator_->addr2offset_checked(header), header->prev, header->next, header->Key(), header->Value(), expired_time); bool success = Skiplist::Replace(header, pmem_record, HeaderNode(), @@ -310,10 +310,10 @@ bool Skiplist::lockInsertPosition(const StringView& inserting_key, auto check_order = [&]() { bool res = /*check next*/ (next_record->GetRecordType() == - RecordType::SortedHeader || + RecordType::SortedRecord || Compare(inserting_key, UserKey(next_record)) <= 0) && /*check prev*/ (prev_record->GetRecordType() == - RecordType::SortedHeader || + RecordType::SortedRecord || Compare(inserting_key, UserKey(prev_record)) > 0); return res; }; diff --git a/engine/sorted_collection/skiplist.hpp b/engine/sorted_collection/skiplist.hpp index 588bc7cf..a069ca48 100644 --- a/engine/sorted_collection/skiplist.hpp +++ b/engine/sorted_collection/skiplist.hpp @@ -305,7 +305,7 @@ class Skiplist : public Collection { static bool MatchType(DLRecord* record) { RecordType type = record->GetRecordType(); - return type == RecordType::SortedElem || type == RecordType::SortedHeader; + return type == RecordType::SortedElem || type == RecordType::SortedRecord; } // Remove a dl record from its skiplist by unlinking @@ -377,7 +377,7 @@ class Skiplist : public Collection { case RecordType::SortedElem: return ExtractID(record->Key()); break; - case RecordType::SortedHeader: + case RecordType::SortedRecord: return DecodeID(record->Value()); default: GlobalLogger.Error("Wrong record type %u in 
SkiplistID", diff --git a/engine/transaction_impl.cpp b/engine/transaction_impl.cpp new file mode 100644 index 00000000..7938bbf6 --- /dev/null +++ b/engine/transaction_impl.cpp @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2021 Intel Corporation + */ + +#include "transaction_impl.hpp" + +#include "kv_engine.hpp" + +namespace KVDK_NAMESPACE { +// To avoid deadlock in transaction, we abort locking key in +// random(kLockTimeoutMicrosecondsMin,kLockTimeoutMicrosecondsMax) and +// return Timeout in write operations +constexpr int64_t kLockTimeoutMicrosecondsMin = 5000; +constexpr int64_t kLockTimeoutMicrosecondsMax = 15000; + +TransactionImpl::TransactionImpl(KVEngine* engine) + : engine_(engine), timeout_(randomTimeout()) { + kvdk_assert(engine_ != nullptr, ""); + batch_.reset( + dynamic_cast(engine_->WriteBatchCreate().release())); + kvdk_assert(batch_ != nullptr, ""); +} + +TransactionImpl::~TransactionImpl() { Rollback(); } + +Status TransactionImpl::StringPut(const std::string& key, + const std::string& value) { + auto hash_table = engine_->GetHashTable(); + if (!tryLock(hash_table->GetLock(key))) { + status_ = Status::Timeout; + return status_; + } + + batch_->StringPut(key, value); + return Status::Ok; +} + +Status TransactionImpl::StringDelete(const std::string& key) { + auto hash_table = engine_->GetHashTable(); + if (!tryLock(hash_table->GetLock(key))) { + status_ = Status::Timeout; + return status_; + } + + batch_->StringDelete(key); + return Status::Ok; +} + +Status TransactionImpl::StringGet(const std::string& key, std::string* value) { + auto op = batch_->StringGet(key); + if (op != nullptr) { + if (op->op == WriteOp::Delete) { + return Status::NotFound; + } else { + value->assign(op->value); + return Status::Ok; + } + } else { + auto hash_table = engine_->GetHashTable(); + if (!tryLock(hash_table->GetLock(key))) { + status_ = Status::Timeout; + return status_; + } + return engine_->Get(key, value); + } +} + +Status 
TransactionImpl::SortedPut(const std::string& collection, + const std::string& key, + const std::string& value) { + acquireCollectionTransaction(); + auto hash_table = engine_->GetHashTable(); + auto lookup_result = + hash_table->Lookup(collection, RecordType::SortedRecord); + if (lookup_result.s != Status::Ok) { + kvdk_assert(lookup_result.s == Status::NotFound, ""); + return lookup_result.s; + } + Skiplist* skiplist = lookup_result.entry.GetIndex().skiplist; + auto internal_key = skiplist->InternalKey(key); + if (!tryLock(hash_table->GetLock(internal_key))) { + status_ = Status::Timeout; + return status_; + } + + batch_->SortedPut(collection, key, value); + return Status::Ok; +} + +Status TransactionImpl::SortedDelete(const std::string& collection, + const std::string& key) { + acquireCollectionTransaction(); + auto hash_table = engine_->GetHashTable(); + auto lookup_result = + hash_table->Lookup(collection, RecordType::SortedRecord); + if (lookup_result.s != Status::Ok) { + kvdk_assert(lookup_result.s == Status::NotFound, ""); + return lookup_result.s; + } + Skiplist* skiplist = lookup_result.entry.GetIndex().skiplist; + auto internal_key = skiplist->InternalKey(key); + if (!tryLock(hash_table->GetLock(internal_key))) { + status_ = Status::Timeout; + return status_; + } + + batch_->SortedDelete(collection, key); + return Status::Ok; +} + +Status TransactionImpl::SortedGet(const std::string& collection, + const std::string& key, std::string* value) { + auto op = batch_->SortedGet(collection, key); + if (op != nullptr) { + if (op->op == WriteOp::Delete) { + return Status::NotFound; + } else { + value->assign(op->value); + return Status::Ok; + } + } else { + acquireCollectionTransaction(); + auto hash_table = engine_->GetHashTable(); + auto lookup_result = + hash_table->Lookup(collection, RecordType::SortedRecord); + if (lookup_result.s != Status::Ok) { + kvdk_assert(lookup_result.s == Status::NotFound, ""); + return lookup_result.s; + } + Skiplist* skiplist = 
lookup_result.entry.GetIndex().skiplist; + auto internal_key = skiplist->InternalKey(key); + if (!tryLock(hash_table->GetLock(internal_key))) { + status_ = Status::Timeout; + return status_; + } + + return skiplist->Get(key, value); + } +} + +Status TransactionImpl::HashPut(const std::string& collection, + const std::string& key, + const std::string& value) { + acquireCollectionTransaction(); + auto hash_table = engine_->GetHashTable(); + auto lookup_result = + hash_table->Lookup(collection, RecordType::HashRecord); + if (lookup_result.s != Status::Ok) { + kvdk_assert(lookup_result.s == Status::NotFound, ""); + return lookup_result.s; + } + HashList* hlist = lookup_result.entry.GetIndex().hlist; + auto internal_key = hlist->InternalKey(key); + if (!tryLock(hash_table->GetLock(internal_key))) { + status_ = Status::Timeout; + return status_; + } + + batch_->HashPut(collection, key, value); + return Status::Ok; +} + +Status TransactionImpl::HashDelete(const std::string& collection, + const std::string& key) { + acquireCollectionTransaction(); + auto hash_table = engine_->GetHashTable(); + auto lookup_result = + hash_table->Lookup(collection, RecordType::HashRecord); + if (lookup_result.s != Status::Ok) { + kvdk_assert(lookup_result.s == Status::NotFound, ""); + return lookup_result.s; + } + HashList* hlist = lookup_result.entry.GetIndex().hlist; + auto internal_key = hlist->InternalKey(key); + if (!tryLock(hash_table->GetLock(internal_key))) { + status_ = Status::Timeout; + return status_; + } + + batch_->HashDelete(collection, key); + return Status::Ok; +} + +Status TransactionImpl::HashGet(const std::string& collection, + const std::string& key, std::string* value) { + auto op = batch_->HashGet(collection, key); + if (op != nullptr) { + if (op->op == WriteOp::Delete) { + return Status::NotFound; + } else { + value->assign(op->value); + return Status::Ok; + } + } else { + acquireCollectionTransaction(); + auto hash_table = engine_->GetHashTable(); + auto 
lookup_result = + hash_table->Lookup(collection, RecordType::HashRecord); + if (lookup_result.s != Status::Ok) { + kvdk_assert(lookup_result.s == Status::NotFound, ""); + return lookup_result.s; + } + HashList* hlist = lookup_result.entry.GetIndex().hlist; + auto internal_key = hlist->InternalKey(key); + if (!tryLock(hash_table->GetLock(internal_key))) { + status_ = Status::Timeout; + return status_; + } + + return hlist->Get(key, value); + } +} + +void TransactionImpl::acquireCollectionTransaction() { + if (ct_token_ == nullptr) { + ct_token_ = engine_->AcquireCollectionTransactionLock(); + } +} + +bool TransactionImpl::tryLock(SpinMutex* spin) { + auto iter = locked_.find(spin); + if (iter == locked_.end()) { + if (tryLockImpl(spin)) { + locked_.insert(spin); + return true; + } + return false; + } else { + return true; + } +} + +bool TransactionImpl::tryLockImpl(SpinMutex* spin) { + auto now = TimeUtils::microseconds_time(); + while (!spin->try_lock()) { + if (TimeUtils::microseconds_time() - now > timeout_) { + return false; + } + } + return true; +} + +Status TransactionImpl::Commit() { + Status s = engine_->CommitTransaction(this); + Rollback(); + return s; +} + +void TransactionImpl::Rollback() { + for (SpinMutex* s : locked_) { + s->unlock(); + } + locked_.clear(); + string_kv_.clear(); + sorted_kv_.clear(); + hash_kv_.clear(); + batch_->Clear(); + ct_token_ = nullptr; +} + +void TransactionImpl::SetLockTimeout(int64_t microseconds) { + timeout_ = microseconds; +} + +int64_t TransactionImpl::randomTimeout() { + return fast_random_64() % + (kLockTimeoutMicrosecondsMax - kLockTimeoutMicrosecondsMin) + + kLockTimeoutMicrosecondsMin; +} +} // namespace KVDK_NAMESPACE \ No newline at end of file diff --git a/engine/transaction_impl.hpp b/engine/transaction_impl.hpp new file mode 100644 index 00000000..40257a6e --- /dev/null +++ b/engine/transaction_impl.hpp @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2021 Intel Corporation + */ + 
+#pragma once + +#include +#include +#include + +#include "alias.hpp" +#include "kvdk/transaction.hpp" +#include "write_batch_impl.hpp" + +namespace KVDK_NAMESPACE { + +class KVEngine; + +// Collections of in processing transaction should not be created or +// destroyed, we use this for communication between collection related +// transactions and collection create/destroy threads +class CollectionTransactionCV { + public: + struct TransactionToken { + TransactionToken(CollectionTransactionCV* cv) : cv_(cv) { + kvdk_assert(cv_ != nullptr, ""); + cv_->AcquireTransaction(); + } + + TransactionToken(const TransactionToken&) = delete; + TransactionToken(TransactionToken&&) = delete; + + ~TransactionToken() { cv_->FinishTransaction(); } + + private: + CollectionTransactionCV* cv_; + }; + + struct CollectionToken { + CollectionToken(CollectionTransactionCV* cv) : cv_(cv) { + kvdk_assert(cv_ != nullptr, ""); + cv_->AcquireCollection(); + } + + ~CollectionToken() { cv_->FinishCollection(); } + + CollectionToken(const CollectionToken&) = delete; + CollectionToken(CollectionToken&&) = delete; + + private: + CollectionTransactionCV* cv_; + }; + + CollectionTransactionCV() = default; + + CollectionTransactionCV(const CollectionTransactionCV&) = delete; + + void AcquireTransaction() { + std::unique_lock ul(spin_); + while (processing_collection_ > 0) { + cv_.wait(ul); + } + kvdk_assert(processing_collection_ == 0, ""); + processing_transaction_++; + } + + void AcquireCollection() { + std::unique_lock ul(spin_); + // collection create/destroy has higher priority than txn as it's faster, + // so we add processing cnt here to forbit hungry + processing_collection_++; + while (processing_transaction_ > 0) { + cv_.wait(ul); + } + kvdk_assert(processing_transaction_ == 0, ""); + } + + void FinishTransaction() { + std::unique_lock ul(spin_); + if (--processing_transaction_ == 0) { + cv_.notify_all(); + } + } + + void FinishCollection() { + std::unique_lock ul(spin_); + if 
(--processing_collection_ == 0) { + cv_.notify_all(); + } + } + + private: + std::condition_variable_any cv_; + SpinMutex spin_; + int processing_collection_ = 0; + int processing_transaction_ = 0; +}; + +class TransactionImpl final : public Transaction { + public: + TransactionImpl(KVEngine* engine); + ~TransactionImpl() final; + Status StringPut(const std::string& key, const std::string& value) final; + Status StringDelete(const std::string& key) final; + Status StringGet(const std::string& key, std::string* value) final; + Status SortedPut(const std::string& collection, const std::string& key, + const std::string& value) final; + Status SortedDelete(const std::string& collection, + const std::string& key) final; + Status SortedGet(const std::string& collection, const std::string& key, + std::string* value) final; + Status HashPut(const std::string& collection, const std::string& key, + const std::string& value) final; + Status HashDelete(const std::string& collection, + const std::string& key) final; + Status HashGet(const std::string& collection, const std::string& key, + std::string* value) final; + Status Commit() final; + void Rollback() final; + Status InternalStatus() final { return status_; } + + // This used by kv engine + WriteBatchImpl* GetBatch() { return batch_.get(); } + + // Set lock time out while acquiring a key lock in transaction, if <0, + // operations will immediately return timeout while failed to lock a key + void SetLockTimeout(int64_t micro_seconds); + + private: + struct KVOp { + WriteOp op; + std::string value; + }; + + bool tryLock(SpinMutex* spin); + bool tryLockImpl(SpinMutex* spin); + void acquireCollectionTransaction(); + int64_t randomTimeout(); + + KVEngine* engine_; + Status status_; + std::unordered_map> + sorted_kv_; + std::unordered_map> + hash_kv_; + std::unordered_map string_kv_; + std::unique_ptr batch_; + // TODO use std::unique_lock + std::unordered_set locked_; + std::unique_ptr ct_token_; + int64_t timeout_; +}; +} // 
namespace KVDK_NAMESPACE \ No newline at end of file diff --git a/engine/utils/utils.hpp b/engine/utils/utils.hpp index 3926727a..ed501b81 100644 --- a/engine/utils/utils.hpp +++ b/engine/utils/utils.hpp @@ -512,6 +512,8 @@ inline UnixTimeType unix_time(void) { inline int64_t millisecond_time() { return unix_time() / 1000; } +inline int64_t microseconds_time() { return unix_time(); } + inline bool CheckIsExpired(ExpireTimeType expired_time) { if (expired_time >= 0 && expired_time <= millisecond_time()) { return true; diff --git a/engine/write_batch_impl.hpp b/engine/write_batch_impl.hpp index 86a2cc57..e82dee6f 100644 --- a/engine/write_batch_impl.hpp +++ b/engine/write_batch_impl.hpp @@ -14,8 +14,6 @@ namespace KVDK_NAMESPACE { -struct Splice; - class WriteBatchImpl final : public WriteBatch { public: struct StringOp { @@ -71,32 +69,67 @@ class WriteBatchImpl final : public WriteBatch { string_ops_.insert(op); } - void SortedPut(std::string const& key, std::string const& field, + void SortedPut(std::string const& collection, std::string const& key, std::string const& value) final { - SortedOp op{WriteOp::Put, key, field, value}; + SortedOp op{WriteOp::Put, collection, key, value}; sorted_ops_.erase(op); sorted_ops_.insert(op); } - void SortedDelete(std::string const& key, std::string const& field) final { - SortedOp op{WriteOp::Delete, key, field, std::string{}}; + void SortedDelete(std::string const& collection, + std::string const& key) final { + SortedOp op{WriteOp::Delete, collection, key, std::string{}}; sorted_ops_.erase(op); sorted_ops_.insert(op); } - void HashPut(std::string const& key, std::string const& field, + void HashPut(std::string const& collection, std::string const& key, std::string const& value) final { - HashOp op{WriteOp::Put, key, field, value}; + HashOp op{WriteOp::Put, collection, key, value}; hash_ops_.erase(op); hash_ops_.insert(op); } - void HashDelete(std::string const& key, std::string const& field) final { - HashOp 
op{WriteOp::Delete, key, field, std::string{}}; + void HashDelete(std::string const& collection, std::string const& key) final { + HashOp op{WriteOp::Delete, collection, key, std::string{}}; hash_ops_.erase(op); hash_ops_.insert(op); } + // Get a string op from this batch + // if key not exist in this batch, return nullptr + const StringOp* StringGet(std::string const& key) { + StringOp op{WriteOp::Put, key, ""}; + auto iter = string_ops_.find(op); + if (iter == string_ops_.end()) { + return nullptr; + } + return &(*iter); + } + + // Get a sorted op from this batch + // if collection key not exist in this batch, return nullptr + const SortedOp* SortedGet(const std::string& collection, + const std::string& key) { + SortedOp op{WriteOp::Put, collection, key, ""}; + auto iter = sorted_ops_.find(op); + if (iter == sorted_ops_.end()) { + return nullptr; + } + return &(*iter); + } + + // Get a hash op from this batch + // if the collection key not exist in this batch, return nullptr + const HashOp* HashGet(std::string const& collection, std::string const& key) { + HashOp op{WriteOp::Put, collection, key, ""}; + auto iter = hash_ops_.find(op); + if (iter == hash_ops_.end()) { + return nullptr; + } + return &(*iter); + } + void Clear() final { string_ops_.clear(); sorted_ops_.clear(); diff --git a/examples/tutorial/c_api_tutorial.c b/examples/tutorial/c_api_tutorial.c index 91c712d8..df5d7fc5 100644 --- a/examples/tutorial/c_api_tutorial.c +++ b/examples/tutorial/c_api_tutorial.c @@ -26,7 +26,7 @@ static int StrCmp(const char* a, size_t alen, const char* b, size_t blen) { return r; } -void AnonymousCollectionExample(KVDKEngine* kvdk_engine) { +void StringExample(KVDKEngine* kvdk_engine) { const char* key1 = "key1"; const char* key2 = "key2"; const char* value1 = "value1"; @@ -301,7 +301,7 @@ void CompFuncForSortedCollectionExample(KVDKEngine* kvdk_engine) { KVDKDestroySortedCollectionConfigs(s_configs); } -void BatchWriteAnonCollectionExample(KVDKEngine* kvdk_engine) { 
+void BatchWriteStringExample(KVDKEngine* kvdk_engine) { const char* key1 = "key1"; const char* key2 = "key2"; const char* value1 = "value1"; @@ -324,12 +324,52 @@ void BatchWriteAnonCollectionExample(KVDKEngine* kvdk_engine) { assert(s == Ok); cmp = StrCmp(read_v2, read_v2_len, value2, strlen(value2)); assert(cmp == 0); - printf("Successfully performed BatchWrite on anonymous global collection.\n"); + printf("Successfully performed BatchWrite on String.\n"); KVDKWriteBatchDestory(kvdk_wb); free(read_v1); free(read_v2); } +void TransactionStringExample(KVDKEngine* kvdk_engine) { + const char* receiver = "Jack"; + const char* payer = "Tom"; + const char* payer_balance = "10"; + const char* receiver_balance = "0"; + KVDKWriteOptions* write_option = KVDKCreateWriteOptions(); + + KVDKPut(kvdk_engine, payer, strlen(payer), payer_balance, + strlen(payer_balance), write_option); + KVDKPut(kvdk_engine, receiver, strlen(receiver), receiver_balance, + strlen(receiver_balance), write_option); + + KVDKTransaction* txn = KVDKTransactionCreate(kvdk_engine); + assert(txn != NULL); + KVDKStatus s = + KVDKTransactionStringPut(txn, payer, strlen(payer), "0", strlen("0")); + assert(s == Ok); + s = KVDKTransactionStringPut(txn, receiver, strlen(receiver), "10", + strlen("10")); + assert(s == Ok); + s = KVDKTransactionCommit(txn); + assert(s == Ok); + + char* val; + size_t val_len; + s = KVDKGet(kvdk_engine, payer, strlen(payer), &val_len, &val); + assert(s == Ok); + assert(val_len == strlen("0")); + assert(memcmp(val, "0", val_len) == 0); + free(val); + s = KVDKGet(kvdk_engine, receiver, strlen(receiver), &val_len, &val); + assert(s == Ok); + assert(val_len == strlen("10")); + assert(memcmp(val, "10", val_len) == 0); + free(val); + + KVDKTransactionDestory(txn); + printf("Successfully performed Transaction on String.\n"); +} + void HashesCollectionExample(KVDKEngine* kvdk_engine) { const char* nums[10] = {"9", "5", "2", "0", "7", "3", "1", "8", "6", "4"}; const char* hash_collection = 
"hash_collection"; @@ -739,7 +779,7 @@ int main() { ModifyExample(kvdk_engine); // Anonymous Global Collection Example - AnonymousCollectionExample(kvdk_engine); + StringExample(kvdk_engine); // Named Sorted Collection Example SortedCollectionExample(kvdk_engine); @@ -749,8 +789,11 @@ int main() { CompFuncForSortedCollectionExample(kvdk_engine); - // BatchWrite on Anonymous Global Collection Example - BatchWriteAnonCollectionExample(kvdk_engine); + // BatchWrite on String Example + BatchWriteStringExample(kvdk_engine); + + // Transaction on String Example + TransactionStringExample(kvdk_engine); // Hashes Collection Example HashesCollectionExample(kvdk_engine); diff --git a/include/kvdk/engine.h b/include/kvdk/engine.h index 8bd12871..41c4fae0 100644 --- a/include/kvdk/engine.h +++ b/include/kvdk/engine.h @@ -18,6 +18,7 @@ typedef struct KVDKEngine KVDKEngine; typedef struct KVDKConfigs KVDKConfigs; typedef struct KVDKWriteOptions KVDKWriteOptions; typedef struct KVDKWriteBatch KVDKWriteBatch; +typedef struct KVDKTransaction KVDKTransaction; typedef struct KVDKSortedIterator KVDKSortedIterator; typedef struct KVDKListIterator KVDKListIterator; typedef struct KVDKHashIterator KVDKHashIterator; @@ -101,7 +102,38 @@ extern void KVDKWriteBatchHashDelete(KVDKWriteBatch* batch, extern KVDKStatus KVDKBatchWrite(KVDKEngine* engine, KVDKWriteBatch const* batch); -// For Anonymous Global Collection +// For Transactions +extern KVDKTransaction* KVDKTransactionCreate(KVDKEngine* engine); +extern void KVDKTransactionDestory(KVDKTransaction* txn); +extern KVDKStatus KVDKTransactionStringPut(KVDKTransaction* txn, + char const* key_data, size_t key_len, + char const* val_data, + size_t val_len); +extern KVDKStatus KVDKTransactionStringDelete(KVDKTransaction* txn, + char const* key_data, + size_t key_len); +extern KVDKStatus KVDKTransactionSortedPut( + KVDKTransaction* txn, char const* collection, size_t collection_len, + char const* key_data, size_t key_len, char const* val_data, 
size_t val_len); +extern KVDKStatus KVDKTransactionSortedDelete(KVDKTransaction* txn, + char const* collection, + size_t collection_len, + char const* key_data, + size_t key_len); +extern KVDKStatus KVDKTransactionHashPut(KVDKTransaction* txn, + char const* collection, + size_t collection_len, + char const* key_data, size_t key_len, + char const* val_data, size_t val_len); +extern KVDKStatus KVDKTransactionHashDelete(KVDKTransaction* txn, + char const* collection, + size_t collection_len, + char const* key_data, + size_t key_len); +extern KVDKStatus KVDKTransactionCommit(KVDKTransaction* txn); +extern void KVDKTransactionRollback(KVDKTransaction* txn); + +// For String KV extern KVDKStatus KVDKGet(KVDKEngine* engine, const char* key, size_t key_len, size_t* val_len, char** val); extern KVDKStatus KVDKPut(KVDKEngine* engine, const char* key, size_t key_len, diff --git a/include/kvdk/engine.hpp b/include/kvdk/engine.hpp index 44ba87e2..6a0ba7ed 100644 --- a/include/kvdk/engine.hpp +++ b/include/kvdk/engine.hpp @@ -10,6 +10,7 @@ #include "comparator.hpp" #include "configs.hpp" #include "iterator.hpp" +#include "transaction.hpp" #include "types.hpp" #include "write_batch.hpp" @@ -78,6 +79,8 @@ class Engine { virtual std::unique_ptr WriteBatchCreate() = 0; + virtual std::unique_ptr TransactionCreate() = 0; + // Search the STRING-type KV of "key" and store the corresponding value to // *value on success. If the "key" does not exist, return NotFound. 
virtual Status Get(const StringView key, std::string* value) = 0; diff --git a/include/kvdk/transaction.hpp b/include/kvdk/transaction.hpp new file mode 100644 index 00000000..b52c3fa7 --- /dev/null +++ b/include/kvdk/transaction.hpp @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2021 Intel Corporation + */ + +#pragma once + +#include + +#include "types.hpp" + +namespace KVDK_NAMESPACE { +class Transaction { + public: + // TODO: use StringView instead of std::string + virtual Status StringPut(const std::string& key, + const std::string& value) = 0; + virtual Status StringDelete(const std::string& key) = 0; + virtual Status StringGet(const std::string& key, std::string* value) = 0; + virtual Status SortedPut(const std::string& collection, + const std::string& key, + const std::string& value) = 0; + virtual Status SortedDelete(const std::string& collection, + const std::string& key) = 0; + virtual Status SortedGet(const std::string& collection, + const std::string& key, std::string* value) = 0; + virtual Status HashPut(const std::string& collection, const std::string& key, + const std::string& value) = 0; + virtual Status HashDelete(const std::string& collection, + const std::string& key) = 0; + virtual Status HashGet(const std::string& collection, const std::string& key, + std::string* value) = 0; + + virtual Status Commit() = 0; + virtual void Rollback() = 0; + virtual Status InternalStatus() = 0; + virtual ~Transaction() = default; +}; +} // namespace KVDK_NAMESPACE \ No newline at end of file diff --git a/include/kvdk/types.h b/include/kvdk/types.h index 10554404..dfa97b8a 100644 --- a/include/kvdk/types.h +++ b/include/kvdk/types.h @@ -70,6 +70,7 @@ __attribute__((unused)) static char const* KVDKValueTypeString[] = { GEN(IOError) \ GEN(InvalidConfiguration) \ GEN(Fail) \ + GEN(Timeout) \ GEN(Abort) typedef enum { KVDK_STATUS(GENERATE_ENUM) } KVDKStatus; diff --git a/include/kvdk/write_batch.hpp b/include/kvdk/write_batch.hpp index 
63b9949d..a7b8784d 100644 --- a/include/kvdk/write_batch.hpp +++ b/include/kvdk/write_batch.hpp @@ -15,14 +15,15 @@ class WriteBatch { virtual void StringPut(std::string const& key, std::string const& value) = 0; virtual void StringDelete(std::string const& key) = 0; - virtual void SortedPut(std::string const& key, std::string const& field, + virtual void SortedPut(std::string const& collection, std::string const& key, std::string const& value) = 0; - virtual void SortedDelete(std::string const& key, - std::string const& field) = 0; + virtual void SortedDelete(std::string const& collection, + std::string const& key) = 0; - virtual void HashPut(std::string const& key, std::string const& field, + virtual void HashPut(std::string const& collection, std::string const& key, std::string const& value) = 0; - virtual void HashDelete(std::string const& key, std::string const& field) = 0; + virtual void HashDelete(std::string const& collection, + std::string const& key) = 0; virtual void Clear() = 0; diff --git a/tests/tests.cpp b/tests/tests.cpp index 62c4da8d..2fe32c97 100644 --- a/tests/tests.cpp +++ b/tests/tests.cpp @@ -355,6 +355,8 @@ class EngineBasicTest : public testing::Test { class BatchWriteTest : public EngineBasicTest {}; +class TrasactionTest : public EngineBasicTest {}; + TEST_F(EngineBasicTest, TestUniqueKey) { std::string sorted_collection("sorted_collection"); std::string hash_collection("unordered_collection"); @@ -2407,6 +2409,398 @@ TEST_F(EngineBasicTest, TestbackgroundDestroyCollections) { delete engine; } +TEST_F(TrasactionTest, TransactionBasic) { + size_t num_threads = 32; + configs.max_access_threads = num_threads; + ASSERT_EQ(Engine::Open(db_path, &engine, configs, stdout), Status::Ok); + int amount = num_threads; + int transfer_amount = 1; + std::vector accounts{"Jack", "Tom"}; + int round = 10000; + std::string sorted_bank{"sorted_bank"}; + std::string hash_bank{"hash_bank"}; + + ASSERT_EQ(engine->SortedCreate(sorted_bank), Status::Ok); + 
ASSERT_EQ(engine->HashCreate(hash_bank), Status::Ok); + + auto string_transfer_txn = [&](size_t) { + int cnt = round; + while (cnt--) { + std::string payer; + std::string receiver; + if (fast_random_64() % 2 == 0) { + payer = accounts[0]; + receiver = accounts[1]; + } else { + payer = accounts[1]; + receiver = accounts[0]; + } + + std::string payer_balance; + std::string receiver_balance; + bool exist = true; + + auto txn = engine->TransactionCreate(); + Status s = txn->StringGet(payer, &payer_balance); + if (s == Status::Ok) { + s = txn->StringGet(receiver, &receiver_balance); + if (s == Status::Timeout) { + txn->Rollback(); + continue; + } else { + ASSERT_EQ(s, Status::Ok); + } + } else if (s == Status::NotFound) { + exist = false; + s = txn->StringGet(receiver, &receiver_balance); + if (s == Status::Timeout) { + txn->Rollback(); + continue; + } else { + ASSERT_EQ(s, Status::NotFound); + payer_balance = std::to_string(amount); + receiver_balance = std::to_string(amount); + // Key is locked in previous get, so no conflict + ASSERT_EQ(txn->StringPut(payer, payer_balance), Status::Ok); + ASSERT_EQ(txn->StringPut(receiver, receiver_balance), Status::Ok); + } + } else { + ASSERT_EQ(s, Status::Timeout); + txn->Rollback(); + continue; + } + ASSERT_EQ(std::stoi(payer_balance) + std::stoi(receiver_balance), + 2 * amount); + if (std::stoi(payer_balance) == 0) { + txn->Rollback(); + continue; + } + + // Do transfer + std::string payer_balance_after_transfer = + std::to_string(std::stoi(payer_balance) - transfer_amount); + std::string receiver_balance_after_transfer = + std::to_string(std::stoi(receiver_balance) + transfer_amount); + ASSERT_EQ(txn->StringPut(payer, payer_balance_after_transfer), + Status::Ok); + ASSERT_EQ(txn->StringPut(receiver, receiver_balance_after_transfer), + Status::Ok); + std::string value; + // Un-commited change should be visible by this transaction, but invisible + // outsider the transaction + ASSERT_EQ(txn->StringGet(payer, &value), Status::Ok); 
+ ASSERT_EQ(value, payer_balance_after_transfer); + ASSERT_EQ(txn->StringGet(receiver, &value), Status::Ok); + ASSERT_EQ(value, receiver_balance_after_transfer); + ASSERT_EQ(std::stoi(payer_balance_after_transfer) + + std::stoi(receiver_balance_after_transfer), + 2 * amount); + if (exist) { + ASSERT_EQ(engine->Get(payer, &value), Status::Ok); + ASSERT_EQ(value, payer_balance); + ASSERT_EQ(engine->Get(receiver, &value), Status::Ok); + ASSERT_EQ(value, receiver_balance); + } else { + ASSERT_EQ(engine->Get(payer, &value), Status::NotFound); + ASSERT_EQ(engine->Get(receiver, &value), Status::NotFound); + } + ASSERT_EQ(txn->Commit(), Status::Ok); + break; + } + }; + + auto sorted_transfer_txn = [&](size_t) { + int cnt = round; + while (cnt--) { + std::string payer; + std::string receiver; + if (fast_random_64() % 2 == 0) { + payer = accounts[0]; + receiver = accounts[1]; + } else { + payer = accounts[1]; + receiver = accounts[0]; + } + + std::string payer_balance; + std::string receiver_balance; + bool exist = true; + + auto txn = engine->TransactionCreate(); + Status s = txn->SortedGet(sorted_bank, payer, &payer_balance); + if (s == Status::Ok) { + s = txn->SortedGet(sorted_bank, receiver, &receiver_balance); + if (s == Status::Timeout) { + txn->Rollback(); + continue; + } else { + ASSERT_EQ(s, Status::Ok); + } + } else if (s == Status::NotFound) { + exist = false; + s = txn->SortedGet(sorted_bank, receiver, &receiver_balance); + if (s == Status::Timeout) { + txn->Rollback(); + continue; + } else { + ASSERT_EQ(s, Status::NotFound); + payer_balance = std::to_string(amount); + receiver_balance = std::to_string(amount); + // Key is locked in previous get, so no conflict + ASSERT_EQ(txn->SortedPut(sorted_bank, payer, payer_balance), + Status::Ok); + ASSERT_EQ(txn->SortedPut(sorted_bank, receiver, receiver_balance), + Status::Ok); + } + } else { + ASSERT_EQ(s, Status::Timeout); + txn->Rollback(); + continue; + } + ASSERT_EQ(std::stoi(payer_balance) + 
std::stoi(receiver_balance), + 2 * amount); + if (std::stoi(payer_balance) == 0) { + txn->Rollback(); + continue; + } + + // Do transfer + std::string payer_balance_after_transfer = + std::to_string(std::stoi(payer_balance) - transfer_amount); + std::string receiver_balance_after_transfer = + std::to_string(std::stoi(receiver_balance) + transfer_amount); + ASSERT_EQ( + txn->SortedPut(sorted_bank, payer, payer_balance_after_transfer), + Status::Ok); + ASSERT_EQ(txn->SortedPut(sorted_bank, receiver, + receiver_balance_after_transfer), + Status::Ok); + std::string value; + // Un-commited change should be visible by this transaction, but invisible + // outsider the transaction + ASSERT_EQ(txn->SortedGet(sorted_bank, payer, &value), Status::Ok); + ASSERT_EQ(value, payer_balance_after_transfer); + ASSERT_EQ(txn->SortedGet(sorted_bank, receiver, &value), Status::Ok); + ASSERT_EQ(value, receiver_balance_after_transfer); + ASSERT_EQ(std::stoi(payer_balance_after_transfer) + + std::stoi(receiver_balance_after_transfer), + 2 * amount); + if (exist) { + ASSERT_EQ(engine->SortedGet(sorted_bank, payer, &value), Status::Ok); + ASSERT_EQ(value, payer_balance); + ASSERT_EQ(engine->SortedGet(sorted_bank, receiver, &value), Status::Ok); + ASSERT_EQ(value, receiver_balance); + } else { + ASSERT_EQ(engine->SortedGet(sorted_bank, payer, &value), + Status::NotFound); + ASSERT_EQ(engine->SortedGet(sorted_bank, receiver, &value), + Status::NotFound); + } + ASSERT_EQ(txn->Commit(), Status::Ok); + break; + } + }; + + auto hash_transfer_txn = [&](size_t) { + int cnt = round; + while (cnt--) { + std::string payer; + std::string receiver; + if (fast_random_64() % 2 == 0) { + payer = accounts[0]; + receiver = accounts[1]; + } else { + payer = accounts[1]; + receiver = accounts[0]; + } + + std::string payer_balance; + std::string receiver_balance; + bool exist = true; + + auto txn = engine->TransactionCreate(); + Status s = txn->HashGet(hash_bank, payer, &payer_balance); + if (s == Status::Ok) { 
+ s = txn->HashGet(hash_bank, receiver, &receiver_balance); + if (s == Status::Timeout) { + txn->Rollback(); + continue; + } else { + ASSERT_EQ(s, Status::Ok); + } + } else if (s == Status::NotFound) { + exist = false; + s = txn->HashGet(hash_bank, receiver, &receiver_balance); + if (s == Status::Timeout) { + txn->Rollback(); + continue; + } else { + ASSERT_EQ(s, Status::NotFound); + payer_balance = std::to_string(amount); + receiver_balance = std::to_string(amount); + // Key is locked in previous get, so no conflict + ASSERT_EQ(txn->HashPut(hash_bank, payer, payer_balance), Status::Ok); + ASSERT_EQ(txn->HashPut(hash_bank, receiver, receiver_balance), + Status::Ok); + } + } else { + ASSERT_EQ(s, Status::Timeout); + txn->Rollback(); + continue; + } + ASSERT_EQ(std::stoi(payer_balance) + std::stoi(receiver_balance), + 2 * amount); + if (std::stoi(payer_balance) == 0) { + txn->Rollback(); + continue; + } + + // Do transfer + std::string payer_balance_after_transfer = + std::to_string(std::stoi(payer_balance) - transfer_amount); + std::string receiver_balance_after_transfer = + std::to_string(std::stoi(receiver_balance) + transfer_amount); + ASSERT_EQ(txn->HashPut(hash_bank, payer, payer_balance_after_transfer), + Status::Ok); + ASSERT_EQ( + txn->HashPut(hash_bank, receiver, receiver_balance_after_transfer), + Status::Ok); + std::string value; + // Un-commited change should be visible by this transaction, but invisible + // outsider the transaction + ASSERT_EQ(txn->HashGet(hash_bank, payer, &value), Status::Ok); + ASSERT_EQ(value, payer_balance_after_transfer); + ASSERT_EQ(txn->HashGet(hash_bank, receiver, &value), Status::Ok); + ASSERT_EQ(value, receiver_balance_after_transfer); + ASSERT_EQ(std::stoi(payer_balance_after_transfer) + + std::stoi(receiver_balance_after_transfer), + 2 * amount); + if (exist) { + ASSERT_EQ(engine->HashGet(hash_bank, payer, &value), Status::Ok); + ASSERT_EQ(value, payer_balance); + ASSERT_EQ(engine->HashGet(hash_bank, receiver, &value), 
Status::Ok); + ASSERT_EQ(value, receiver_balance); + } else { + ASSERT_EQ(engine->HashGet(hash_bank, payer, &value), Status::NotFound); + ASSERT_EQ(engine->HashGet(hash_bank, receiver, &value), + Status::NotFound); + } + ASSERT_EQ(txn->Commit(), Status::Ok); + break; + } + }; + + auto not_existed_collection_txn = [&]() { + std::string not_exist_collection("not_exist"); + std::string key("key"); + std::string val("val"); + std::string got_val; + auto txn = engine->TransactionCreate(); + ASSERT_TRUE(txn != nullptr); + ASSERT_EQ(txn->SortedPut(not_exist_collection, key, val), Status::NotFound); + ASSERT_EQ(txn->SortedDelete(not_exist_collection, key), Status::NotFound); + ASSERT_EQ(txn->SortedGet(not_exist_collection, key, &got_val), + Status::NotFound); + + ASSERT_EQ(txn->HashPut(not_exist_collection, key, val), Status::NotFound); + ASSERT_EQ(txn->HashDelete(not_exist_collection, key), Status::NotFound); + ASSERT_EQ(txn->HashGet(not_exist_collection, key, &got_val), + Status::NotFound); + }; + + LaunchNThreads(num_threads, string_transfer_txn); + LaunchNThreads(num_threads, sorted_transfer_txn); + LaunchNThreads(num_threads, hash_transfer_txn); + std::vector balances{2}; + ASSERT_EQ(engine->Get(accounts[0], &balances[0]), Status::Ok); + ASSERT_EQ(engine->Get(accounts[1], &balances[1]), Status::Ok); + ASSERT_EQ(std::stoi(balances[0]) + std::stoi(balances[1]), 2 * amount); + ASSERT_EQ(engine->SortedGet(sorted_bank, accounts[0], &balances[0]), + Status::Ok); + ASSERT_EQ(engine->SortedGet(sorted_bank, accounts[1], &balances[1]), + Status::Ok); + ASSERT_EQ(std::stoi(balances[0]) + std::stoi(balances[1]), 2 * amount); + ASSERT_EQ(engine->HashGet(hash_bank, accounts[0], &balances[0]), Status::Ok); + ASSERT_EQ(engine->HashGet(hash_bank, accounts[1], &balances[1]), Status::Ok); + ASSERT_EQ(std::stoi(balances[0]) + std::stoi(balances[1]), 2 * amount); + + not_existed_collection_txn(); + + delete engine; +} + +TEST_F(TrasactionTest, Conflict) { + configs.max_access_threads = 3; 
+ ASSERT_EQ(Engine::Open(db_path, &engine, configs, stdout), Status::Ok); + std::string sorted_collection{"sorted_collection"}; + std::string hash_collection{"hash_collection"}; + std::string key{"key"}; + std::string val("val"); + ASSERT_EQ(engine->SortedCreate(sorted_collection), Status::Ok); + ASSERT_EQ(engine->HashCreate(hash_collection), Status::Ok); + + Status expected_return = Status::Timeout; + + auto operation_thread = [&](size_t) { + std::string got_val; + auto txn = engine->TransactionCreate(); + ASSERT_TRUE(txn != nullptr); + ASSERT_EQ(txn->StringPut(key, val), expected_return); + ASSERT_EQ(txn->StringGet(key, &got_val), expected_return); + if (expected_return == Status::Ok) { + ASSERT_EQ(got_val, val); + } + ASSERT_EQ(txn->StringDelete(key), expected_return); + txn->Rollback(); + + ASSERT_EQ(txn->SortedPut(sorted_collection, key, val), expected_return); + ASSERT_EQ(txn->SortedGet(sorted_collection, key, &got_val), + expected_return); + if (expected_return == Status::Ok) { + ASSERT_EQ(got_val, val); + } + ASSERT_EQ(txn->SortedDelete(sorted_collection, key), expected_return); + txn->Rollback(); + + ASSERT_EQ(txn->HashPut(hash_collection, key, val), expected_return); + ASSERT_EQ(txn->HashGet(hash_collection, key, &got_val), expected_return); + if (expected_return == Status::Ok) { + ASSERT_EQ(got_val, val); + } + ASSERT_EQ(txn->HashDelete(hash_collection, key), expected_return); + txn->Rollback(); + }; + + auto txn = engine->TransactionCreate(); + ASSERT_EQ(txn->StringPut(key, val), Status::Ok); + ASSERT_EQ(txn->SortedPut(sorted_collection, key, val), Status::Ok); + ASSERT_EQ(txn->HashPut(hash_collection, key, val), Status::Ok); + LaunchNThreads(1, operation_thread); + ASSERT_EQ(txn->Commit(), Status::Ok); + + std::string got_val; + ASSERT_EQ(txn->StringGet(key, &got_val), Status::Ok); + ASSERT_EQ(got_val, val); + ASSERT_EQ(txn->SortedGet(sorted_collection, key, &got_val), Status::Ok); + ASSERT_EQ(got_val, val); + ASSERT_EQ(txn->HashGet(hash_collection, 
key, &got_val), Status::Ok); + ASSERT_EQ(got_val, val); + LaunchNThreads(1, operation_thread); + ASSERT_EQ(txn->Commit(), Status::Ok); + + expected_return = Status::Ok; + LaunchNThreads(1, operation_thread); + + expected_return = Status::Timeout; + ASSERT_EQ(txn->StringDelete(key), Status::Ok); + ASSERT_EQ(txn->SortedDelete(sorted_collection, key), Status::Ok); + ASSERT_EQ(txn->HashDelete(hash_collection, key), Status::Ok); + LaunchNThreads(1, operation_thread); + ASSERT_EQ(txn->Commit(), Status::Ok); + + delete engine; +} + // ========================= Sync Point ====================================== #if KVDK_DEBUG_LEVEL > 0 From 61bb464a56ca1362f91a4d5c959c5ed10e81b6a1 Mon Sep 17 00:00:00 2001 From: Jiayu Wu Date: Fri, 30 Sep 2022 09:20:24 +0800 Subject: [PATCH 14/23] Align reference of std::string in header files to StringView (#345) Signed-off-by: Jiayu Wu --- engine/engine.cpp | 6 +++--- engine/kv_engine.cpp | 28 +++++++++++++----------- engine/kv_engine.hpp | 8 +++---- engine/transaction_impl.cpp | 35 +++++++++++++++--------------- engine/transaction_impl.hpp | 24 ++++++++++----------- engine/write_batch_impl.hpp | 41 ++++++++++++++++++++++++------------ include/kvdk/engine.hpp | 6 +++--- include/kvdk/transaction.hpp | 30 ++++++++++++-------------- include/kvdk/write_batch.hpp | 24 ++++++++++----------- 9 files changed, 108 insertions(+), 94 deletions(-) diff --git a/engine/engine.cpp b/engine/engine.cpp index 1f2bf1bd..9ba3bf3a 100644 --- a/engine/engine.cpp +++ b/engine/engine.cpp @@ -7,15 +7,15 @@ #include "kv_engine.hpp" namespace KVDK_NAMESPACE { -Status Engine::Open(const std::string& name, Engine** engine_ptr, +Status Engine::Open(const StringView name, Engine** engine_ptr, const Configs& configs, FILE* log_file) { GlobalLogger.Init(log_file, configs.log_level); Status s = KVEngine::Open(name, engine_ptr, configs); return s; } -Status Engine::Restore(const std::string& engine_path, - const std::string& backup_file, Engine** engine_ptr, +Status 
Engine::Restore(const StringView engine_path, + const StringView backup_file, Engine** engine_ptr, const Configs& configs, FILE* log_file) { GlobalLogger.Init(log_file, configs.log_level); Status s = KVEngine::Restore(engine_path, backup_file, engine_ptr, configs); diff --git a/engine/kv_engine.cpp b/engine/kv_engine.cpp index 1e22b78e..d2220a86 100644 --- a/engine/kv_engine.cpp +++ b/engine/kv_engine.cpp @@ -57,11 +57,13 @@ KVEngine::~KVEngine() { GlobalLogger.Info("Instance closed\n"); } -Status KVEngine::Open(const std::string& name, Engine** engine_ptr, +Status KVEngine::Open(const StringView engine_path, Engine** engine_ptr, const Configs& configs) { - GlobalLogger.Info("Opening kvdk instance from %s ...\n", name.c_str()); + std::string engine_path_str(string_view_2_string(engine_path)); + GlobalLogger.Info("Opening kvdk instance from %s ...\n", + engine_path_str.c_str()); KVEngine* engine = new KVEngine(configs); - Status s = engine->init(name, configs); + Status s = engine->init(engine_path_str, configs); if (s == Status::Ok) { s = engine->restoreExistingData(); } @@ -76,16 +78,18 @@ Status KVEngine::Open(const std::string& name, Engine** engine_ptr, return s; } -Status KVEngine::Restore(const std::string& engine_path, - const std::string& backup_log, Engine** engine_ptr, +Status KVEngine::Restore(const StringView engine_path, + const StringView backup_log, Engine** engine_ptr, const Configs& configs) { + std::string engine_path_str(string_view_2_string(engine_path)); + std::string backup_log_str(string_view_2_string(backup_log)); GlobalLogger.Info( "Restoring kvdk instance from backup log %s to engine path %s\n", - backup_log.c_str(), engine_path.c_str()); + backup_log_str.c_str(), engine_path_str.c_str()); KVEngine* engine = new KVEngine(configs); - Status s = engine->init(engine_path, configs); + Status s = engine->init(engine_path_str, configs); if (s == Status::Ok) { - s = engine->restoreDataFromBackup(backup_log); + s = 
engine->restoreDataFromBackup(backup_log_str); } if (s == Status::Ok) { @@ -94,7 +98,7 @@ Status KVEngine::Restore(const std::string& engine_path, engine->ReportPMemUsage(); } else { GlobalLogger.Error("Restore kvdk instance from backup log %s failed: %d\n", - backup_log.c_str(), s); + backup_log_str.c_str(), s); delete engine; } return s; @@ -158,12 +162,12 @@ Status KVEngine::init(const std::string& name, const Configs& configs) { return Status::IOError; } - db_file_ = data_file(); + data_file_ = data_file(); configs_ = configs; } else { configs_ = configs; - db_file_ = name; + data_file_ = name; // The devdax mode need to execute the shell scripts/init_devdax.sh, // then a fsdax model namespace will be created and the @@ -185,7 +189,7 @@ Status KVEngine::init(const std::string& name, const Configs& configs) { } pmem_allocator_.reset(PMEMAllocator::NewPMEMAllocator( - db_file_, configs_.pmem_file_size, configs_.pmem_segment_blocks, + data_file_, configs_.pmem_file_size, configs_.pmem_segment_blocks, configs_.pmem_block_size, configs_.max_access_threads, configs_.populate_pmem_space, configs_.use_devdax_mode, &version_controller_)); diff --git a/engine/kv_engine.hpp b/engine/kv_engine.hpp index 61647120..c2d6ad8c 100644 --- a/engine/kv_engine.hpp +++ b/engine/kv_engine.hpp @@ -48,11 +48,11 @@ class KVEngine : public Engine { public: ~KVEngine(); - static Status Open(const std::string& name, Engine** engine_ptr, + static Status Open(const StringView engine_path, Engine** engine_ptr, const Configs& configs); - static Status Restore(const std::string& engine_path, - const std::string& backup_log, Engine** engine_ptr, + static Status Restore(const StringView engine_path, + const StringView backup_log, Engine** engine_ptr, const Configs& configs); Snapshot* GetSnapshot(bool make_checkpoint) final; @@ -641,7 +641,7 @@ class KVEngine : public Engine { std::string dir_; std::string batch_log_dir_; - std::string db_file_; + std::string data_file_; std::unique_ptr 
pmem_allocator_; Configs configs_; bool closing_{false}; diff --git a/engine/transaction_impl.cpp b/engine/transaction_impl.cpp index 7938bbf6..51bcde49 100644 --- a/engine/transaction_impl.cpp +++ b/engine/transaction_impl.cpp @@ -23,8 +23,8 @@ TransactionImpl::TransactionImpl(KVEngine* engine) TransactionImpl::~TransactionImpl() { Rollback(); } -Status TransactionImpl::StringPut(const std::string& key, - const std::string& value) { +Status TransactionImpl::StringPut(const StringView key, + const StringView value) { auto hash_table = engine_->GetHashTable(); if (!tryLock(hash_table->GetLock(key))) { status_ = Status::Timeout; @@ -35,7 +35,7 @@ Status TransactionImpl::StringPut(const std::string& key, return Status::Ok; } -Status TransactionImpl::StringDelete(const std::string& key) { +Status TransactionImpl::StringDelete(const StringView key) { auto hash_table = engine_->GetHashTable(); if (!tryLock(hash_table->GetLock(key))) { status_ = Status::Timeout; @@ -46,7 +46,7 @@ Status TransactionImpl::StringDelete(const std::string& key) { return Status::Ok; } -Status TransactionImpl::StringGet(const std::string& key, std::string* value) { +Status TransactionImpl::StringGet(const StringView key, std::string* value) { auto op = batch_->StringGet(key); if (op != nullptr) { if (op->op == WriteOp::Delete) { @@ -65,9 +65,9 @@ Status TransactionImpl::StringGet(const std::string& key, std::string* value) { } } -Status TransactionImpl::SortedPut(const std::string& collection, - const std::string& key, - const std::string& value) { +Status TransactionImpl::SortedPut(const StringView collection, + const StringView key, + const StringView value) { acquireCollectionTransaction(); auto hash_table = engine_->GetHashTable(); auto lookup_result = @@ -87,8 +87,8 @@ Status TransactionImpl::SortedPut(const std::string& collection, return Status::Ok; } -Status TransactionImpl::SortedDelete(const std::string& collection, - const std::string& key) { +Status 
TransactionImpl::SortedDelete(const StringView collection, + const StringView key) { acquireCollectionTransaction(); auto hash_table = engine_->GetHashTable(); auto lookup_result = @@ -108,8 +108,8 @@ Status TransactionImpl::SortedDelete(const std::string& collection, return Status::Ok; } -Status TransactionImpl::SortedGet(const std::string& collection, - const std::string& key, std::string* value) { +Status TransactionImpl::SortedGet(const StringView collection, + const StringView key, std::string* value) { auto op = batch_->SortedGet(collection, key); if (op != nullptr) { if (op->op == WriteOp::Delete) { @@ -138,9 +138,8 @@ Status TransactionImpl::SortedGet(const std::string& collection, } } -Status TransactionImpl::HashPut(const std::string& collection, - const std::string& key, - const std::string& value) { +Status TransactionImpl::HashPut(const StringView collection, + const StringView key, const StringView value) { acquireCollectionTransaction(); auto hash_table = engine_->GetHashTable(); auto lookup_result = @@ -160,8 +159,8 @@ Status TransactionImpl::HashPut(const std::string& collection, return Status::Ok; } -Status TransactionImpl::HashDelete(const std::string& collection, - const std::string& key) { +Status TransactionImpl::HashDelete(const StringView collection, + const StringView key) { acquireCollectionTransaction(); auto hash_table = engine_->GetHashTable(); auto lookup_result = @@ -181,8 +180,8 @@ Status TransactionImpl::HashDelete(const std::string& collection, return Status::Ok; } -Status TransactionImpl::HashGet(const std::string& collection, - const std::string& key, std::string* value) { +Status TransactionImpl::HashGet(const StringView collection, + const StringView key, std::string* value) { auto op = batch_->HashGet(collection, key); if (op != nullptr) { if (op->op == WriteOp::Delete) { diff --git a/engine/transaction_impl.hpp b/engine/transaction_impl.hpp index 40257a6e..00ef3bf6 100644 --- a/engine/transaction_impl.hpp +++ 
b/engine/transaction_impl.hpp @@ -100,20 +100,18 @@ class TransactionImpl final : public Transaction { public: TransactionImpl(KVEngine* engine); ~TransactionImpl() final; - Status StringPut(const std::string& key, const std::string& value) final; - Status StringDelete(const std::string& key) final; - Status StringGet(const std::string& key, std::string* value) final; - Status SortedPut(const std::string& collection, const std::string& key, - const std::string& value) final; - Status SortedDelete(const std::string& collection, - const std::string& key) final; - Status SortedGet(const std::string& collection, const std::string& key, + Status StringPut(const StringView key, const StringView value) final; + Status StringDelete(const StringView key) final; + Status StringGet(const StringView key, std::string* value) final; + Status SortedPut(const StringView collection, const StringView key, + const StringView value) final; + Status SortedDelete(const StringView collection, const StringView key) final; + Status SortedGet(const StringView collection, const StringView key, std::string* value) final; - Status HashPut(const std::string& collection, const std::string& key, - const std::string& value) final; - Status HashDelete(const std::string& collection, - const std::string& key) final; - Status HashGet(const std::string& collection, const std::string& key, + Status HashPut(const StringView collection, const StringView key, + const StringView value) final; + Status HashDelete(const StringView collection, const StringView key) final; + Status HashGet(const StringView collection, const StringView key, std::string* value) final; Status Commit() final; void Rollback() final; diff --git a/engine/write_batch_impl.hpp b/engine/write_batch_impl.hpp index e82dee6f..7ce49045 100644 --- a/engine/write_batch_impl.hpp +++ b/engine/write_batch_impl.hpp @@ -17,12 +17,22 @@ namespace KVDK_NAMESPACE { class WriteBatchImpl final : public WriteBatch { public: struct StringOp { + 
StringOp(WriteOp o, const StringView& k, const StringView& v) + : op(o), key(string_view_2_string(k)), value(string_view_2_string(v)) {} + WriteOp op; std::string key; std::string value; }; struct SortedOp { + SortedOp(WriteOp o, const StringView& c, const StringView& k, + const StringView& v) + : op(o), + collection(string_view_2_string(c)), + key(string_view_2_string(k)), + value(string_view_2_string(v)) {} + WriteOp op; std::string collection; std::string key; @@ -30,6 +40,13 @@ class WriteBatchImpl final : public WriteBatch { }; struct HashOp { + HashOp(WriteOp o, const StringView& c, const StringView& k, + const StringView& v) + : op(o), + collection(string_view_2_string(c)), + key(string_view_2_string(k)), + value(string_view_2_string(v)) {} + WriteOp op; std::string collection; std::string key; @@ -57,40 +74,39 @@ class WriteBatchImpl final : public WriteBatch { } }; - void StringPut(std::string const& key, std::string const& value) final { + void StringPut(const StringView key, const StringView value) final { StringOp op{WriteOp::Put, key, value}; string_ops_.erase(op); string_ops_.insert(op); } - void StringDelete(std::string const& key) final { + void StringDelete(const StringView key) final { StringOp op{WriteOp::Delete, key, std::string{}}; string_ops_.erase(op); string_ops_.insert(op); } - void SortedPut(std::string const& collection, std::string const& key, - std::string const& value) final { + void SortedPut(const StringView collection, const StringView key, + const StringView value) final { SortedOp op{WriteOp::Put, collection, key, value}; sorted_ops_.erase(op); sorted_ops_.insert(op); } - void SortedDelete(std::string const& collection, - std::string const& key) final { + void SortedDelete(const StringView collection, const StringView key) final { SortedOp op{WriteOp::Delete, collection, key, std::string{}}; sorted_ops_.erase(op); sorted_ops_.insert(op); } - void HashPut(std::string const& collection, std::string const& key, - std::string const& 
value) final { + void HashPut(const StringView collection, const StringView key, + const StringView value) final { HashOp op{WriteOp::Put, collection, key, value}; hash_ops_.erase(op); hash_ops_.insert(op); } - void HashDelete(std::string const& collection, std::string const& key) final { + void HashDelete(const StringView collection, const StringView key) final { HashOp op{WriteOp::Delete, collection, key, std::string{}}; hash_ops_.erase(op); hash_ops_.insert(op); @@ -98,7 +114,7 @@ class WriteBatchImpl final : public WriteBatch { // Get a string op from this batch // if key not exist in this batch, return nullptr - const StringOp* StringGet(std::string const& key) { + const StringOp* StringGet(const StringView key) { StringOp op{WriteOp::Put, key, ""}; auto iter = string_ops_.find(op); if (iter == string_ops_.end()) { @@ -109,8 +125,7 @@ class WriteBatchImpl final : public WriteBatch { // Get a sorted op from this batch // if collection key not exist in this batch, return nullptr - const SortedOp* SortedGet(const std::string& collection, - const std::string& key) { + const SortedOp* SortedGet(const StringView collection, const StringView key) { SortedOp op{WriteOp::Put, collection, key, ""}; auto iter = sorted_ops_.find(op); if (iter == sorted_ops_.end()) { @@ -121,7 +136,7 @@ class WriteBatchImpl final : public WriteBatch { // Get a hash op from this batch // if the collection key not exist in this batch, return nullptr - const HashOp* HashGet(std::string const& collection, std::string const& key) { + const HashOp* HashGet(const StringView collection, const StringView key) { HashOp op{WriteOp::Put, collection, key, ""}; auto iter = hash_ops_.find(op); if (iter == hash_ops_.end()) { diff --git a/include/kvdk/engine.hpp b/include/kvdk/engine.hpp index 6a0ba7ed..d408191b 100644 --- a/include/kvdk/engine.hpp +++ b/include/kvdk/engine.hpp @@ -32,7 +32,7 @@ class Engine { // Return Status::Ok on sucess, return other status for any error // // To close the instance, 
just delete *engine_ptr. - static Status Open(const std::string& engine_path, Engine** engine_ptr, + static Status Open(const StringView engine_path, Engine** engine_ptr, const Configs& configs, FILE* log_file = stdout); // Restore a KVDK instance from a backup log file. @@ -49,8 +49,8 @@ class Engine { // // Notice: // "engine_path" should be an empty dir - static Status Restore(const std::string& engine_path, - const std::string& backup_log, Engine** engine_ptr, + static Status Restore(const StringView engine_path, + const StringView backup_log, Engine** engine_ptr, const Configs& configs, FILE* log_file = stdout); virtual Status TypeOf(StringView key, ValueType* type) = 0; diff --git a/include/kvdk/transaction.hpp b/include/kvdk/transaction.hpp index b52c3fa7..05cbf739 100644 --- a/include/kvdk/transaction.hpp +++ b/include/kvdk/transaction.hpp @@ -12,22 +12,20 @@ namespace KVDK_NAMESPACE { class Transaction { public: // TODO: use StringView instead of std::string - virtual Status StringPut(const std::string& key, - const std::string& value) = 0; - virtual Status StringDelete(const std::string& key) = 0; - virtual Status StringGet(const std::string& key, std::string* value) = 0; - virtual Status SortedPut(const std::string& collection, - const std::string& key, - const std::string& value) = 0; - virtual Status SortedDelete(const std::string& collection, - const std::string& key) = 0; - virtual Status SortedGet(const std::string& collection, - const std::string& key, std::string* value) = 0; - virtual Status HashPut(const std::string& collection, const std::string& key, - const std::string& value) = 0; - virtual Status HashDelete(const std::string& collection, - const std::string& key) = 0; - virtual Status HashGet(const std::string& collection, const std::string& key, + virtual Status StringPut(const StringView key, const StringView value) = 0; + virtual Status StringDelete(const StringView key) = 0; + virtual Status StringGet(const StringView key, std::string* 
value) = 0; + virtual Status SortedPut(const StringView collection, const StringView key, + const StringView value) = 0; + virtual Status SortedDelete(const StringView collection, + const StringView key) = 0; + virtual Status SortedGet(const StringView collection, const StringView key, + std::string* value) = 0; + virtual Status HashPut(const StringView collection, const StringView key, + const StringView value) = 0; + virtual Status HashDelete(const StringView collection, + const StringView key) = 0; + virtual Status HashGet(const StringView collection, const StringView key, std::string* value) = 0; virtual Status Commit() = 0; diff --git a/include/kvdk/write_batch.hpp b/include/kvdk/write_batch.hpp index a7b8784d..b156dff1 100644 --- a/include/kvdk/write_batch.hpp +++ b/include/kvdk/write_batch.hpp @@ -12,18 +12,18 @@ namespace KVDK_NAMESPACE { class WriteBatch { public: - virtual void StringPut(std::string const& key, std::string const& value) = 0; - virtual void StringDelete(std::string const& key) = 0; - - virtual void SortedPut(std::string const& collection, std::string const& key, - std::string const& value) = 0; - virtual void SortedDelete(std::string const& collection, - std::string const& key) = 0; - - virtual void HashPut(std::string const& collection, std::string const& key, - std::string const& value) = 0; - virtual void HashDelete(std::string const& collection, - std::string const& key) = 0; + virtual void StringPut(const StringView key, const StringView value) = 0; + virtual void StringDelete(const StringView key) = 0; + + virtual void SortedPut(const StringView collection, const StringView key, + const StringView value) = 0; + virtual void SortedDelete(const StringView collection, + const StringView key) = 0; + + virtual void HashPut(const StringView collection, const StringView key, + const StringView value) = 0; + virtual void HashDelete(const StringView collection, + const StringView key) = 0; virtual void Clear() = 0; From 
5cff991ed84f4e8b6baf336c642eae40eecbb9df Mon Sep 17 00:00:00 2001 From: Jiayu Wu Date: Fri, 30 Sep 2022 12:59:44 +0800 Subject: [PATCH 15/23] Documents update and annotation completion (#346) Signed-off-by: Jiayu Wu --- doc/benchmark.md | 11 +- doc/user_doc.md | 98 +++++--- engine/hash_collection/hash_list.hpp | 11 + engine/kv_engine.cpp | 18 +- engine/kv_engine.hpp | 4 +- include/kvdk/configs.hpp | 9 +- include/kvdk/engine.hpp | 362 +++++++++++++++++++-------- include/kvdk/snapshot.hpp | 12 + include/kvdk/transaction.hpp | 72 +++++- include/kvdk/types.h | 4 +- tests/tests.cpp | 8 +- 11 files changed, 446 insertions(+), 163 deletions(-) create mode 100644 include/kvdk/snapshot.hpp diff --git a/doc/benchmark.md b/doc/benchmark.md index 887e4508..34311427 100644 --- a/doc/benchmark.md +++ b/doc/benchmark.md @@ -4,6 +4,15 @@ To test performance of KVDK, you can run our benchmark tool "bench", the tool is You can manually run individual benchmark follow the examples as shown bellow, or simply run our basic benchmark script "scripts/run_benchmark.py" to test all the basic read/write performance. +To run the script, you should first build kvdk, then run: + +``` +scripts/run_benchmark.py [data_type] [key distribution] +``` + +data_type: Which data type to benchmark, it can be string/sorted/hash/list/blackhole/all + +key distribution: Distribution of key of the benchmark workloads, it can be random/zipf/all ## Fill data to new instance To test performance, we need to first fill key-value pairs to the KVDK instance. Since KVDK did not support cross-socket access yet, we need to bind bench program to a numa node: @@ -20,7 +29,7 @@ Explanation of arguments: -space: PMem space that allocate to the KVDK instance. - -max_access_threads: Max concurrent access threads of the KVDK instance, set it to the number of the hyper-threads for performance consideration.
+ -max_access_threads: Max concurrent access threads in the KVDK instance, set it to the number of the hyper-threads for performance consideration. You can call KVDK API with any number of threads, but if your parallel threads more than max_access_threads, the performance will be degraded due to synchronization cost -type: Type of key-value pairs to benchmark, it can be "string", "hash" or "sorted". diff --git a/doc/user_doc.md b/doc/user_doc.md index 809a8e32..c982c499 100644 --- a/doc/user_doc.md +++ b/doc/user_doc.md @@ -1,9 +1,9 @@ KVDK ======= -KVDK(Key-Value Development Kit) is a Key-Value store for Persistent memory(PMem). +KVDK(Key-Value Development Kit) is a Key-Value store for Persistent Memory (PMem). -KVDK supports both sorted and unsorted KV-Pairs. +KVDK supports basic read and write operations on both sorted and unsorted KV-Pairs, it also support some advanced features, such as **backup**, **checkpoint**, **expire key**, **atomic batch write** and **transactions**. Code snippets in this user documents are from `./examples/tutorial/cpp_api_tutorial.cpp`, which is built as `./build/examples/tutorial/cpp_api_tutorial`. @@ -70,7 +70,7 @@ int main() `kvdk::Status` indicates status of KVDK function calls. Functions return `kvdk::Status::Ok` if such a function call is a success. If exceptions are raised during function calls, other `kvdk::Status` is returned, -such as `kvdk::Status::MemoryOverflow`. +such as `kvdk::Status::MemoryOverflow` while no enough memory to allocate. ## Close a KVDK instance @@ -97,26 +97,45 @@ int main() ``` ## Data types -KVDK currently supports string type for both keys and values. -### Strings -All keys and values in a KVDK instance are strings. +KVDK currently supports raw string, sorted collection, hash collection and list data type. + +### Raw String + +All keys and values in a KVDK instance are strings. 
You can directly store or read key-value pairs in global namespace, which is accessible via Get, Put, Delete and Modify operations, we call them string type data in kvdk. Keys are limited to have a maximum size of 64KB. -A string value can be at max 64MB in length by default. The maximum length can be configured when initializing a KVDK instance. +A value can be at max 64MB in length by default. The maximum length can be configured when initializing a KVDK instance. + +### Collections + +Instead of raw string, you can organize key-value pairs to a collection, each collection has its own namespace. + +Currently we have three types of collection: + +#### Sorted Collection + +KV pairs are stored with some kind of order (lexicographical order by default) in Sorted Collection, they can be iterated forward or backward starting from an arbitrary point(at a key or between two keys) by an iterator. They can also be directly accessed via SortedGet, SortedPut, SortedDelete operations. + +#### Hash Collection + +Hash Collection is like Raw String with a name space, you can access KV pairs via HashGet, HashPut, HashDelete and HashModify operations. + +In current version, performance of operations on hash collection is similar to sorted collection, which is much slower than raw-string, so we recommend using raw-string or sorted collection as a high priority. + +#### List -## Collections -All Key-Value pairs(KV-Pairs) are organized into collections. +List is a list of string elements, you can access elems at the front or back via ListPushFront, ListPushBack, ListPopFront, ListPopBack, or operation elems with index via ListInsertAt, ListInsertBefore, ListInsertAfter and ListErase. Notice that operation with index takes O(n) time, while operation on front and back only takes O(1). -There is an anonymous global collection with KV-Pairs directly accessible via Get, Put, Delete operations. The anonymous global collection is unsorted. +### Namespace -Users can also create named collections.
+Each collection has its own namespace, so you can store same key in every collection. However, collection name and raw string key are in a same namespace, so you can't assign same name for a collection and a string key, otherwise an error status (Status::WrongType) will be returned. -KVDK currently supports sorted named collections. Users can iterate forward or backward starting from an arbitrary point(at a key or between two keys) by an iterator. Elements can also be directly accessed via SortedGet, SortedPut, SortedDelete operations. +## API Examples -## Reads and Writes in Anonymous Global Collection +### Reads and Writes with String type -A KVDK instance provides Get, Put, Delete methods to query/modify/delete entries. +A KVDK instance provides Get, Put, Delete methods to query/modify/delete raw string kvs. The following code performs a series of Get, Put and Delete operations. @@ -125,7 +144,7 @@ int main() { ... Open a KVDK instance as described in "Open a KVDK instance" ... - // Reads and Writes on Anonymous Global Collection + // Reads and Writes String KV { std::string key1{"key1"}; std::string key2{"key2"}; @@ -173,11 +192,11 @@ int main() } ``` -## Reads and Writes in a Named Collection +### Reads and Writes in a Sorted Collection A KVDK instance provides SortedGet, SortedPut, SortedDelete methods to query/modify/delete sorted entries. -The following code performs a series of SortedGet, SortedPut and SortedDelete operations, which also initialize a named collection implicitly. +The following code performs a series of SortedGet, SortedPut and SortedDelete operations on a sorted collection.
```c++ int main() @@ -194,9 +213,13 @@ int main() std::string value2{"value2"}; std::string v; + // You must create sorted collections before you do any operations on them + status = engine->SortedCreate(collection1); + assert(status == kvdk::Status::Ok); + status = engine->SortedCreate(collection2); + assert(status == kvdk::Status::Ok); + // Insert key1-value1 into "my_collection_1". - // Implicitly create a collection named "my_collection_1" in which - // key1-value1 is stored. status = engine->SortedPut(collection1, key1, value1); assert(status == kvdk::Status::Ok); @@ -206,8 +229,6 @@ int main() assert(v == value1); // Insert key1-value2 into "my_collection_2". - // Implicitly create a collection named "my_collection_2" in which - // key1-value2 is stored. status = engine->SortedPut(collection2, key1, value2); assert(status == kvdk::Status::Ok); @@ -236,8 +257,13 @@ int main() status = engine->SortedDelete(collection1, key1); assert(status == kvdk::Status::Ok); - printf("Successfully performed SortedGet, SortedPut, SortedDelete operations on named " - "collections.\n"); + // Destroy sorted collections + status = engine->SortedDestroy(collection1); + assert(status == kvdk::Status::Ok); + status = engine->SortedDestroy(collection2); + assert(status == kvdk::Status::Ok); + + printf("Successfully performed SortedGet, SortedPut, SortedDelete operations.\n"); } ... Do something else with KVDK instance ... @@ -246,17 +272,18 @@ int main() } ``` -## Iterating a Named Collection -The following example demonstrates how to iterate through a named collection. It also demonstrates how to iterate through a range defined by Key. +### Iterating a Sorted Collection +The following example demonstrates how to iterate through a sorted collection at a consistent view of data. It also demonstrates how to iterate through a range defined by Key. ```c++ int main() { ... Open a KVDK instance as described in "Open a KVDK instance" ...
- // Iterating a Sorted Named Collection + // Iterating a Sorted Collection { std::string sorted_collection{"my_sorted_collection"}; + engine->SortedCreate(sorted_collection); // Create toy keys and values. std::vector> kv_pairs; for (int i = 0; i < 10; ++i) { @@ -282,7 +309,9 @@ int main() // Sort kv_pairs for checking the order of "my_sorted_collection". std::sort(kv_pairs.begin(), kv_pairs.end()); - // Iterate through collection "my_sorted_collection" + // Iterate through collection "my_sorted_collection", the iter is + // created on a consistent view while you create it, i.e. all + // modifications after you create the iter won't be observed auto iter = engine->SortedIteratorCreate(sorted_collection); iter->SeekToFirst(); { @@ -320,7 +349,7 @@ int main() } } - printf("Successfully iterated through a sorted named collections.\n"); + printf("Successfully iterated through a sorted collection.\n"); engine->SortedIteratorRelease(iter); } @@ -330,7 +359,7 @@ int main() } ``` -## Atomic Updates +### Atomic Updates KVDK supports organizing a series of Put, Delete operations into a `kvdk::WriteBatch` object as an atomic operation. If KVDK fail to apply the `kvdk::WriteBatch` object as a whole, i.e. the system shuts down during applying the batch, it will roll back to the status right before applying the `kvdk::WriteBatch`. ```c++ @@ -387,7 +416,12 @@ A KVDK instance can be accessed by multiple read and write threads safely. Synch Users can configure KVDK to adapt to their system environment by setting up a `kvdk::Configs` object and passing it to 'kvdk::Engine::Open' when initializing a KVDK instance. ### Max Access Threads -Maximum number of access threads is specified by `kvdk::Configs::max_access_threads`. Defaulted to 48. It's recommended to set this number to the number of threads provided by CPU. +Maximum number of internal access threads in kvdk is specified by `kvdk::Configs::max_access_threads`. Defaulted to 64.
It's recommended to set this number to the number of threads provided by CPU. + +You can call KVDK API with any number of threads, but if your parallel threads more than max_access_threads, the performance will be degraded due to synchronization cost + +### Clean Threads +KVDK reclaim space of updated/deleted data in background with dynamic number of clean threads, you can specify max clean thread number with `kvdk::Configs::clean_threads`. Defaulted to 8, you can config more clean threads in delete intensive workloads to avoid space be exhausted. ### PMem File Size `kvdk::Configs::pmem_file_size` specifies the space allocated to a KVDK instance. Defaulted to 2^38Bytes = 256GB. @@ -418,3 +452,7 @@ Specified by `kvdk::Configs::hash_bucket_num`. Greater number will improve perfo ### Buckets per Slot Specified by `kvdk::Configs::num_buckets_per_slot`. Smaller number will improve performance by reducing lock contentions and improving caching at the cost of greater DRAM space. Please read Architecture Documentation for details before tuning this parameter. + +## Advanced features and more API + +Please read examples/tutorial for more API and advanced features in KVDK. diff --git a/engine/hash_collection/hash_list.hpp b/engine/hash_collection/hash_list.hpp index f035ccae..033bf691 100644 --- a/engine/hash_collection/hash_list.hpp +++ b/engine/hash_collection/hash_list.hpp @@ -82,6 +82,17 @@ class HashList : public Collection { // Notice: the deleting key should already been locked by engine WriteResult Delete(const StringView& key, TimestampType timestamp); + // Modify value of "key" in the hash list + // + // Args: + // * modify_func: customized function to modify existing value of key. See + // definition of ModifyFunc (types.hpp) for more details. + // * modify_args: customized arguments of modify_func. + // + // Return: + // Status::Ok if modify success. + // Status::Abort if modify function abort modifying. + // Return other non-Ok status on any error. 
WriteResult Modify(const StringView key, ModifyFunc modify_func, void* modify_args, TimestampType timestamp); diff --git a/engine/kv_engine.cpp b/engine/kv_engine.cpp index d2220a86..cb7cc794 100644 --- a/engine/kv_engine.cpp +++ b/engine/kv_engine.cpp @@ -1226,10 +1226,10 @@ Status KVEngine::batchWriteRollbackLogs() { return Status::Ok; } -Status KVEngine::GetTTL(const StringView str, TTLType* ttl_time) { +Status KVEngine::GetTTL(const StringView key, TTLType* ttl_time) { *ttl_time = kInvalidTTL; - auto ul = hash_table_->AcquireLock(str); - auto res = lookupKey(str, ExpirableRecordType); + auto ul = hash_table_->AcquireLock(key); + auto res = lookupKey(key, ExpirableRecordType); if (res.s == Status::Ok) { ExpireTimeType expire_time; @@ -1266,7 +1266,7 @@ Status KVEngine::TypeOf(StringView key, ValueType* type) { if (res.s == Status::Ok) { switch (res.entry_ptr->GetIndexType()) { case PointerType::Skiplist: { - *type = ValueType::SortedSet; + *type = ValueType::SortedCollection; break; } case PointerType::List: { @@ -1274,7 +1274,7 @@ Status KVEngine::TypeOf(StringView key, ValueType* type) { break; } case PointerType::HashList: { - *type = ValueType::HashSet; + *type = ValueType::HashCollection; break; } case PointerType::StringRecord: { @@ -1289,7 +1289,7 @@ Status KVEngine::TypeOf(StringView key, ValueType* type) { return res.s == Status::Outdated ? 
Status::NotFound : res.s; } -Status KVEngine::Expire(const StringView str, TTLType ttl_time) { +Status KVEngine::Expire(const StringView key, TTLType ttl_time) { auto thread_holder = AcquireAccessThread(); int64_t base_time = TimeUtils::millisecond_time(); @@ -1298,10 +1298,10 @@ Status KVEngine::Expire(const StringView str, TTLType ttl_time) { } ExpireTimeType expired_time = TimeUtils::TTLToExpireTime(ttl_time, base_time); - auto ul = hash_table_->AcquireLock(str); + auto ul = hash_table_->AcquireLock(key); auto snapshot_holder = version_controller_.GetLocalSnapshotHolder(); // TODO: maybe have a wrapper function(lookupKeyAndMayClean). - auto lookup_result = lookupKey(str, ExpirableRecordType); + auto lookup_result = lookupKey(key, ExpirableRecordType); if (lookup_result.s == Status::Outdated) { return Status::NotFound; } @@ -1313,7 +1313,7 @@ Status KVEngine::Expire(const StringView str, TTLType ttl_time) { ul.unlock(); version_controller_.ReleaseLocalSnapshot(); lookup_result.s = Modify( - str, + key, [](const std::string* old_val, std::string* new_val, void*) { new_val->assign(*old_val); return ModifyOperation::Write; diff --git a/engine/kv_engine.hpp b/engine/kv_engine.hpp index c2d6ad8c..f577e96b 100644 --- a/engine/kv_engine.hpp +++ b/engine/kv_engine.hpp @@ -77,13 +77,13 @@ class KVEngine : public Engine { // 1. Expire assumes that str is not duplicated among all types, which is not // implemented yet // 2. 
Expire is not compatible with checkpoint for now - Status Expire(const StringView str, TTLType ttl_time) final; + Status Expire(const StringView key, TTLType ttl_time) final; // Get time to expire of str // // Notice: // Expire assumes that str is not duplicated among all types, which is not // implemented yet - Status GetTTL(const StringView str, TTLType* ttl_time) final; + Status GetTTL(const StringView key, TTLType* ttl_time) final; Status TypeOf(StringView key, ValueType* type) final; diff --git a/include/kvdk/configs.hpp b/include/kvdk/configs.hpp index 3d010591..0dca292c 100644 --- a/include/kvdk/configs.hpp +++ b/include/kvdk/configs.hpp @@ -19,9 +19,6 @@ enum class LogLevel : uint8_t { None, }; -// A snapshot indicates a immutable view of a KVDK engine at a certain time -struct Snapshot {}; - // Configs of created sorted collection // For correctness of encoding, please add new config field in the end of the // existing fields @@ -31,12 +28,14 @@ struct SortedCollectionConfigs { }; struct Configs { + // TODO: rename to concurrent internal threads + // // Max number of concurrent threads read/write the kvdk instance internally. 
- // Set it >= your CPU core number to get best performance + // Set it to the number of the hyper-threads to get best performance // // Notice: you can call KVDK API with any number of threads, but if your // parallel threads more than max_access_threads, the performance will be - // damaged due to synchronization cost + // degraded due to synchronization cost uint64_t max_access_threads = 64; // Size of PMem space to store KV data, this is not scalable in current diff --git a/include/kvdk/engine.hpp b/include/kvdk/engine.hpp index d408191b..8bc0c319 100644 --- a/include/kvdk/engine.hpp +++ b/include/kvdk/engine.hpp @@ -10,6 +10,7 @@ #include "comparator.hpp" #include "configs.hpp" #include "iterator.hpp" +#include "snapshot.hpp" #include "transaction.hpp" #include "types.hpp" #include "write_batch.hpp" @@ -22,7 +23,7 @@ class Engine { public: // Open a new KVDK instance or restore a existing KVDK instance // - // Para: + // Args: // * engine_path: indicates the dir path that persist the instance // * engine_ptr: store the pointer to restored instance // * configs: engine configs4 @@ -37,7 +38,7 @@ class Engine { // Restore a KVDK instance from a backup log file. // - // Para: + // Args: // * engine_path: indicates the dir path that persist the instance // * backup_log: the backup log file restored from // * engine_ptr: store the pointer to restored instance @@ -53,16 +54,43 @@ class Engine { const StringView backup_log, Engine** engine_ptr, const Configs& configs, FILE* log_file = stdout); + // Get type of key, it can be String, SortedCollection, HashCollection or List + // + // Return: + // Status::Ok and store type to "*type" on success + // Status::NotFound if key does not exist virtual Status TypeOf(StringView key, ValueType* type) = 0; - // Insert a STRING-type KV to set "key" to hold "value", return Ok on - // successful persistence, return non-Ok on any error. + // Insert a STRING-type KV to set "key" to hold "value". 
+ // + // Args: + // *options: customized write options + // + // Return: + // Status Ok on success . + // Status::WrongType if key exists but is a collection. + // Status::PMemOverflow/Status::MemoryOverflow if PMem/DRAM exhausted. virtual Status Put(const StringView key, const StringView value, const WriteOptions& options = WriteOptions()) = 0; + // Search the STRING-type KV of "key" in the kvdk instance. + // + // Return: + // Return Status::Ok and store the corresponding value to *value on success. + // Return Status::NotFound if the "key" does not exist. + virtual Status Get(const StringView key, std::string* value) = 0; + + // Remove STRING-type KV of "key". + // + // Return: + // Status::Ok on success or the "key" did not exist + // Status::WrongType if key exists but is a collection type + // Status::PMemOverflow if PMem exhausted + virtual Status Delete(const StringView key) = 0; + // Modify value of existing key in the engine // - // Para: + // Args: // * modify_func: customized function to modify existing value of key. See // definition of ModifyFunc (types.hpp) for more details. // * modify_args: customized arguments of modify_func. @@ -75,45 +103,77 @@ class Engine { void* modify_args, const WriteOptions& options = WriteOptions()) = 0; + // Atomically do a batch of operations (Put or Delete) to the instance, these + // operations either all succeed, or all fail. The data will be rollbacked if + // the instance crash during a batch write + // + // Return: + // Status::Ok on success + // Status::NotFound if a collection operated by the batch does not exist + // Status::PMemOverflow/Status::MemoryOverflow if PMem/DRAM exhausted + // + // Notice: + // BatchWrite has no isolation guaranteed, if you need it, you should use + // Transaction API virtual Status BatchWrite(std::unique_ptr const& batch) = 0; + // Create a write batch for BatchWrite operation virtual std::unique_ptr WriteBatchCreate() = 0; + // Start a transaction on the kvdk instance. 
+ // + // The transaction is implemented with pessimistic locking, so any operation + // may conflict with other access thread and compete for locks. A transaction + // operation will return Status::Timeout in this case to avoid deadlock. + // + // Return: + // Return a pointer to transaction struct for doing transactions. + // + // Notice: + // 1. A thread should not call normal write APIs while a transaction of + // it has not been committed or rolled back, otherwise the thread may deadlock + // as the transaction may be holding locks required by normal write operations. + // 2. Once the transaction is committed or rolled back, you can start next + // transaction on this struct. + // 3. Commit or Rollback the transaction as soon as possible to release locks + // it holds. virtual std::unique_ptr TransactionCreate() = 0; - // Search the STRING-type KV of "key" and store the corresponding value to - // *value on success. If the "key" does not exist, return NotFound. - virtual Status Get(const StringView key, std::string* value) = 0; - - // Search the STRING-type or Collection and store the corresponding expired + // Search the STRING-type or Collection and get the corresponding expired // time to *expired_time on success. - /* - * @param ttl_time. - * If the key is persist, ttl_time is INT64_MAX and Status::Ok; - * If the key is expired or does not exist, ttl_time is 0 and return - * Status::NotFound. - */ - virtual Status GetTTL(const StringView str, int64_t* ttl_time) = 0; - - /* Put the STRING-type or Collection type expired time. - * @param ttl_time is negetive or positive number. - * If ttl_time == INT64_MAX, the key is persistent; - * If ttl_time <=0, the key is expired immediately; - */ - virtual Status Expire(const StringView str, int64_t ttl_time) = 0; + // + // Args: + // * key: STRING-type key or collection name to search. + // * ttl_time: store TTL result. 
+ // + // Return: + // Status::Ok and store ttl time to *ttl_time on success + // Status::NotFound if key is expired or does not exist + virtual Status GetTTL(const StringView key, int64_t* ttl_time) = 0; - // Remove STRING-type KV of "key". - // Return Ok on success or the "key" did not exist, return non-Ok on any - // error. - virtual Status Delete(const StringView key) = 0; + // Set ttl_time for STRING-type or Collection type data + // + // Args: + // * key: STRING-type key or collection name to set ttl_time. + // * ttl_time: ttl time to set. if ttl_time == kPersistTTL, the name will not + // be expired. If ttl_time <=0, the name is expired immediately. + // + // Return: + // Status::Ok on success. + // Status::NotFound if key does not exist. + virtual Status Expire(const StringView key, int64_t ttl_time) = 0; - // Create a sorted collection with configs + // Create a empty sorted collection with configs. You should always create + // collection before you do any operations on it + // + // Args: + // * configs: customized config of creating collection + // // Return: // Status::Ok on success // Status::Existed if sorted collection already existed // Status::WrongType if collection existed but not a sorted collection - // Status::PMemOverflow if PMem exhausted - // Status::MemoryOverflow if DRAM exhausted + // Status::PMemOverflow/Status::MemoryOverflow if PMem/DRAM exhausted virtual Status SortedCreate( const StringView collection, const SortedCollectionConfigs& configs = SortedCollectionConfigs()) = 0; @@ -127,26 +187,35 @@ class Engine { // Get number of elements in a sorted collection // - // Return Ok on success, return NotFound if collection not exist + // Return: + // Status::Ok on success + // Status::NotFound if collection not exist virtual Status SortedSize(const StringView collection, size_t* size) = 0; - // Insert a SORTED-type KV to set "key" of sorted collection "collection" - // to hold "value", if "collection" not exist, it will be created, return 
- // Ok on successful persistence, return non-Ok on any error. + // Insert a KV to set "key" in sorted collection "collection" + // to hold "value" + // Return: + // Status::Ok on success. + // Status::NotFound if collection not exist. + // Status::WrongType if collection exists but is not a sorted collection. + // Status::PMemOverflow/Status::MemoryOverflow if PMem/DRAM exhausted. virtual Status SortedPut(const StringView collection, const StringView key, const StringView value) = 0; - // Search the SORTED-type KV of "key" in sorted collection "collection" - // and store the corresponding value to *value on success. If the - // "collection"/"key" did not exist, return NotFound. + // Search the KV of "key" in sorted collection "collection" + // + // Return: + // Status::Ok and store the corresponding value to *value on success. + // Status::NotFound If the "collection" or "key" does not exist. virtual Status SortedGet(const StringView collection, const StringView key, std::string* value) = 0; - // Remove SORTED-type KV of "key" in the sorted collection "collection". + // Remove KV of "key" in the sorted collection "collection". + // // Return: // Status::Ok on success or key not existed in collection // Status::NotFound if collection not exist - // Status::WrongType if collection exists but is not a sorted collection + // Status::WrongType if collection exists but is not a sorted collection. // Status::PMemOverflow if PMem exhausted. virtual Status SortedDelete(const StringView collection, const StringView key) = 0; @@ -155,10 +224,10 @@ class Engine { // Create an empty List. // Return: - // Status::WrongType if list name existed but is not a List. - // Status::Existed if a List named list already exists. - // Status::PMemOverflow if PMem exhausted. - // Status::Ok if successfully created the List. + // Status::WrongType if list name existed but is not a List. + // Status::Existed if a List named list already exists. + // Status::PMemOverflow if PMem exhausted. 
+ // Status::Ok if successfully created the List. virtual Status ListCreate(StringView list) = 0; // Destroy a List associated with key @@ -170,50 +239,50 @@ class Engine { // Total elements in List. // Return: - // Status::InvalidDataSize if list name is too long - // Status::WrongType if list name is not a List. - // Status::NotFound if list does not exist or has expired. - // Status::Ok and length of List if List exists. + // Status::InvalidDataSize if list name is too long + // Status::WrongType if list name is not a List. + // Status::NotFound if list does not exist or has expired. + // Status::Ok and length of List if List exists. virtual Status ListSize(StringView list, size_t* sz) = 0; // Push element as first element of List // Return: - // Status::InvalidDataSize if list name or elem is too long - // Status::WrongType if list name is not a List. - // Status::PMemOverflow if PMem exhausted. - // Status::Ok if operation succeeded. + // Status::InvalidDataSize if list name or elem is too long + // Status::WrongType if list name is not a List. + // Status::PMemOverflow if PMem exhausted. + // Status::Ok if operation succeeded. virtual Status ListPushFront(StringView list, StringView elem) = 0; // Push element as last element of List // Return: - // Status::InvalidDataSize if list name or elem is too long - // Status::WrongType if list name in the instance is not a List. - // Status::PMemOverflow if PMem exhausted. - // Status::Ok if operation succeeded. + // Status::InvalidDataSize if list name or elem is too long + // Status::WrongType if list name in the instance is not a List. + // Status::PMemOverflow if PMem exhausted. + // Status::Ok if operation succeeded. virtual Status ListPushBack(StringView list, StringView elem) = 0; // Pop first element of list // Return: - // Status::InvalidDataSize if list name is too long - // Status::WrongType if list is not a List. - // Status::NotFound if list does not exist or has expired. 
- // Status::Ok and element if operation succeeded. + // Status::InvalidDataSize if list name is too long + // Status::WrongType if list is not a List. + // Status::NotFound if list does not exist or has expired. + // Status::Ok and element if operation succeeded. virtual Status ListPopFront(StringView list, std::string* elem) = 0; // Pop last element of List // Return: - // Status::InvalidDataSize if list name is too long - // Status::WrongType if list is not a List. - // Status::NotFound if list does not exist or has expired. - // Status::Ok and element if operation succeeded. + // Status::InvalidDataSize if list name is too long + // Status::WrongType if list is not a List. + // Status::NotFound if list does not exist or has expired. + // Status::Ok and element if operation succeeded. virtual Status ListPopBack(StringView list, std::string* elem) = 0; // Push multiple elements to the front of List // Return: - // Status::InvalidDataSize if list name or elem is too long - // Status::WrongType if list is not a List. - // Status::PMemOverflow if PMem exhausted. - // Status::Ok if operation succeeded. + // Status::InvalidDataSize if list name or elem is too long + // Status::WrongType if list is not a List. + // Status::PMemOverflow if PMem exhausted. + // Status::Ok if operation succeeded. virtual Status ListBatchPushFront(StringView list, std::vector const& elems) = 0; virtual Status ListBatchPushFront(StringView list, @@ -221,10 +290,10 @@ class Engine { // Push multiple elements to the back of List // Return: - // Status::InvalidDataSize if list or elem is too long - // Status::WrongType if list name is not a List. - // Status::PMemOverflow if PMem exhausted. - // Status::Ok if operation succeeded. + // Status::InvalidDataSize if list or elem is too long + // Status::WrongType if list name is not a List. + // Status::PMemOverflow if PMem exhausted. + // Status::Ok if operation succeeded. 
virtual Status ListBatchPushBack(StringView list, std::vector const& elems) = 0; virtual Status ListBatchPushBack(StringView list, @@ -232,19 +301,19 @@ class Engine { // Pop first N element of List // Return: - // Status::InvalidDataSize if list is too long - // Status::WrongType if list is not a List. - // Status::NotFound if list does not exist or has expired. - // Status::Ok and element if operation succeeded. + // Status::InvalidDataSize if list is too long + // Status::WrongType if list is not a List. + // Status::NotFound if list does not exist or has expired. + // Status::Ok and element if operation succeeded. virtual Status ListBatchPopFront(StringView list, size_t n, std::vector* elems) = 0; // Pop last N element of List // Return: - // Status::InvalidDataSize if list is too long - // Status::WrongType if list is not a List. - // Status::NotFound if list does not exist or has expired. - // Status::Ok and element if operation succeeded. + // Status::InvalidDataSize if list is too long + // Status::WrongType if list is not a List. + // Status::NotFound if list does not exist or has expired. + // Status::Ok and element if operation succeeded. virtual Status ListBatchPopBack(StringView list, size_t n, std::vector* elems) = 0; @@ -252,9 +321,9 @@ class Engine { // src_pos and dst_pos can only be 0, indicating List front, // or -1, indicating List back. // Return: - // Status::WrongType if src or dst is not a List. - // Status::NotFound if list or element not exist - // Status::Ok and moved element if operation succeeded. + // Status::WrongType if src or dst is not a List. + // Status::NotFound if list or element not exist + // Status::Ok and moved element if operation succeeded. 
virtual Status ListMove(StringView src_list, ListPos src_pos, StringView dst_list, ListPos dst_pos, std::string* elem) = 0; @@ -262,49 +331,58 @@ class Engine { // Insert a element to a list at index, the index can be positive or // negative // Return: - // Status::InvalidDataSize if list name is too large - // Status::WrongType if collection is not a list - // Status::NotFound if collection not found or index is beyond list size - // Status::Ok if operation succeeded + // Status::InvalidDataSize if list name is too large + // Status::WrongType if collection is not a list + // Status::NotFound if collection not found or index is beyond list size + // Status::Ok if operation succeeded virtual Status ListInsertAt(StringView list, StringView elem, long index) = 0; // Insert an element before element "pos" in list "collection" // Return: - // Status::InvalidDataSize if elem is too long. - // Status::PMemOverflow if PMem exhausted. - // Status::NotFound if List of the ListIterator has expired or been - // deleted, or "pos" not exist in the list - // Status::Ok if operation succeeded. + // Status::InvalidDataSize if elem is too long. + // Status::PMemOverflow if PMem exhausted. + // Status::NotFound if List of the ListIterator has expired or been + // deleted, or "pos" not exist in the list + // Status::Ok if operation succeeded. virtual Status ListInsertBefore(StringView list, StringView elem, StringView pos) = 0; // Insert an element after element "pos" in list "collection" // Return: - // Status::InvalidDataSize if elem is too long. - // Status::PMemOverflow if PMem exhausted. - // Status::NotFound if List of the ListIterator has expired or been - // deleted, or "pos" not exist in the list - // Status::Ok if operation succeeded. + // Status::InvalidDataSize if elem is too long. + // Status::PMemOverflow if PMem exhausted. 
+ // Status::NotFound if List of the ListIterator has expired or been + // deleted, or "pos" not exist in the list + // Status::Ok if operation succeeded. virtual Status ListInsertAfter(StringView list, StringView elem, StringView pos) = 0; // Remove the element at index // Return: - // Status::NotFound if the index beyond list size. - // Status::Ok if operation succeeded, and store removed elem in "elem" + // Status::NotFound if the index beyond list size. + // Status::Ok if operation succeeded, and store removed elem in "elem" virtual Status ListErase(StringView list, long index, std::string* elem) = 0; // Replace the element at index // Return: - // Status::InvalidDataSize if elem is too long - // Status::NotFound if if the index beyond list size. - // Status::Ok if operation succeeded. + // Status::InvalidDataSize if elem is too long + // Status::NotFound if if the index beyond list size. + // Status::Ok if operation succeeded. virtual Status ListReplace(StringView list, long index, StringView elem) = 0; // Create a KV iterator on list "list", which is able to iterate all elems in - // the list at "snapshot" version, if snapshot is nullptr, then a - // internal snapshot will be created at current version and the iterator will - // be created on it + // the list + // + // Args: + // * snapshot: iterator will iterate all elems a t "snapshot" version, if + // snapshot is nullptr, then a internal snapshot will be created at current + // version and the iterator will be created on it + // * status: store operation status if not null + // + // Return: + // Return A pointer to iterator on success. + // Return nullptr if list not exist or any other errors, and store error + // status to "status" // // Notice: // 1. 
Iterator will be invalid after the passed snapshot is released @@ -313,20 +391,76 @@ class Engine { virtual ListIterator* ListIteratorCreate(StringView list, Snapshot* snapshot = nullptr, Status* status = nullptr) = 0; + // Release a ListIterator and its holding resources virtual void ListIteratorRelease(ListIterator*) = 0; /// Hash APIs /////////////////////////////////////////////////////////////// + // Create a empty hash collection. You should always create collection before + // you do any operations on it + // + // Return: + // Status::Ok on success + // Status::Existed if hash collection already existed + // Status::WrongType if collection existed but not a hash collection + // Status::PMemOverflow/Status::MemoryOverflow if PMem/DRAM exhausted virtual Status HashCreate(StringView collection) = 0; + + // Destroy a hash collection + // Return: + // Status::Ok on success + // Status::WrongType if collection existed but not a hash collection + // Status::PMemOverflow if PMem exhausted virtual Status HashDestroy(StringView collection) = 0; + + // Get number of elements in a hash collection + // + // Return: + // Status::Ok on success + // Status::NotFound if collection not exist virtual Status HashSize(StringView collection, size_t* len) = 0; + + // Search the KV of "key" in hash collection "collection" + // + // Return: + // Status::Ok and store the corresponding value to *value on success. + // Status::NotFound If the "collection" or "key" does not exist. virtual Status HashGet(StringView collection, StringView key, std::string* value) = 0; + + // Insert a KV to set "key" in hash collection "collection" + // to hold "value" + // Return: + // Status::Ok on success. + // Status::NotFound if collection not exist. + // Status::WrongType if collection exists but is not a hash collection. + // Status::PMemOverflow/Status::MemoryOverflow if PMem/DRAM exhausted. 
virtual Status HashPut(StringView collection, StringView key, StringView value) = 0; + + // Remove KV of "key" in the hash collection "collection". + // + // Return: + // Status::Ok on success or key not existed in collection + // Status::NotFound if collection not exist + // Status::WrongType if collection exists but is not a hash collection. + // Status::PMemOverflow if PMem exhausted. virtual Status HashDelete(StringView collection, StringView key) = 0; + + // Modify value of a existing key in a hash collection + // + // Args: + // * modify_func: customized function to modify existing value of key. See + // definition of ModifyFunc (types.hpp) for more details. + // * modify_args: customized arguments of modify_func. + // + // Return: + // Return Status::Ok if modify success. + // Return Status::Abort if modify function abort modifying. + // Return other non-Ok status on any error. virtual Status HashModify(StringView collection, StringView key, ModifyFunc modify_func, void* cb_args) = 0; + // Create a KV iterator on hash collection "collection", which is able to // iterate all elems in the collection at "snapshot" version, if snapshot is // nullptr, then a internal snapshot will be created at current version and @@ -362,9 +496,18 @@ class Engine { virtual void ReleaseSnapshot(const Snapshot*) = 0; // Create a KV iterator on sorted collection "collection", which is able to - // sequentially iterate all KVs in the "collection" at "snapshot" version, if + // sequentially iterate all KVs in the "collection". + // + // Args: + // * snapshot: iterator will iterate all elems a t "snapshot" version, if // snapshot is nullptr, then a internal snapshot will be created at current // version and the iterator will be created on it + // * status: store operation status if not null + // + // Return: + // Return A pointer to iterator on success. + // Return nullptr if collection not exist or any other errors, and store error + // status to "status" // // Notice: // 1. 
Iterator will be invalid after the passed snapshot is released @@ -374,13 +517,14 @@ Snapshot* snapshot = nullptr, Status* s = nullptr) = 0; - // Release a sorted iterator + // Release a sorted iterator and its holding resources virtual void SortedIteratorRelease(SortedIterator*) = 0; // Register a customized comparator to the engine on runtime // - // Return true on success, return false if a comparator of comparator_name - // already existed + // Return: + // Return true on success + // Return false if a comparator of comparator_name already existed virtual bool registerComparator(const StringView& comparator_name, Comparator) = 0; diff --git a/include/kvdk/snapshot.hpp b/include/kvdk/snapshot.hpp new file mode 100644 index 00000000..91b29622 --- /dev/null +++ b/include/kvdk/snapshot.hpp @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2021 Intel Corporation + */ + +#pragma once + +namespace KVDK_NAMESPACE { + +// A snapshot indicates an immutable view of a KVDK engine at a certain time +struct Snapshot {}; + +} // namespace KVDK_NAMESPACE \ No newline at end of file diff --git a/include/kvdk/transaction.hpp b/include/kvdk/transaction.hpp index 05cbf739..9a02978c 100644 --- a/include/kvdk/transaction.hpp +++ b/include/kvdk/transaction.hpp @@ -9,28 +9,98 @@ #include "types.hpp" namespace KVDK_NAMESPACE { +// This struct is used to do transaction operations. 
A transaction struct is +// associated with a kvdk instance class Transaction { public: - // TODO: use StringView instead of std::string + // Put a STRING-type KV to the transaction + // + // Return: + // Status::Ok on success + // Status::Timeout on conflict and long-time lock contention virtual Status StringPut(const StringView key, const StringView value) = 0; + // Delete a STRING-type key to the transaction + // + // Return: + // Status::Ok on success + // Status::Timeout on conflict and long-time lock contention virtual Status StringDelete(const StringView key) = 0; + // Get value of a STRING-type KV. It will first get from the transaction + // operations (Put/Delete), then the kvdk instance of the transaction + // + // Return: + // Status::Ok on success and store value to "*value" + // Status::NotFound if key does not exist or is deleted by this transaction + // Status::Timeout on conflict and long-time lock contention virtual Status StringGet(const StringView key, std::string* value) = 0; + // Put a KV of sorted collection to the transaction + // + // Return: + // Status::Ok on success + // Status::NotFound if collection does not exist + // Status::Timeout on conflict and long-time lock contention virtual Status SortedPut(const StringView collection, const StringView key, const StringView value) = 0; + // Delete a KV from sorted collection to the transaction. + // + // Return: + // Status::Ok on success + // Status::NotFound if collection does not exist + // Status::Timeout on conflict and long-time lock contention virtual Status SortedDelete(const StringView collection, const StringView key) = 0; + // Get value of a KV from sorted collection. 
It will first get from the + // transaction operations (Put/Delete), then the kvdk instance of the + // transaction + // + // Return: + // Status::Ok and store value to "*value" on success + // Status::NotFound if collection or key does not exist + // Status::Timeout on conflict and long-time lock contention virtual Status SortedGet(const StringView collection, const StringView key, std::string* value) = 0; + // Put a KV of hash collection to the transaction + // + // Return: + // Status::Ok on success + // Status::NotFound if collection does not exist + // Status::Timeout on conflict and long-time lock contention virtual Status HashPut(const StringView collection, const StringView key, const StringView value) = 0; + // Delete a KV from hash collection to the transaction + // + // Return: + // Status::Ok on success + // Status::NotFound if collection does not exist + // Status::Timeout on conflict and long-time lock contention virtual Status HashDelete(const StringView collection, const StringView key) = 0; + + // Get value of a KV from hash collection. 
It will first get from the + // transaction operations (Put/Delete), then the kvdk instance of the + // transaction + // + // Return: + // Status::Ok and store value to "*value" on success + // Status::NotFound if collection or key does not exist + // Status::Timeout on conflict and long-time lock contention virtual Status HashGet(const StringView collection, const StringView key, std::string* value) = 0; + // Commit all operations of the transaction to the kvdk instance, and + // release all locks it holds + // + // Return: + // Status::Ok on success, all operations will be persistent on the instance + // Status::PMemOverflow/Status::MemoryOverflow if PMem/DRAM exhausted virtual Status Commit() = 0; + + // Rollback all operations of the transaction, release locks it holds virtual void Rollback() = 0; + + // Return status of the last transaction operation virtual Status InternalStatus() = 0; + virtual ~Transaction() = default; }; } // namespace KVDK_NAMESPACE \ No newline at end of file diff --git a/include/kvdk/types.h b/include/kvdk/types.h index dfa97b8a..f7a1b637 100644 --- a/include/kvdk/types.h +++ b/include/kvdk/types.h @@ -43,8 +43,8 @@ typedef void (*KVDKFreeFunc)(void*); #define KVDK_TYPES(GEN) \ GEN(String) \ - GEN(SortedSet) \ - GEN(HashSet) \ + GEN(SortedCollection) \ + GEN(HashCollection) \ GEN(List) typedef enum { KVDK_TYPES(GENERATE_ENUM) } KVDKValueType; diff --git a/tests/tests.cpp b/tests/tests.cpp index 2fe32c97..7f25c086 100644 --- a/tests/tests.cpp +++ b/tests/tests.cpp @@ -473,8 +473,8 @@ TEST_F(EngineBasicTest, TypeOfKey) { ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), Status::Ok); std::unordered_map key_types; - for (auto type : {ValueType::String, ValueType::HashSet, ValueType::List, - ValueType::SortedSet}) { + for (auto type : {ValueType::String, ValueType::HashCollection, + ValueType::List, ValueType::SortedCollection}) { std::string key = KVDKValueTypeString[type]; key_types[key] = type; ValueType type_resp; @@ 
-483,7 +483,7 @@ TEST_F(EngineBasicTest, TypeOfKey) { ASSERT_EQ(engine->Put(key, ""), Status::Ok); break; } - case ValueType::HashSet: { + case ValueType::HashCollection: { ASSERT_EQ(engine->HashCreate(key), Status::Ok); break; } @@ -491,7 +491,7 @@ TEST_F(EngineBasicTest, TypeOfKey) { ASSERT_EQ(engine->ListCreate(key), Status::Ok); break; } - case ValueType::SortedSet: { + case ValueType::SortedCollection: { ASSERT_EQ(engine->SortedCreate(key), Status::Ok); break; } From d6b630bf98acfdc530af031e5a6992f9b1afc575 Mon Sep 17 00:00:00 2001 From: Jiayu Wu Date: Fri, 30 Sep 2022 13:50:00 +0800 Subject: [PATCH 16/23] Release v1.0 (#347) Signed-off-by: Jiayu Wu --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 37cec6f0..decdead5 100644 --- a/README.md +++ b/README.md @@ -5,19 +5,18 @@ `KVDK` (Key-Value Development Kit) is a key-value store library implemented in C++ language. It is designed for supporting DRAM, Optane persistent memory and CXL memory pool. It also demonstrates several optimization methods for high performance with tiered memory. Besides providing the basic APIs of key-value store, it offers several advanced features, like read-modify-write, checkpoint, etc. ## Features +* Rich data types + * string, sorted, hash, list, hash * Basic KV operations - * string get/set/update/delete -* Sorted KV operations - * sorted string get/set/update/scan/delete -* Rich value types - * list, hash + * get/put/update/delete/scan * Read-Modify-Write * Support TTL * Atomic Batch Write * Snapshot based Scan * Consistent Dump & Restore to/from storage +* Consistent Checkpoint +* Transaction * C/C++/Java APIs -* Support Transaction (coming soon) # Limitations * The maximum supported key-value size is 64KB-4GB. From b8f3fd371d189320feb8163a9c614a8e7cdb1aab Mon Sep 17 00:00:00 2001 From: ZiyanShi Date: Tue, 18 Oct 2022 15:56:15 +0800 Subject: [PATCH 17/23] ... 
Signed-off-by: ZiyanShi --- scripts/run_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_benchmark.py b/scripts/run_benchmark.py index 18bce9b8..7c601b8e 100644 --- a/scripts/run_benchmark.py +++ b/scripts/run_benchmark.py @@ -7,7 +7,7 @@ bin = "../build/bench" exec = "numactl --cpunodebind={0} --membind={0} {1}".format(numanode, bin) -num_thread = 48 +num_thread = 64 value_sizes = [24] # constant: value size always be "value_size", # random: value size uniformly distributed in [1, value_size] From 09577c812b98fd8ed6866edc60db5562c28a1d0c Mon Sep 17 00:00:00 2001 From: ZiyanShi Date: Wed, 19 Oct 2022 09:40:09 +0800 Subject: [PATCH 18/23] bench fill Signed-off-by: ZiyanShi --- benchmark/bench.cpp | 4 +++- scripts/benchmark_impl.py | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/bench.cpp b/benchmark/bench.cpp index 186cbb13..666b05fc 100644 --- a/benchmark/bench.cpp +++ b/benchmark/bench.cpp @@ -645,7 +645,9 @@ int main(int argc, char** argv) { throw std::runtime_error{"Fail to create VHash"}; } } - LaunchNThreads(FLAGS_threads, FillVHash); + if (!FLAGS_fill) { + LaunchNThreads(FLAGS_threads, FillVHash); + } #else throw std::runtime_error{"VHash not supported!"}; #endif diff --git a/scripts/benchmark_impl.py b/scripts/benchmark_impl.py index 2a5e6933..e9ddcb72 100644 --- a/scripts/benchmark_impl.py +++ b/scripts/benchmark_impl.py @@ -9,8 +9,6 @@ def __fill(exec, shared_para, data_type, report_path): - if data_type == "vhash": - return new_para = shared_para + " -fill=1 -type={}".format(data_type) report = report_path + "_fill" print("Fill {}".format(data_type)) From c7efd9758a248bf0a850514394a65dbd7a4e49e2 Mon Sep 17 00:00:00 2001 From: ZiyanShi Date: Wed, 19 Oct 2022 09:49:29 +0800 Subject: [PATCH 19/23] Add test Signed-off-by: ZiyanShi --- tests/tests.cpp | 89 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/tests/tests.cpp b/tests/tests.cpp 
index f9be5ae5..3338c5f7 100644 --- a/tests/tests.cpp +++ b/tests/tests.cpp @@ -1935,6 +1935,95 @@ TEST_F(EngineBasicTest, TestHash) { delete engine; } +TEST_F(EngineBasicTest, TestVHash) { + size_t num_threads = 1; + size_t count = 1000; + configs.max_access_threads = num_threads + 1; + ASSERT_EQ(Engine::Open(db_path.c_str(), &engine, configs, stdout), + Status::Ok); + std::string key{"VHash"}; + ASSERT_EQ(engine->VHashCreate(key), Status::Ok); + ASSERT_EQ(engine->VHashDestroy(key), Status::Ok); + ASSERT_EQ(engine->VHashCreate(key), Status::Ok); + using umap = std::unordered_map; + std::vector local_copies(num_threads); + std::mutex mu; + + auto VPut = [&](size_t tid) { + umap& local_copy = local_copies[tid]; + for (size_t j = 0; j < count; j++) { + std::string field{std::to_string(tid) + "_" + GetRandomString(10)}; + std::string value{GetRandomString(120)}; + ASSERT_EQ(engine->VHashPut(key, field, value), Status::Ok); + local_copy[field] = value; + } + }; + + auto VGet = [&](size_t tid) { + umap const& local_copy = local_copies[tid]; + for (auto const& kv : local_copy) { + std::string resp; + ASSERT_EQ(engine->VHashGet(key, kv.first, &resp), Status::Ok); + ASSERT_EQ(resp, kv.second) << "Field:\t" << kv.first << "\n"; + } + }; + + auto VDelete = [&](size_t tid) { + umap& local_copy = local_copies[tid]; + std::string sink; + for (size_t i = 0; i < count / 2; i++) { + auto iter = local_copy.begin(); + ASSERT_EQ(engine->VHashDelete(key, iter->first), Status::Ok); + ASSERT_EQ(engine->VHashGet(key, iter->first, &sink), Status::NotFound); + local_copy.erase(iter); + } + }; + + auto VSize = [&](size_t) { + size_t len = 0; + ASSERT_EQ(engine->VHashSize(key, &len), Status::Ok); + size_t cnt = 0; + for (size_t tid = 0; tid < num_threads; tid++) { + cnt += local_copies[tid].size(); + } + ASSERT_EQ(len, cnt); + }; + + auto VIterate = [&](size_t tid) { + umap combined; + for (size_t tid = 0; tid < num_threads; tid++) { + umap const& local_copy = local_copies[tid]; + for (auto 
const& kv : local_copy) { + combined[kv.first] = kv.second; + } + } + + auto iter = engine->VHashIteratorCreate(key); + + ASSERT_NE(iter, nullptr); + size_t cnt = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++cnt; + ASSERT_EQ(combined[iter->Key()], iter->Value()); + } + ASSERT_EQ(cnt, combined.size()); + }; + + for (size_t i = 0; i < 3; i++) { + LaunchNThreads(num_threads, VPut); + LaunchNThreads(num_threads, VGet); + LaunchNThreads(num_threads, VDelete); + LaunchNThreads(num_threads, VIterate); + LaunchNThreads(num_threads, VSize); + LaunchNThreads(num_threads, VPut); + LaunchNThreads(num_threads, VGet); + LaunchNThreads(num_threads, VDelete); + LaunchNThreads(num_threads, VIterate); + LaunchNThreads(num_threads, VSize); + } + delete engine; +} + TEST_F(EngineBasicTest, TestStringHotspot) { size_t n_thread_reading = 16; size_t n_thread_writing = 16; From 1cbf20c5e408c5326784607e477c81e05ec8a290 Mon Sep 17 00:00:00 2001 From: ZiyanShi Date: Wed, 19 Oct 2022 12:36:55 +0800 Subject: [PATCH 20/23] resolve logical conflicts Signed-off-by: ZiyanShi --- engine/kv_engine_vhash.cpp | 43 +++++++++++--------------- engine/version/old_records_cleaner.cpp | 4 +-- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/engine/kv_engine_vhash.cpp b/engine/kv_engine_vhash.cpp index 733cf6bf..f49775f4 100644 --- a/engine/kv_engine_vhash.cpp +++ b/engine/kv_engine_vhash.cpp @@ -4,34 +4,29 @@ namespace KVDK_NAMESPACE { Status KVEngine::VHashCreate(StringView key, size_t capacity) KVDK_TRY { - Status s = maybeInitAccessThread(); - defer(ReleaseAccessThread()); - if (s != Status::Ok) return s; if (!checkKeySize(key)) return Status::InvalidDataSize; + auto thread_access = AcquireAccessThread(); return vhashes_.Create(key, capacity) ? 
Status::Ok : Status::Existed; } KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashDestroy(StringView key) KVDK_TRY { - Status s = maybeInitAccessThread(); - defer(ReleaseAccessThread()); - if (s != Status::Ok) return s; if (!checkKeySize(key)) return Status::InvalidDataSize; + auto thread_access = AcquireAccessThread(); return vhashes_.Destroy(key) ? Status::Ok : Status::NotFound; } KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashSize(StringView key, size_t* len) KVDK_TRY { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) return s; if (!checkKeySize(key)) return Status::InvalidDataSize; + auto thread_access = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = vhashes_.Get(key); if (vhash == nullptr) return Status::NotFound; - *len = vhash->Size(); return Status::Ok; } @@ -39,34 +34,32 @@ KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashGet(StringView key, StringView field, std::string* value) KVDK_TRY { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) return s; if (!checkKeySize(key) || !checkKeySize(field)) return Status::InvalidDataSize; + auto thread_access = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = vhashes_.Get(key); if (vhash == nullptr) return Status::NotFound; StringView val; if (!vhash->Get(field, val)) return Status::NotFound; - value->assign(val.data(), val.size()); - return Status::Ok; } KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashPut(StringView key, StringView field, StringView value) KVDK_TRY { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) return s; if (!checkKeySize(key) || !checkKeySize(field) || !checkValueSize(value)) return Status::InvalidDataSize; + auto thread_access = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); + VHash* vhash = vhashes_.Get(key); - if (vhash == nullptr) return s; + if (vhash == nullptr) return Status::NotFound; vhash->Put(field, value); return Status::Ok; @@ 
-74,15 +67,14 @@ Status KVEngine::VHashPut(StringView key, StringView field, KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashDelete(StringView key, StringView field) KVDK_TRY { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) return s; if (!checkKeySize(key) || !checkKeySize(field)) return Status::InvalidDataSize; + auto thread_access = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); - VHash* vhash = vhashes_.Get(key); - if (vhash == nullptr) return s; + VHash* vhash = vhashes_.Get(key); + if (vhash == nullptr) return Status::NotFound; vhash->Delete(field); return Status::Ok; } @@ -90,8 +82,6 @@ KVDK_HANDLE_EXCEPTIONS Status KVEngine::VHashModify(StringView key, StringView field, ModifyFunc modify_func, void* cb_args) KVDK_TRY { - Status s = maybeInitAccessThread(); - if (s != Status::Ok) return s; if (!checkKeySize(key) || !checkKeySize(field)) return Status::InvalidDataSize; @@ -108,9 +98,10 @@ Status KVEngine::VHashModify(StringView key, StringView field, auto cleanup = [&](StringView) { return; }; + auto thread_access = AcquireAccessThread(); auto token = version_controller_.GetLocalSnapshotHolder(); VHash* vhash = vhashes_.Get(key); - if (vhash == nullptr) return s; + if (vhash == nullptr) return Status::NotFound; return (vhash->Modify(field, modify, cb_args, cleanup)) ? Status::Ok : Status::Abort; @@ -121,16 +112,18 @@ std::unique_ptr KVEngine::VHashIteratorCreate(StringView key, Status* s) try { Status sink; s = (s != nullptr) ? s : &sink; + *s = Status::NotFound; - *s = maybeInitAccessThread(); - if (*s != Status::Ok) return nullptr; if (!checkKeySize(key)) return nullptr; /// TODO: iterator should hold an access token to keep VHash valid. 
auto token = version_controller_.GetLocalSnapshotHolder(); + auto thread_access = AcquireAccessThread(); + VHash* vhash = vhashes_.Get(key); if (vhash == nullptr) return nullptr; + *s = Status::Ok; return vhash->MakeIterator(); } catch (std::exception const& ex) { Status sink; diff --git a/engine/version/old_records_cleaner.cpp b/engine/version/old_records_cleaner.cpp index 467b1eea..849cc749 100644 --- a/engine/version/old_records_cleaner.cpp +++ b/engine/version/old_records_cleaner.cpp @@ -18,8 +18,8 @@ void OldRecordsCleaner::RegisterDelayDeleter(IDeleter& deleter) { } void OldRecordsCleaner::DelayDelete(IDeleter& deleter, void* obj) { - kvdk_assert(access_thread.id >= 0, ""); - auto& tc = cleaner_thread_cache_[access_thread.id]; + kvdk_assert(this_thread.id >= 0, ""); + auto& tc = cleaner_thread_cache_[this_thread.id]; std::lock_guard guard{tc.old_records_lock}; TimestampType ts = kv_engine_->version_controller_.GetCurrentTimestamp(); From 96f2691e0fcb3e8e4dec8eaa0610a00b0dd239ed Mon Sep 17 00:00:00 2001 From: ZiyanShi Date: Wed, 19 Oct 2022 12:57:16 +0800 Subject: [PATCH 21/23] fix warnings Signed-off-by: ZiyanShi --- benchmark/bench.cpp | 3 ++- engine/experimental/vhash_group.hpp | 2 +- tests/tests.cpp | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmark/bench.cpp b/benchmark/bench.cpp index 9d78b99d..ba891cd0 100644 --- a/benchmark/bench.cpp +++ b/benchmark/bench.cpp @@ -367,6 +367,7 @@ void DBScan(int tid) { operations_counted = operations; } } + break; } case DataType::Blackhole: { operations += 1024; @@ -548,7 +549,7 @@ void ProcessBenchmarkConfigs() { if (bench_data_type == DataType::VHash) { // Vhash needs fill for read and update benchmarks operations_per_thread = FLAGS_num_kv / FLAGS_max_access_threads + 1; - for (int i = 0; i < FLAGS_max_access_threads; i++) { + for (size_t i = 0; i < FLAGS_max_access_threads; i++) { ranges.emplace_back(i * operations_per_thread, (i + 1) * operations_per_thread); } diff --git 
a/engine/experimental/vhash_group.hpp b/engine/experimental/vhash_group.hpp index dc6e78d7..80a38491 100644 --- a/engine/experimental/vhash_group.hpp +++ b/engine/experimental/vhash_group.hpp @@ -12,8 +12,8 @@ namespace KVDK_NAMESPACE { /// TODO: Add hpmap_alloc to allocate memory for hashptr_maps. class VHashGroup { private: - OldRecordsCleaner& cleaner; IVolatileAllocator& kv_alloc; + OldRecordsCleaner& cleaner; VHashKVBuilder kvb{kv_alloc, cleaner}; VHashBuilder vhb{cleaner}; diff --git a/tests/tests.cpp b/tests/tests.cpp index 62a8f8a3..0422ad57 100644 --- a/tests/tests.cpp +++ b/tests/tests.cpp @@ -1950,7 +1950,7 @@ TEST_F(EngineBasicTest, TestVHash) { ASSERT_EQ(len, cnt); }; - auto VIterate = [&](size_t tid) { + auto VIterate = [&](size_t) { umap combined; for (size_t tid = 0; tid < num_threads; tid++) { umap const& local_copy = local_copies[tid]; From 4bc822c4b21c53fb0e7f4c17fb47a2e9cc218383 Mon Sep 17 00:00:00 2001 From: ZiyanShi Date: Wed, 19 Oct 2022 15:27:20 +0800 Subject: [PATCH 22/23] Fix memory leak during shutdown. Signed-off-by: ZiyanShi --- engine/version/old_records_cleaner.cpp | 17 +++++++++++++++++ engine/version/old_records_cleaner.hpp | 2 ++ 2 files changed, 19 insertions(+) diff --git a/engine/version/old_records_cleaner.cpp b/engine/version/old_records_cleaner.cpp index 849cc749..c677ee0e 100644 --- a/engine/version/old_records_cleaner.cpp +++ b/engine/version/old_records_cleaner.cpp @@ -9,6 +9,23 @@ namespace KVDK_NAMESPACE { +OldRecordsCleaner::~OldRecordsCleaner() { + // Clean twice to ensure all resources are released. 
+ TryGlobalClean(); + TryGlobalClean(); + for (size_t i = 0; i < cleaner_thread_cache_.size(); i++) { + auto& tc = cleaner_thread_cache_[i]; + std::lock_guard lg(tc.old_records_lock); + if (!tc.pending_free_space_entries.empty()) + GlobalLogger.Error("PendingFree leaked!"); + for (auto const& q : tc.local_queues_) { + if (!q.empty()) GlobalLogger.Error("DRAM leaked!"); + } + } + if (!global_pending_free_space_entries_.empty()) + GlobalLogger.Error("PendingFree leaked!"); +} + void OldRecordsCleaner::RegisterDelayDeleter(IDeleter& deleter) { size_t idx = global_queues_.size(); delay_deleters_[&deleter] = idx; diff --git a/engine/version/old_records_cleaner.hpp b/engine/version/old_records_cleaner.hpp index d4aaeda3..5d832532 100644 --- a/engine/version/old_records_cleaner.hpp +++ b/engine/version/old_records_cleaner.hpp @@ -39,6 +39,8 @@ class OldRecordsCleaner { assert(kv_engine_ != nullptr); } + ~OldRecordsCleaner(); + // Warning: not thread safe. Must be called during kv_engine initialization. void RegisterDelayDeleter(IDeleter& deleter); From 80695c25b681c73602819b92fa356f464d3f5599 Mon Sep 17 00:00:00 2001 From: ZiyanShi Date: Wed, 19 Oct 2022 16:15:33 +0800 Subject: [PATCH 23/23] bug fix Signed-off-by: ZiyanShi --- engine/kv_engine_vhash.cpp | 2 +- engine/version/old_records_cleaner.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/engine/kv_engine_vhash.cpp b/engine/kv_engine_vhash.cpp index f49775f4..52a977ee 100644 --- a/engine/kv_engine_vhash.cpp +++ b/engine/kv_engine_vhash.cpp @@ -116,9 +116,9 @@ std::unique_ptr KVEngine::VHashIteratorCreate(StringView key, if (!checkKeySize(key)) return nullptr; + auto thread_access = AcquireAccessThread(); /// TODO: iterator should hold an access token to keep VHash valid. 
auto token = version_controller_.GetLocalSnapshotHolder(); - auto thread_access = AcquireAccessThread(); VHash* vhash = vhashes_.Get(key); if (vhash == nullptr) return nullptr; diff --git a/engine/version/old_records_cleaner.cpp b/engine/version/old_records_cleaner.cpp index c677ee0e..7dc9d25d 100644 --- a/engine/version/old_records_cleaner.cpp +++ b/engine/version/old_records_cleaner.cpp @@ -36,7 +36,8 @@ void OldRecordsCleaner::RegisterDelayDeleter(IDeleter& deleter) { void OldRecordsCleaner::DelayDelete(IDeleter& deleter, void* obj) { kvdk_assert(this_thread.id >= 0, ""); - auto& tc = cleaner_thread_cache_[this_thread.id]; + auto& tc = cleaner_thread_cache_[ThreadManager::ThreadID() % + cleaner_thread_cache_.size()]; std::lock_guard guard{tc.old_records_lock}; TimestampType ts = kv_engine_->version_controller_.GetCurrentTimestamp();