From db050c6e647420b85be2a98747e0d70166da7f05 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 3 Jul 2024 22:58:30 -0700 Subject: [PATCH] Improve cstring cache internals (#4780) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Switch to abseil node_hash_set for cstring internal hash. Reduce number of lookups while there. Signed-off-by: Anton Korobeynikov * Import get_cached method Signed-off-by: Anton Korobeynikov * Usual "unbreak Bazel" commit Signed-off-by: Anton Korobeynikov * Apply suggestions from code review Co-authored-by: Vladimír Štill Signed-off-by: Anton Korobeynikov --------- Signed-off-by: Anton Korobeynikov Co-authored-by: Vladimír Štill --- BUILD.bazel | 2 ++ lib/cstring.cpp | 60 ++++++++++++++++++++++++++++-------------- lib/cstring.h | 5 ++++ test/gtest/cstring.cpp | 15 +++++++++++ 4 files changed, 62 insertions(+), 20 deletions(-) diff --git a/BUILD.bazel b/BUILD.bazel index f991520687..fc8d54f82d 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -51,6 +51,8 @@ cc_library( ":config_h", "@boost//:format", "@boost//:multiprecision", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:node_hash_set", "@com_google_absl//absl/numeric:bits", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", diff --git a/lib/cstring.cpp b/lib/cstring.cpp index 2d22337e6d..17c7cbd11a 100644 --- a/lib/cstring.cpp +++ b/lib/cstring.cpp @@ -16,6 +16,7 @@ limitations under the License. #include "cstring.h" +#include "absl/container/node_hash_set.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_replace.h" @@ -135,45 +136,64 @@ class table_entry { return length() == other.length() && std::memcmp(string(), other.string(), length()) == 0; } + bool operator==(std::string_view other) const { + return length() == other.length() && std::memcmp(string(), other.data(), length()) == 0; + } + private: bool is_inplace() const { return (m_flags & table_entry_flags::inplace) == table_entry_flags::inplace; } }; -} // namespace -namespace std { -template <> -struct hash { - std::size_t operator()(const table_entry &entry) const { +// We'd make Util::Hash to be transparent instead. However, this would enable +// transparent hashing globally and in some cases in very undesired manner. So +// for now aim for more fine-grained approach. +struct TableEntryHash { + using is_transparent = void; + + // IMPORTANT: These two hashes MUST match in order for heterogenous + // lookup to work properly + size_t operator()(const table_entry &entry) const { return Util::hash(entry.string(), entry.length()); } + + size_t operator()(std::string_view entry) const { + return Util::hash(entry.data(), entry.length()); + } }; -} // namespace std -namespace { -std::unordered_set &cache() { - static std::unordered_set g_cache; +auto &cache() { + // We need node_hash_set due to SSO: we return address of embedded string + // that should be stable + static absl::node_hash_set> g_cache; return g_cache; } const char *save_to_cache(const char *string, std::size_t length, table_entry_flags flags) { - if ((flags & table_entry_flags::no_need_copy) == table_entry_flags::no_need_copy) { - return cache().emplace(string, length, flags).first->string(); - } + // Checks if string is already cached and if not, calls ctor to construct in + // place. As a result, only a single lookup is performed regardless whether + // entry is in cache or not. + return cache() + .lazy_emplace(table_entry(string, length, table_entry_flags::no_need_copy), + [string, length, flags](const auto &ctor) { ctor(string, length, flags); }) + ->string(); +} - // temporary table_entry, used for searching only. no need to copy string - auto found = cache().find(table_entry(string, length, table_entry_flags::no_need_copy)); +} // namespace - if (found == cache().end()) { - return cache().emplace(string, length, flags).first->string(); - } +bool cstring::is_cached(std::string_view s) { return cache().contains(s); } - return found->string(); -} +cstring cstring::get_cached(std::string_view s) { + auto entry = cache().find(s); + if (entry == cache().end()) return nullptr; -} // namespace + cstring res; + res.str = entry->string(); + + return res; +} void cstring::construct_from_shared(const char *string, std::size_t length) { str = save_to_cache(string, length, table_entry_flags::none); diff --git a/lib/cstring.h b/lib/cstring.h index 6abb264027..93d2c490b7 100644 --- a/lib/cstring.h +++ b/lib/cstring.h @@ -148,6 +148,11 @@ class cstring { return result; } + /// @return true if a given string is interned (contained in cstring cache) + static bool is_cached(std::string_view s); + /// @return corresponding cstring if it was interned, null cstring otherwise + static cstring get_cached(std::string_view s); + private: // passed string is shared, we not unique owners void construct_from_shared(const char *string, std::size_t length); diff --git a/test/gtest/cstring.cpp b/test/gtest/cstring.cpp index 38f360d236..66f315b62c 100644 --- a/test/gtest/cstring.cpp +++ b/test/gtest/cstring.cpp @@ -143,4 +143,19 @@ TEST(cstring, literalSuffix) { EXPECT_TRUE((std::is_same_v)); } +TEST(cstring, is_cached) { + [[maybe_unused]] cstring test = "test"_cs; + EXPECT_FALSE( + cstring::is_cached("we really do not expect that this string is already in cstring cache")); + EXPECT_TRUE(cstring::is_cached("test")); +} + +TEST(cstring, get_cached) { + [[maybe_unused]] cstring test = "test"_cs; + EXPECT_TRUE( + cstring::get_cached("we really do not expect that this string is already in cstring cache") + .isNull()); + EXPECT_FALSE(cstring::get_cached("test").isNullOrEmpty()); +} + } // namespace Test