diff options
author | vadim-xd <vadim-xd@yandex-team.com> | 2024-06-03 23:26:15 +0300 |
---|---|---|
committer | vadim-xd <vadim-xd@yandex-team.com> | 2024-06-03 23:35:19 +0300 |
commit | ca99fc2f20163d67dbab313a7fdf6589d83ba220 (patch) | |
tree | 4fcb2ee7466822ee8650cfaf684ab44c3c8dd5e8 /library/cpp/case_insensitive_string | |
parent | 7d8657f7553a8c96975b883afc7f2b42bc558616 (diff) | |
download | ydb-ca99fc2f20163d67dbab313a7fdf6589d83ba220.tar.gz |
Optimize hashing for case-insensitive strings
6e07ea929418b1fae4257a2af37aa0ed5799f22a
Diffstat (limited to 'library/cpp/case_insensitive_string')
7 files changed, 254 insertions, 6 deletions
diff --git a/library/cpp/case_insensitive_string/benchmark/main.cpp b/library/cpp/case_insensitive_string/benchmark/main.cpp new file mode 100644 index 0000000000..136be50f37 --- /dev/null +++ b/library/cpp/case_insensitive_string/benchmark/main.cpp @@ -0,0 +1,167 @@ +#include <benchmark/benchmark.h> +#include <library/cpp/case_insensitive_string/case_insensitive_string.h> +#include <library/cpp/digest/murmur/murmur.h> + +#include <util/generic/hash_table.h> +#include <util/generic/vector.h> +#include <util/string/ascii.h> +#include <util/random/random.h> + +namespace { + // THash<TCaseInsensitiveStringBuf>::operator() is not inlined + Y_NO_INLINE size_t NaiveHash(TCaseInsensitiveStringBuf str) noexcept { + TMurmurHash2A<size_t> hash; + for (size_t i = 0; i < str.size(); ++i) { + char lower = std::tolower(str[i]); + hash.Update(&lower, 1); + } + return hash.Value(); + } + + [[maybe_unused]] Y_NO_INLINE size_t OptimizedHashV1(TCaseInsensitiveStringBuf str) noexcept { + TMurmurHash2A<size_t> hash; + std::array<char, sizeof(size_t)> buf; + size_t headSize = str.size() - str.size() % buf.size(); + for (size_t i = 0; i < headSize; i += buf.size()) { + for (size_t j = 0; j < buf.size(); ++j) { + buf[j] = std::tolower(str[i + j]); + } + hash.Update(buf.data(), buf.size()); + } + for (size_t i = headSize; i < str.size(); ++i) { + char lower = std::tolower(str[i]); + hash.Update(&lower, 1); + } + return hash.Value(); + } + + [[maybe_unused]] Y_NO_INLINE size_t OptimizedHashDuplicateTailLoop(TCaseInsensitiveStringBuf str) noexcept { + TMurmurHash2A<size_t> hash; + + if (str.size() < sizeof(size_t)) { + for (size_t i = 0; i < str.size(); ++i) { + char lower = std::tolower(str[i]); + hash.Update(&lower, 1); + } + return hash.Value(); + } + std::array<char, sizeof(size_t)> buf; + size_t headSize = str.size() - str.size() % buf.size(); + for (size_t i = 0; i < headSize; i += buf.size()) { + for (size_t j = 0; j < buf.size(); ++j) { + buf[j] = std::tolower(str[i + j]); + } + hash.Update(buf.data(), buf.size()); + } + for (size_t i = headSize; i < str.size(); ++i) { + char lower = std::tolower(str[i]); + hash.Update(&lower, 1); + } + return hash.Value(); + } + + size_t HashTail(TMurmurHash2A<size_t>& hash, const char* data, size_t size) { + for (size_t i = 0; i < size; ++i) { + char lower = std::tolower(data[i]); + hash.Update(&lower, 1); + } + return hash.Value(); + } + + [[maybe_unused]] Y_NO_INLINE size_t OptimizedHashDuplicateTailLoopInFunc(TCaseInsensitiveStringBuf str) noexcept { + TMurmurHash2A<size_t> hash; + if (str.size() < sizeof(size_t)) { + return HashTail(hash, str.data(), str.size()); + } + std::array<char, sizeof(size_t)> buf; + size_t headSize = str.size() - str.size() % buf.size(); + for (size_t i = 0; i < headSize; i += buf.size()) { + for (size_t j = 0; j < buf.size(); ++j) { + buf[j] = std::tolower(str[i + j]); + } + hash.Update(buf.data(), buf.size()); + } + return HashTail(hash, str.data() + headSize, str.size() - headSize); + } + + // Currently shows the best performance, comparable with NaiveHash for short strings, only slower for empty strings. + // Should be equivalent to OptimizedHashV1 after inlining HashTail, but for some reason it's a bit larger but at the same time faster than OptimizedHashV1. + [[maybe_unused]] Y_NO_INLINE size_t OptimizedHashTailLoopInFunc(TCaseInsensitiveStringBuf str) noexcept { + TMurmurHash2A<size_t> hash; + std::array<char, sizeof(size_t)> buf; + size_t headSize = str.size() - str.size() % buf.size(); + for (size_t i = 0; i < headSize; i += buf.size()) { + for (size_t j = 0; j < buf.size(); ++j) { + buf[j] = std::tolower(str[i + j]); + } + hash.Update(buf.data(), buf.size()); + } + return HashTail(hash, str.data() + headSize, str.size() - headSize); + } + + Y_FORCE_INLINE size_t DefaultHash(TCaseInsensitiveStringBuf str) { + return ComputeHash(str); + } +} + +template <auto Impl> +void CaseInsensitiveHash(benchmark::State& state) { + SetRandomSeed(123 + state.range()); + TCaseInsensitiveString str; + for (int i = 0; i < state.range(); ++i) { + str.push_back(RandomNumber<unsigned char>()); + } + Y_ENSURE(Impl(str) == NaiveHash(str)); + for (auto _ : state) { + size_t hash = Impl(str); + benchmark::DoNotOptimize(hash); + } +} + +template <auto Impl> +void CaseInsensitiveHashRandomSizes(benchmark::State& state) { + SetRandomSeed(123); + size_t minStrLen = static_cast<size_t>(state.range(0)); + size_t maxStrLen = static_cast<size_t>(state.range(1)); + static constexpr size_t nStrings = 64; + TVector<TString> stringStorage(Reserve(nStrings)); + std::array<TCaseInsensitiveStringBuf, nStrings> strings; + for (size_t i = 0; i < nStrings; ++i) { + auto& str = stringStorage.emplace_back(); + size_t strLen = minStrLen + RandomNumber(maxStrLen - minStrLen + 1); + for (size_t i = 0; i < strLen; ++i) { + str.push_back(RandomNumber<unsigned char>()); + } + strings[i] = str; + } + for (auto _ : state) { + for (auto str : strings) { + size_t hash = Impl(str); + benchmark::DoNotOptimize(hash); + } + } +} + +#define BENCH_ARGS ArgName("strlen")->DenseRange(0, sizeof(size_t) + 1)->RangeMultiplier(2)->Range(sizeof(size_t) * 2, 64) + +BENCHMARK(CaseInsensitiveHash<NaiveHash>)->BENCH_ARGS; +BENCHMARK(CaseInsensitiveHash<DefaultHash>)->BENCH_ARGS; +#ifdef BENCHMARK_ALL_IMPLS +BENCHMARK(CaseInsensitiveHash<OptimizedHashV1>)->BENCH_ARGS; +BENCHMARK(CaseInsensitiveHash<OptimizedHashDuplicateTailLoop>)->BENCH_ARGS; +BENCHMARK(CaseInsensitiveHash<OptimizedHashDuplicateTailLoopInFunc>)->BENCH_ARGS; +BENCHMARK(CaseInsensitiveHash<OptimizedHashTailLoopInFunc>)->BENCH_ARGS; +#endif + +#undef BENCH_ARGS + +#define BENCH_ARGS \ + ArgNames({"minStrLen", "maxStrLen"}) \ + ->ArgPair(4, 16) /*Approximate http header lengths (real distribution may differ)*/ \ + ->ArgPair(4, 11) /*1/2 short, 1/2 long to check possible branch mispredictions*/ \ + ->ArgPair(1, 8) /*Mostly short strings to check performance regression*/ + +BENCHMARK(CaseInsensitiveHashRandomSizes<NaiveHash>)->BENCH_ARGS; +BENCHMARK(CaseInsensitiveHashRandomSizes<DefaultHash>)->BENCH_ARGS; + +#undef BENCH_ARGS diff --git a/library/cpp/case_insensitive_string/benchmark/ya.make b/library/cpp/case_insensitive_string/benchmark/ya.make new file mode 100644 index 0000000000..505faf41de --- /dev/null +++ b/library/cpp/case_insensitive_string/benchmark/ya.make @@ -0,0 +1,16 @@ +G_BENCHMARK() + +IF (NOT AUTOCHECK) + CFLAGS(-DBENCHMARK_ALL_IMPLS) +ENDIF() + +SRCS( + main.cpp +) + +PEERDIR( + library/cpp/case_insensitive_string + library/cpp/digest/murmur +) + +END() diff --git a/library/cpp/case_insensitive_string/case_insensitive_char_traits.cpp b/library/cpp/case_insensitive_string/case_insensitive_char_traits.cpp index 14e6d1d51f..78de43343e 100644 --- a/library/cpp/case_insensitive_string/case_insensitive_char_traits.cpp +++ b/library/cpp/case_insensitive_string/case_insensitive_char_traits.cpp @@ -31,4 +31,3 @@ TCaseInsensitiveString EscapeC(const TCaseInsensitiveString& str) { const auto result = EscapeC(str.data(), str.size()); return {result.data(), result.size()}; } - diff --git a/library/cpp/case_insensitive_string/case_insensitive_string.cpp b/library/cpp/case_insensitive_string/case_insensitive_string.cpp index 16c0f5ff7a..dce0ff4af8 100644 --- a/library/cpp/case_insensitive_string/case_insensitive_string.cpp +++ b/library/cpp/case_insensitive_string/case_insensitive_string.cpp @@ -2,15 +2,29 @@ #include <library/cpp/digest/murmur/murmur.h> -size_t THash<TCaseInsensitiveStringBuf>::operator()(TCaseInsensitiveStringBuf str) const noexcept { - TMurmurHash2A<size_t> hash; - for (size_t i = 0; i < str.size(); ++i) { - char lower = std::tolower(str[i]); +#include <array> + +static size_t HashTail(TMurmurHash2A<size_t>& hash, const char* data, size_t size) { + for (size_t i = 0; i < size; ++i) { + char lower = std::tolower(data[i]); hash.Update(&lower, 1); } return hash.Value(); } +size_t THash<TCaseInsensitiveStringBuf>::operator()(TCaseInsensitiveStringBuf str) const noexcept { + TMurmurHash2A<size_t> hash; + std::array<char, sizeof(size_t)> buf; + size_t headSize = str.size() - str.size() % buf.size(); + for (size_t i = 0; i < headSize; i += buf.size()) { + for (size_t j = 0; j < buf.size(); ++j) { + buf[j] = std::tolower(str[i + j]); + } + hash.Update(buf.data(), buf.size()); + } + return HashTail(hash, str.data() + headSize, str.size() - headSize); +} + template <> void Out<TCaseInsensitiveString>(IOutputStream& o, const TCaseInsensitiveString& p) { o.Write(p.data(), p.size()); diff --git a/library/cpp/case_insensitive_string/ut_gtest/case_insensitive_string_hash.cpp b/library/cpp/case_insensitive_string/ut_gtest/case_insensitive_string_hash.cpp new file mode 100644 index 0000000000..893494c96d --- /dev/null +++ b/library/cpp/case_insensitive_string/ut_gtest/case_insensitive_string_hash.cpp @@ -0,0 +1,36 @@ +#include <library/cpp/case_insensitive_string/case_insensitive_string.h> +#include <library/cpp/digest/murmur/murmur.h> +#include <library/cpp/testing/gtest/gtest.h> + +#include <util/generic/hash_table.h> +#include <util/random/random.h> + +TEST(CaseInsensitiveString, Hash) { + size_t h1 = ComputeHash(TCaseInsensitiveString("some long string...")); + size_t h2 = ComputeHash(TCaseInsensitiveString("Some Long String...")); + EXPECT_EQ(h1, h2); + size_t otherHash = ComputeHash(TCaseInsensitiveString("other long string...")); + EXPECT_NE(h1, otherHash); +} + +namespace { + size_t NaiveCaseInsensitiveHash(TCaseInsensitiveStringBuf str) noexcept { + TMurmurHash2A<size_t> hash; + for (size_t i = 0; i < str.size(); ++i) { + char lower = std::tolower(str[i]); + hash.Update(&lower, 1); + } + return hash.Value(); + } +} + +TEST(CaseInsensitiveString, HashValues) { + SetRandomSeed(123); + for (size_t n = 0; n < 64; ++n) { + TCaseInsensitiveString s; + for (size_t i = 0; i < n; ++i) { + s.push_back(RandomNumber<unsigned char>()); + } + EXPECT_EQ(ComputeHash(s), NaiveCaseInsensitiveHash(s)) << "Hashes for \"" << s << "\" differ"; + } +} diff --git a/library/cpp/case_insensitive_string/ut_gtest/ya.make b/library/cpp/case_insensitive_string/ut_gtest/ya.make new file mode 100644 index 0000000000..2dbb44d87b --- /dev/null +++ b/library/cpp/case_insensitive_string/ut_gtest/ya.make @@ -0,0 +1,12 @@ +GTEST() + +SRCS( + case_insensitive_string_hash.cpp +) + +PEERDIR( + library/cpp/case_insensitive_string + library/cpp/digest/murmur +) + +END() diff --git a/library/cpp/case_insensitive_string/ya.make b/library/cpp/case_insensitive_string/ya.make index 1057ab7ada..b05f167118 100644 --- a/library/cpp/case_insensitive_string/ya.make +++ b/library/cpp/case_insensitive_string/ya.make @@ -12,4 +12,8 @@ PEERDIR( END() -RECURSE_FOR_TESTS(ut) +RECURSE_FOR_TESTS( + benchmark + ut + ut_gtest +) |