aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/case_insensitive_string
diff options
context:
space:
mode:
authorvadim-xd <vadim-xd@yandex-team.com>2024-06-03 23:26:15 +0300
committervadim-xd <vadim-xd@yandex-team.com>2024-06-03 23:35:19 +0300
commitca99fc2f20163d67dbab313a7fdf6589d83ba220 (patch)
tree4fcb2ee7466822ee8650cfaf684ab44c3c8dd5e8 /library/cpp/case_insensitive_string
parent7d8657f7553a8c96975b883afc7f2b42bc558616 (diff)
downloadydb-ca99fc2f20163d67dbab313a7fdf6589d83ba220.tar.gz
Optimize hashing for case-insensitive strings
6e07ea929418b1fae4257a2af37aa0ed5799f22a
Diffstat (limited to 'library/cpp/case_insensitive_string')
-rw-r--r--library/cpp/case_insensitive_string/benchmark/main.cpp167
-rw-r--r--library/cpp/case_insensitive_string/benchmark/ya.make16
-rw-r--r--library/cpp/case_insensitive_string/case_insensitive_char_traits.cpp1
-rw-r--r--library/cpp/case_insensitive_string/case_insensitive_string.cpp22
-rw-r--r--library/cpp/case_insensitive_string/ut_gtest/case_insensitive_string_hash.cpp36
-rw-r--r--library/cpp/case_insensitive_string/ut_gtest/ya.make12
-rw-r--r--library/cpp/case_insensitive_string/ya.make6
7 files changed, 254 insertions, 6 deletions
diff --git a/library/cpp/case_insensitive_string/benchmark/main.cpp b/library/cpp/case_insensitive_string/benchmark/main.cpp
new file mode 100644
index 0000000000..136be50f37
--- /dev/null
+++ b/library/cpp/case_insensitive_string/benchmark/main.cpp
@@ -0,0 +1,167 @@
+#include <benchmark/benchmark.h>
+#include <library/cpp/case_insensitive_string/case_insensitive_string.h>
+#include <library/cpp/digest/murmur/murmur.h>
+
+#include <util/generic/hash_table.h>
+#include <util/generic/vector.h>
+#include <util/string/ascii.h>
+#include <util/random/random.h>
+
+namespace {
+ // THash<TCaseInsensitiveStringBuf>::operator() is not inlined
+ Y_NO_INLINE size_t NaiveHash(TCaseInsensitiveStringBuf str) noexcept {
+ TMurmurHash2A<size_t> hash;
+ for (size_t i = 0; i < str.size(); ++i) {
+ char lower = std::tolower(str[i]);
+ hash.Update(&lower, 1);
+ }
+ return hash.Value();
+ }
+
+ [[maybe_unused]] Y_NO_INLINE size_t OptimizedHashV1(TCaseInsensitiveStringBuf str) noexcept {
+ TMurmurHash2A<size_t> hash;
+ std::array<char, sizeof(size_t)> buf;
+ size_t headSize = str.size() - str.size() % buf.size();
+ for (size_t i = 0; i < headSize; i += buf.size()) {
+ for (size_t j = 0; j < buf.size(); ++j) {
+ buf[j] = std::tolower(str[i + j]);
+ }
+ hash.Update(buf.data(), buf.size());
+ }
+ for (size_t i = headSize; i < str.size(); ++i) {
+ char lower = std::tolower(str[i]);
+ hash.Update(&lower, 1);
+ }
+ return hash.Value();
+ }
+
+ [[maybe_unused]] Y_NO_INLINE size_t OptimizedHashDuplicateTailLoop(TCaseInsensitiveStringBuf str) noexcept {
+ TMurmurHash2A<size_t> hash;
+
+ if (str.size() < sizeof(size_t)) {
+ for (size_t i = 0; i < str.size(); ++i) {
+ char lower = std::tolower(str[i]);
+ hash.Update(&lower, 1);
+ }
+ return hash.Value();
+ }
+ std::array<char, sizeof(size_t)> buf;
+ size_t headSize = str.size() - str.size() % buf.size();
+ for (size_t i = 0; i < headSize; i += buf.size()) {
+ for (size_t j = 0; j < buf.size(); ++j) {
+ buf[j] = std::tolower(str[i + j]);
+ }
+ hash.Update(buf.data(), buf.size());
+ }
+ for (size_t i = headSize; i < str.size(); ++i) {
+ char lower = std::tolower(str[i]);
+ hash.Update(&lower, 1);
+ }
+ return hash.Value();
+ }
+
+ size_t HashTail(TMurmurHash2A<size_t>& hash, const char* data, size_t size) {
+ for (size_t i = 0; i < size; ++i) {
+ char lower = std::tolower(data[i]);
+ hash.Update(&lower, 1);
+ }
+ return hash.Value();
+ }
+
+ [[maybe_unused]] Y_NO_INLINE size_t OptimizedHashDuplicateTailLoopInFunc(TCaseInsensitiveStringBuf str) noexcept {
+ TMurmurHash2A<size_t> hash;
+ if (str.size() < sizeof(size_t)) {
+ return HashTail(hash, str.data(), str.size());
+ }
+ std::array<char, sizeof(size_t)> buf;
+ size_t headSize = str.size() - str.size() % buf.size();
+ for (size_t i = 0; i < headSize; i += buf.size()) {
+ for (size_t j = 0; j < buf.size(); ++j) {
+ buf[j] = std::tolower(str[i + j]);
+ }
+ hash.Update(buf.data(), buf.size());
+ }
+ return HashTail(hash, str.data() + headSize, str.size() - headSize);
+ }
+
+ // Currently shows the best performance, comparable with NaiveHash for short strings, only slower for empty strings.
+ // Should be equivalent to OptimizedHashV1 after inlining HashTail, but for some reason it's a bit larger but at the same time faster than OptimizedHashV1.
+ [[maybe_unused]] Y_NO_INLINE size_t OptimizedHashTailLoopInFunc(TCaseInsensitiveStringBuf str) noexcept {
+ TMurmurHash2A<size_t> hash;
+ std::array<char, sizeof(size_t)> buf;
+ size_t headSize = str.size() - str.size() % buf.size();
+ for (size_t i = 0; i < headSize; i += buf.size()) {
+ for (size_t j = 0; j < buf.size(); ++j) {
+ buf[j] = std::tolower(str[i + j]);
+ }
+ hash.Update(buf.data(), buf.size());
+ }
+ return HashTail(hash, str.data() + headSize, str.size() - headSize);
+ }
+
+ Y_FORCE_INLINE size_t DefaultHash(TCaseInsensitiveStringBuf str) {
+ return ComputeHash(str);
+ }
+}
+
+template <auto Impl>
+void CaseInsensitiveHash(benchmark::State& state) {
+ SetRandomSeed(123 + state.range());
+ TCaseInsensitiveString str;
+ for (int i = 0; i < state.range(); ++i) {
+ str.push_back(RandomNumber<unsigned char>());
+ }
+ Y_ENSURE(Impl(str) == NaiveHash(str));
+ for (auto _ : state) {
+ size_t hash = Impl(str);
+ benchmark::DoNotOptimize(hash);
+ }
+}
+
+template <auto Impl>
+void CaseInsensitiveHashRandomSizes(benchmark::State& state) {
+ SetRandomSeed(123);
+ size_t minStrLen = static_cast<size_t>(state.range(0));
+ size_t maxStrLen = static_cast<size_t>(state.range(1));
+ static constexpr size_t nStrings = 64;
+ TVector<TString> stringStorage(Reserve(nStrings));
+ std::array<TCaseInsensitiveStringBuf, nStrings> strings;
+ for (size_t i = 0; i < nStrings; ++i) {
+ auto& str = stringStorage.emplace_back();
+ size_t strLen = minStrLen + RandomNumber(maxStrLen - minStrLen + 1);
+ for (size_t i = 0; i < strLen; ++i) {
+ str.push_back(RandomNumber<unsigned char>());
+ }
+ strings[i] = str;
+ }
+ for (auto _ : state) {
+ for (auto str : strings) {
+ size_t hash = Impl(str);
+ benchmark::DoNotOptimize(hash);
+ }
+ }
+}
+
+#define BENCH_ARGS ArgName("strlen")->DenseRange(0, sizeof(size_t) + 1)->RangeMultiplier(2)->Range(sizeof(size_t) * 2, 64)
+
+BENCHMARK(CaseInsensitiveHash<NaiveHash>)->BENCH_ARGS;
+BENCHMARK(CaseInsensitiveHash<DefaultHash>)->BENCH_ARGS;
+#ifdef BENCHMARK_ALL_IMPLS
+BENCHMARK(CaseInsensitiveHash<OptimizedHashV1>)->BENCH_ARGS;
+BENCHMARK(CaseInsensitiveHash<OptimizedHashDuplicateTailLoop>)->BENCH_ARGS;
+BENCHMARK(CaseInsensitiveHash<OptimizedHashDuplicateTailLoopInFunc>)->BENCH_ARGS;
+BENCHMARK(CaseInsensitiveHash<OptimizedHashTailLoopInFunc>)->BENCH_ARGS;
+#endif
+
+#undef BENCH_ARGS
+
+#define BENCH_ARGS \
+ ArgNames({"minStrLen", "maxStrLen"}) \
+ ->ArgPair(4, 16) /*Approximate http header lengths (real distribution may differ)*/ \
+ ->ArgPair(4, 11) /*1/2 short, 1/2 long to check possible branch mispredictions*/ \
+ ->ArgPair(1, 8) /*Mostly short strings to check performance regression*/
+
+BENCHMARK(CaseInsensitiveHashRandomSizes<NaiveHash>)->BENCH_ARGS;
+BENCHMARK(CaseInsensitiveHashRandomSizes<DefaultHash>)->BENCH_ARGS;
+
+#undef BENCH_ARGS
diff --git a/library/cpp/case_insensitive_string/benchmark/ya.make b/library/cpp/case_insensitive_string/benchmark/ya.make
new file mode 100644
index 0000000000..505faf41de
--- /dev/null
+++ b/library/cpp/case_insensitive_string/benchmark/ya.make
@@ -0,0 +1,16 @@
+G_BENCHMARK()
+
+IF (NOT AUTOCHECK)
+ CFLAGS(-DBENCHMARK_ALL_IMPLS)
+ENDIF()
+
+SRCS(
+ main.cpp
+)
+
+PEERDIR(
+ library/cpp/case_insensitive_string
+ library/cpp/digest/murmur
+)
+
+END()
diff --git a/library/cpp/case_insensitive_string/case_insensitive_char_traits.cpp b/library/cpp/case_insensitive_string/case_insensitive_char_traits.cpp
index 14e6d1d51f..78de43343e 100644
--- a/library/cpp/case_insensitive_string/case_insensitive_char_traits.cpp
+++ b/library/cpp/case_insensitive_string/case_insensitive_char_traits.cpp
@@ -31,4 +31,3 @@ TCaseInsensitiveString EscapeC(const TCaseInsensitiveString& str) {
const auto result = EscapeC(str.data(), str.size());
return {result.data(), result.size()};
}
-
diff --git a/library/cpp/case_insensitive_string/case_insensitive_string.cpp b/library/cpp/case_insensitive_string/case_insensitive_string.cpp
index 16c0f5ff7a..dce0ff4af8 100644
--- a/library/cpp/case_insensitive_string/case_insensitive_string.cpp
+++ b/library/cpp/case_insensitive_string/case_insensitive_string.cpp
@@ -2,15 +2,29 @@
#include <library/cpp/digest/murmur/murmur.h>
-size_t THash<TCaseInsensitiveStringBuf>::operator()(TCaseInsensitiveStringBuf str) const noexcept {
- TMurmurHash2A<size_t> hash;
- for (size_t i = 0; i < str.size(); ++i) {
- char lower = std::tolower(str[i]);
+#include <array>
+
+static size_t HashTail(TMurmurHash2A<size_t>& hash, const char* data, size_t size) {
+ for (size_t i = 0; i < size; ++i) {
+ char lower = std::tolower(data[i]);
hash.Update(&lower, 1);
}
return hash.Value();
}
+size_t THash<TCaseInsensitiveStringBuf>::operator()(TCaseInsensitiveStringBuf str) const noexcept {
+ TMurmurHash2A<size_t> hash;
+ std::array<char, sizeof(size_t)> buf;
+ size_t headSize = str.size() - str.size() % buf.size();
+ for (size_t i = 0; i < headSize; i += buf.size()) {
+ for (size_t j = 0; j < buf.size(); ++j) {
+ buf[j] = std::tolower(str[i + j]);
+ }
+ hash.Update(buf.data(), buf.size());
+ }
+ return HashTail(hash, str.data() + headSize, str.size() - headSize);
+}
+
template <>
void Out<TCaseInsensitiveString>(IOutputStream& o, const TCaseInsensitiveString& p) {
o.Write(p.data(), p.size());
diff --git a/library/cpp/case_insensitive_string/ut_gtest/case_insensitive_string_hash.cpp b/library/cpp/case_insensitive_string/ut_gtest/case_insensitive_string_hash.cpp
new file mode 100644
index 0000000000..893494c96d
--- /dev/null
+++ b/library/cpp/case_insensitive_string/ut_gtest/case_insensitive_string_hash.cpp
@@ -0,0 +1,36 @@
+#include <library/cpp/case_insensitive_string/case_insensitive_string.h>
+#include <library/cpp/digest/murmur/murmur.h>
+#include <library/cpp/testing/gtest/gtest.h>
+
+#include <util/generic/hash_table.h>
+#include <util/random/random.h>
+
+TEST(CaseInsensitiveString, Hash) {
+ size_t h1 = ComputeHash(TCaseInsensitiveString("some long string..."));
+ size_t h2 = ComputeHash(TCaseInsensitiveString("Some Long String..."));
+ EXPECT_EQ(h1, h2);
+ size_t otherHash = ComputeHash(TCaseInsensitiveString("other long string..."));
+ EXPECT_NE(h1, otherHash);
+}
+
+namespace {
+ size_t NaiveCaseInsensitiveHash(TCaseInsensitiveStringBuf str) noexcept {
+ TMurmurHash2A<size_t> hash;
+ for (size_t i = 0; i < str.size(); ++i) {
+ char lower = std::tolower(str[i]);
+ hash.Update(&lower, 1);
+ }
+ return hash.Value();
+ }
+}
+
+TEST(CaseInsensitiveString, HashValues) {
+ SetRandomSeed(123);
+ for (size_t n = 0; n < 64; ++n) {
+ TCaseInsensitiveString s;
+ for (size_t i = 0; i < n; ++i) {
+ s.push_back(RandomNumber<unsigned char>());
+ }
+ EXPECT_EQ(ComputeHash(s), NaiveCaseInsensitiveHash(s)) << "Hashes for \"" << s << "\" differ";
+ }
+}
diff --git a/library/cpp/case_insensitive_string/ut_gtest/ya.make b/library/cpp/case_insensitive_string/ut_gtest/ya.make
new file mode 100644
index 0000000000..2dbb44d87b
--- /dev/null
+++ b/library/cpp/case_insensitive_string/ut_gtest/ya.make
@@ -0,0 +1,12 @@
+GTEST()
+
+SRCS(
+ case_insensitive_string_hash.cpp
+)
+
+PEERDIR(
+ library/cpp/case_insensitive_string
+ library/cpp/digest/murmur
+)
+
+END()
diff --git a/library/cpp/case_insensitive_string/ya.make b/library/cpp/case_insensitive_string/ya.make
index 1057ab7ada..b05f167118 100644
--- a/library/cpp/case_insensitive_string/ya.make
+++ b/library/cpp/case_insensitive_string/ya.make
@@ -12,4 +12,8 @@ PEERDIR(
END()
-RECURSE_FOR_TESTS(ut)
+RECURSE_FOR_TESTS(
+ benchmark
+ ut
+ ut_gtest
+)