diff options
author | PashaSmirnov <leue@yandex.ru> | 2022-02-10 16:50:38 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:50:38 +0300 |
commit | e440fc473cc191fe94becca370781e148709007a (patch) | |
tree | 94e9670fbe940a9d7e53dc7775b45b3e4ede0187 /library | |
parent | 16aec54b5c7e5d07400c5ee9122ee34839bff587 (diff) | |
download | ydb-e440fc473cc191fe94becca370781e148709007a.tar.gz |
Restoring authorship annotation for PashaSmirnov <leue@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library')
-rw-r--r-- | library/cpp/containers/comptrie/README.md | 40 | ||||
-rw-r--r-- | library/cpp/containers/comptrie/benchmark/main.cpp | 502 | ||||
-rw-r--r-- | library/cpp/containers/comptrie/benchmark/ya.make | 24 | ||||
-rw-r--r-- | library/cpp/containers/comptrie/comptrie_builder.inl | 6 | ||||
-rw-r--r-- | library/cpp/containers/comptrie/comptrie_ut.cpp | 460 | ||||
-rw-r--r-- | library/cpp/containers/comptrie/pattern_searcher.h | 1202 | ||||
-rw-r--r-- | library/cpp/containers/comptrie/search_iterator.h | 88 |
7 files changed, 1161 insertions, 1161 deletions
diff --git a/library/cpp/containers/comptrie/README.md b/library/cpp/containers/comptrie/README.md index 43c298e2c8..64c40dc974 100644 --- a/library/cpp/containers/comptrie/README.md +++ b/library/cpp/containers/comptrie/README.md @@ -210,23 +210,23 @@ function itself instead of examining the flags. Note that the ε-link flags do not use the MT_SIZEMASK << MT_LEFTSHIFT` bits, which allows us to start using ε-links for some other purpose. - -Pattern Searcher -================ - -This is an implementation of Aho-Corasick algorithm on compact trie structure. -In order to create a pattern searcher one must fill a TCompactPatternSearcherBuilder -with patterns and call SaveAsPatternSearcher or SaveToFileAsPatternSearcher. -Then TCompactPatternSearcher must be created from the builder output. - -### Implementation details ### - -Aho-Corasick algorithm stores a suffix link in each node. -A suffix link of a node is the offset (relative to this node) of the largest suffix -of a string this node represents which is present in a trie. -Current implementation also stores a shortcut link to the largest suffix -for which the corresponding node in a trie is a final node. -These two links are stored as NCompactTrie::TSuffixLink structure of two 64-bit -integers. -In a trie layout these links are stored for each node right after the two bytes -containing service flags and a symbol. + +Pattern Searcher +================ + +This is an implementation of Aho-Corasick algorithm on compact trie structure. +In order to create a pattern searcher one must fill a TCompactPatternSearcherBuilder +with patterns and call SaveAsPatternSearcher or SaveToFileAsPatternSearcher. +Then TCompactPatternSearcher must be created from the builder output. + +### Implementation details ### + +Aho-Corasick algorithm stores a suffix link in each node. +A suffix link of a node is the offset (relative to this node) of the largest suffix +of a string this node represents which is present in a trie. +Current implementation also stores a shortcut link to the largest suffix +for which the corresponding node in a trie is a final node. +These two links are stored as NCompactTrie::TSuffixLink structure of two 64-bit +integers. +In a trie layout these links are stored for each node right after the two bytes +containing service flags and a symbol. diff --git a/library/cpp/containers/comptrie/benchmark/main.cpp b/library/cpp/containers/comptrie/benchmark/main.cpp index 6e42dad18a..2d024eb2f2 100644 --- a/library/cpp/containers/comptrie/benchmark/main.cpp +++ b/library/cpp/containers/comptrie/benchmark/main.cpp @@ -1,260 +1,260 @@ #include <library/cpp/testing/benchmark/bench.h> - + #include <library/cpp/containers/comptrie/comptrie_trie.h> #include <library/cpp/containers/comptrie/comptrie_builder.h> #include <library/cpp/containers/comptrie/search_iterator.h> #include <library/cpp/containers/comptrie/pattern_searcher.h> - + #include <library/cpp/on_disk/aho_corasick/writer.h> #include <library/cpp/on_disk/aho_corasick/reader.h> #include <library/cpp/on_disk/aho_corasick/helpers.h> - + #include <library/cpp/containers/dense_hash/dense_hash.h> - -#include <util/stream/file.h> -#include <util/generic/algorithm.h> -#include <util/random/fast.h> -#include <util/random/shuffle.h> - -///////////////// -// COMMON DATA // -///////////////// - -const size_t MAX_PATTERN_LENGTH = 11; - -TVector<TString> letters = { - "а", "б", "в", "г", "д", "е", "ё", "ж", "з", "и", "й", - "к", "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф", - "х", "ц", "ч", "ж", "щ", "ъ", "ы", "ь", "э", "ю", "я" -}; - -TString GenerateOneString( - TFastRng<ui64>& rng, - size_t maxLength, - const TVector<TString>& sequences -) { - size_t length = rng.GenRand() % maxLength + 1; - TString result; - while (result.size() < length) { - result += sequences[rng.GenRand() % sequences.size()]; - } - return result; -} - -TVector<TString> GenerateStrings( - TFastRng<ui64>& rng, - size_t num, - size_t maxLength, - const TVector<TString>& sequences -) { - TVector<TString> strings; - while (strings.size() < num) { - strings.push_back(GenerateOneString(rng, maxLength, sequences)); - } - return strings; -} - -struct TDatasetInstance { - TDatasetInstance(const TVector<TString>& sequences) { - TFastRng<ui64> rng(0); - - TVector<TString> prefixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences); - prefixes.push_back(""); - - TVector<TString> roots = GenerateStrings(rng, /*num*/1000, /*maxLength*/5, sequences); - - TVector<TString> suffixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences); - suffixes.push_back(""); - - TVector<TString> dictionary; - for (const auto& root : roots) { - for (const auto& prefix : prefixes) { - for (const auto& suffix : suffixes) { - dictionary.push_back(prefix + root + suffix); - Y_ASSERT(dictionary.back().size() < MAX_PATTERN_LENGTH); - } - } - } - Shuffle(dictionary.begin(), dictionary.end()); - - Patterns.assign(dictionary.begin(), dictionary.begin() + 10'000); - - for (size_t sampleIdx = 0; sampleIdx < /*samplesNum*/1'000'000; ++sampleIdx) { - Samples.emplace_back(); - size_t wordsNum = rng.GenRand() % 10; - for (size_t wordIdx = 0; wordIdx < wordsNum; ++wordIdx) { - if (wordIdx > 0) { - Samples.back() += " "; - } - Samples.back() += dictionary[rng.GenRand() % dictionary.size()]; - } - } - }; - - TString GetSample(size_t iteration) const { - TFastRng<ui64> rng(iteration); - return Samples[rng.GenRand() % Samples.size()]; - } - - - TVector<TString> Patterns; - TVector<TString> Samples; -}; - -static const TDatasetInstance dataset(letters); - -////////////////////////// -// NEW PATTERN SEARCHER // -////////////////////////// - -struct TPatternSearcherInstance { - TPatternSearcherInstance() { - TCompactPatternSearcherBuilder<char, ui32> builder; - - for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) { - builder.Add(dataset.Patterns[patternId], patternId); - } - - TBufferOutput buffer; - builder.Save(buffer); - - Instance.Reset( - new TCompactPatternSearcher<char, ui32>( - buffer.Buffer().Data(), - buffer.Buffer().Size() - ) - ); - } - - THolder<TCompactPatternSearcher<char, ui32>> Instance; -}; - -static const TPatternSearcherInstance patternSearcherInstance; - -Y_CPU_BENCHMARK(PatternSearcher, iface) { - TVector<TVector<std::pair<ui32, ui32>>> result; - for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) { - result.emplace_back(); - TString testString = dataset.GetSample(iteration); - auto matches = patternSearcherInstance.Instance->SearchMatches(testString); - for (auto& match : matches) { - result.back().emplace_back(match.End, match.Data); - } - } -} - -////////////////////// -// OLD AHO CORASICK // -////////////////////// - -struct TAhoCorasickInstance { - TAhoCorasickInstance() { - TAhoCorasickBuilder<TString, ui32> builder; - - for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) { - builder.AddString(dataset.Patterns[patternId], patternId); - } - - TBufferOutput buffer; - builder.SaveToStream(&buffer); - - Instance.Reset(new TDefaultMappedAhoCorasick(TBlob::FromBuffer(buffer.Buffer()))); - }; - - THolder<TDefaultMappedAhoCorasick> Instance; -}; - -static const TAhoCorasickInstance ahoCorasickInstance; - -Y_CPU_BENCHMARK(AhoCorasick, iface) { - TVector<TDeque<std::pair<ui32, ui32>>> result; - for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) { - result.emplace_back(); - TString testString = dataset.GetSample(iteration); - auto matches = ahoCorasickInstance.Instance->AhoSearch(testString); - result.push_back(matches); - } -} - -//////////////////////////////// -// COMPTRIE + SIMPLE MATCHING // -//////////////////////////////// - -struct TCompactTrieInstance { - TCompactTrieInstance() { - TCompactTrieBuilder<char, ui32> builder; - - for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) { - builder.Add(dataset.Patterns[patternId], patternId); - } - - - TBufferOutput buffer; - CompactTrieMinimizeAndMakeFastLayout(buffer, builder); - - Instance.Reset(new TCompactTrie<char, ui32>( - buffer.Buffer().Data(), - buffer.Buffer().Size() - )); - } - - THolder<TCompactTrie<char, ui32>> Instance; -}; - -static const TCompactTrieInstance compactTrieInstance; - -Y_CPU_BENCHMARK(ComptrieSimple, iface) { - TVector<TVector<std::pair<ui32, ui32>>> result; - for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) { - result.emplace_back(); - TString testString = dataset.GetSample(iteration); - for (ui32 startPos = 0; startPos < testString.size(); ++startPos) { - TSearchIterator<TCompactTrie<char, ui32>> iter(*(compactTrieInstance.Instance)); - for (ui32 position = startPos; position < testString.size(); ++position) { - if (!iter.Advance(testString[position])) { - break; - } - ui32 answer; - if (iter.GetValue(&answer)) { - result.back().emplace_back(position, answer); - } - } - } - } -} - -//////////////// -// DENSE_HASH // -//////////////// - -struct TDenseHashInstance { - TDenseHashInstance() { - for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) { - Instance[dataset.Patterns[patternId]] = patternId; - } - } - - TDenseHash<TString, ui32> Instance; -}; - -static const TDenseHashInstance denseHashInstance; - -Y_CPU_BENCHMARK(DenseHash, iface) { - TVector<TVector<std::pair<ui32, ui32>>> result; - for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) { - result.emplace_back(); - TString testString = dataset.GetSample(iteration); - for (size_t start = 0; start < testString.size(); ++start) { - for ( - size_t length = 1; - length <= MAX_PATTERN_LENGTH && start + length <= testString.size(); - ++length - ) { - auto value = denseHashInstance.Instance.find(testString.substr(start, length)); - if (value != denseHashInstance.Instance.end()) { - result.back().emplace_back(start + length - 1, value->second); - } - } - } - } -} + +#include <util/stream/file.h> +#include <util/generic/algorithm.h> +#include <util/random/fast.h> +#include <util/random/shuffle.h> + +///////////////// +// COMMON DATA // +///////////////// + +const size_t MAX_PATTERN_LENGTH = 11; + +TVector<TString> letters = { + "а", "б", "в", "г", "д", "е", "ё", "ж", "з", "и", "й", + "к", "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф", + "х", "ц", "ч", "ж", "щ", "ъ", "ы", "ь", "э", "ю", "я" +}; + +TString GenerateOneString( + TFastRng<ui64>& rng, + size_t maxLength, + const TVector<TString>& sequences +) { + size_t length = rng.GenRand() % maxLength + 1; + TString result; + while (result.size() < length) { + result += sequences[rng.GenRand() % sequences.size()]; + } + return result; +} + +TVector<TString> GenerateStrings( + TFastRng<ui64>& rng, + size_t num, + size_t maxLength, + const TVector<TString>& sequences +) { + TVector<TString> strings; + while (strings.size() < num) { + strings.push_back(GenerateOneString(rng, maxLength, sequences)); + } + return strings; +} + +struct TDatasetInstance { + TDatasetInstance(const TVector<TString>& sequences) { + TFastRng<ui64> rng(0); + + TVector<TString> prefixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences); + prefixes.push_back(""); + + TVector<TString> roots = GenerateStrings(rng, /*num*/1000, /*maxLength*/5, sequences); + + TVector<TString> suffixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences); + suffixes.push_back(""); + + TVector<TString> dictionary; + for (const auto& root : roots) { + for (const auto& prefix : prefixes) { + for (const auto& suffix : suffixes) { + dictionary.push_back(prefix + root + suffix); + Y_ASSERT(dictionary.back().size() < MAX_PATTERN_LENGTH); + } + } + } + Shuffle(dictionary.begin(), dictionary.end()); + + Patterns.assign(dictionary.begin(), dictionary.begin() + 10'000); + + for (size_t sampleIdx = 0; sampleIdx < /*samplesNum*/1'000'000; ++sampleIdx) { + Samples.emplace_back(); + size_t wordsNum = rng.GenRand() % 10; + for (size_t wordIdx = 0; wordIdx < wordsNum; ++wordIdx) { + if (wordIdx > 0) { + Samples.back() += " "; + } + Samples.back() += dictionary[rng.GenRand() % dictionary.size()]; + } + } + }; + + TString GetSample(size_t iteration) const { + TFastRng<ui64> rng(iteration); + return Samples[rng.GenRand() % Samples.size()]; + } + + + TVector<TString> Patterns; + TVector<TString> Samples; +}; + +static const TDatasetInstance dataset(letters); + +////////////////////////// +// NEW PATTERN SEARCHER // +////////////////////////// + +struct TPatternSearcherInstance { + TPatternSearcherInstance() { + TCompactPatternSearcherBuilder<char, ui32> builder; + + for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) { + builder.Add(dataset.Patterns[patternId], patternId); + } + + TBufferOutput buffer; + builder.Save(buffer); + + Instance.Reset( + new TCompactPatternSearcher<char, ui32>( + buffer.Buffer().Data(), + buffer.Buffer().Size() + ) + ); + } + + THolder<TCompactPatternSearcher<char, ui32>> Instance; +}; + +static const TPatternSearcherInstance patternSearcherInstance; + +Y_CPU_BENCHMARK(PatternSearcher, iface) { + TVector<TVector<std::pair<ui32, ui32>>> result; + for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) { + result.emplace_back(); + TString testString = dataset.GetSample(iteration); + auto matches = patternSearcherInstance.Instance->SearchMatches(testString); + for (auto& match : matches) { + result.back().emplace_back(match.End, match.Data); + } + } +} + +////////////////////// +// OLD AHO CORASICK // +////////////////////// + +struct TAhoCorasickInstance { + TAhoCorasickInstance() { + TAhoCorasickBuilder<TString, ui32> builder; + + for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) { + builder.AddString(dataset.Patterns[patternId], patternId); + } + + TBufferOutput buffer; + builder.SaveToStream(&buffer); + + Instance.Reset(new TDefaultMappedAhoCorasick(TBlob::FromBuffer(buffer.Buffer()))); + }; + + THolder<TDefaultMappedAhoCorasick> Instance; +}; + +static const TAhoCorasickInstance ahoCorasickInstance; + +Y_CPU_BENCHMARK(AhoCorasick, iface) { + TVector<TDeque<std::pair<ui32, ui32>>> result; + for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) { + result.emplace_back(); + TString testString = dataset.GetSample(iteration); + auto matches = ahoCorasickInstance.Instance->AhoSearch(testString); + result.push_back(matches); + } +} + +//////////////////////////////// +// COMPTRIE + SIMPLE MATCHING // +//////////////////////////////// + +struct TCompactTrieInstance { + TCompactTrieInstance() { + TCompactTrieBuilder<char, ui32> builder; + + for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) { + builder.Add(dataset.Patterns[patternId], patternId); + } + + + TBufferOutput buffer; + CompactTrieMinimizeAndMakeFastLayout(buffer, builder); + + Instance.Reset(new TCompactTrie<char, ui32>( + buffer.Buffer().Data(), + buffer.Buffer().Size() + )); + } + + THolder<TCompactTrie<char, ui32>> Instance; +}; + +static const TCompactTrieInstance compactTrieInstance; + +Y_CPU_BENCHMARK(ComptrieSimple, iface) { + TVector<TVector<std::pair<ui32, ui32>>> result; + for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) { + result.emplace_back(); + TString testString = dataset.GetSample(iteration); + for (ui32 startPos = 0; startPos < testString.size(); ++startPos) { + TSearchIterator<TCompactTrie<char, ui32>> iter(*(compactTrieInstance.Instance)); + for (ui32 position = startPos; position < testString.size(); ++position) { + if (!iter.Advance(testString[position])) { + break; + } + ui32 answer; + if (iter.GetValue(&answer)) { + result.back().emplace_back(position, answer); + } + } + } + } +} + +//////////////// +// DENSE_HASH // +//////////////// + +struct TDenseHashInstance { + TDenseHashInstance() { + for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) { + Instance[dataset.Patterns[patternId]] = patternId; + } + } + + TDenseHash<TString, ui32> Instance; +}; + +static const TDenseHashInstance denseHashInstance; + +Y_CPU_BENCHMARK(DenseHash, iface) { + TVector<TVector<std::pair<ui32, ui32>>> result; + for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) { + result.emplace_back(); + TString testString = dataset.GetSample(iteration); + for (size_t start = 0; start < testString.size(); ++start) { + for ( + size_t length = 1; + length <= MAX_PATTERN_LENGTH && start + length <= testString.size(); + ++length + ) { + auto value = denseHashInstance.Instance.find(testString.substr(start, length)); + if (value != denseHashInstance.Instance.end()) { + result.back().emplace_back(start + length - 1, value->second); + } + } + } + } +} diff --git a/library/cpp/containers/comptrie/benchmark/ya.make b/library/cpp/containers/comptrie/benchmark/ya.make index 16fa19530d..b0a1fc5b92 100644 --- a/library/cpp/containers/comptrie/benchmark/ya.make +++ b/library/cpp/containers/comptrie/benchmark/ya.make @@ -1,14 +1,14 @@ Y_BENCHMARK() - -OWNER(smirnovpavel) - -SRCS( - main.cpp -) - -PEERDIR( + +OWNER(smirnovpavel) + +SRCS( + main.cpp +) + +PEERDIR( library/cpp/containers/comptrie - util -) - -END() + util +) + +END() diff --git a/library/cpp/containers/comptrie/comptrie_builder.inl b/library/cpp/containers/comptrie/comptrie_builder.inl index f273fa6571..6c5b142f8d 100644 --- a/library/cpp/containers/comptrie/comptrie_builder.inl +++ b/library/cpp/containers/comptrie/comptrie_builder.inl @@ -60,15 +60,15 @@ protected: size_t NodeMeasureLeafValue(TNode* thiz) const; ui64 NodeSaveLeafValue(TNode* thiz, IOutputStream& os) const; - virtual ui64 ArcMeasure(const TArc* thiz, size_t leftsize, size_t rightsize) const; + virtual ui64 ArcMeasure(const TArc* thiz, size_t leftsize, size_t rightsize) const; - virtual ui64 ArcSaveSelf(const TArc* thiz, IOutputStream& os) const; + virtual ui64 ArcSaveSelf(const TArc* thiz, IOutputStream& os) const; ui64 ArcSave(const TArc* thiz, IOutputStream& os) const; ui64 ArcSaveAndDestroy(const TArc* thiz, IOutputStream& os); public: TCompactTrieBuilderImpl(TCompactTrieBuilderFlags flags, TPacker packer, IAllocator* alloc); - virtual ~TCompactTrieBuilderImpl(); + virtual ~TCompactTrieBuilderImpl(); void DestroyNode(TNode* node); void NodeReleasePayload(TNode* thiz); diff --git a/library/cpp/containers/comptrie/comptrie_ut.cpp b/library/cpp/containers/comptrie/comptrie_ut.cpp index 74bee09b5d..5b7b33e8c4 100644 --- a/library/cpp/containers/comptrie/comptrie_ut.cpp +++ b/library/cpp/containers/comptrie/comptrie_ut.cpp @@ -15,7 +15,7 @@ #include <util/folder/dirut.h> #include <util/random/random.h> -#include <util/random/fast.h> +#include <util/random/fast.h> #include <util/string/hex.h> #include <util/string/cast.h> @@ -24,7 +24,7 @@ #include "set.h" #include "first_symbol_iterator.h" #include "search_iterator.h" -#include "pattern_searcher.h" +#include "pattern_searcher.h" #include <array> #include <iterator> @@ -109,10 +109,10 @@ private: UNIT_TEST(TestBuilderFindLongestPrefix); UNIT_TEST(TestBuilderFindLongestPrefixWithEmptyValue); - UNIT_TEST(TestPatternSearcherEmpty); - UNIT_TEST(TestPatternSearcherSimple); - UNIT_TEST(TestPatternSearcherRandom); - + UNIT_TEST(TestPatternSearcherEmpty); + UNIT_TEST(TestPatternSearcherSimple); + UNIT_TEST(TestPatternSearcherRandom); + UNIT_TEST_SUITE_END(); static const char* SampleData[]; @@ -246,21 +246,21 @@ public: void TestBuilderFindLongestPrefix(); void TestBuilderFindLongestPrefix(size_t keysCount, double branchProbability, bool isPrefixGrouped, bool hasEmptyKey); void TestBuilderFindLongestPrefixWithEmptyValue(); - - void TestPatternSearcherOnDataset( - const TVector<TString>& patterns, - const TVector<TString>& samples - ); - void TestPatternSearcherEmpty(); - void TestPatternSearcherSimple(); - void TestPatternSearcherRandom( - size_t patternsNum, - size_t patternMaxLength, - size_t strMaxLength, - int maxChar, - TFastRng<ui64>& rng - ); - void TestPatternSearcherRandom(); + + void TestPatternSearcherOnDataset( + const TVector<TString>& patterns, + const TVector<TString>& samples + ); + void TestPatternSearcherEmpty(); + void TestPatternSearcherSimple(); + void TestPatternSearcherRandom( + size_t patternsNum, + size_t patternMaxLength, + size_t strMaxLength, + int maxChar, + TFastRng<ui64>& rng + ); + void TestPatternSearcherRandom(); }; UNIT_TEST_SUITE_REGISTRATION(TCompactTrieTest); @@ -1580,212 +1580,212 @@ void TCompactTrieTest::TestBuilderFindLongestPrefixWithEmptyValue() { TBufferOutput buffer; builder.Save(buffer); } - -void TCompactTrieTest::TestPatternSearcherEmpty() { - TCompactPatternSearcherBuilder<char, ui32> builder; - - TBufferOutput searcherData; - builder.Save(searcherData); - - TCompactPatternSearcher<char, ui32> searcher( - searcherData.Buffer().Data(), - searcherData.Buffer().Size() - ); - - UNIT_ASSERT(searcher.SearchMatches("a").empty()); - UNIT_ASSERT(searcher.SearchMatches("").empty()); - UNIT_ASSERT(searcher.SearchMatches("abc").empty()); -} - -void TCompactTrieTest::TestPatternSearcherOnDataset( - const TVector<TString>& patterns, - const TVector<TString>& samples -) { - TCompactPatternSearcherBuilder<char, ui32> builder; - - for (size_t patternIdx = 0; patternIdx < patterns.size(); ++patternIdx) { - builder.Add(patterns[patternIdx], patternIdx); - } - - TBufferOutput searcherData; - builder.Save(searcherData); - - TCompactPatternSearcher<char, ui32> searcher( - searcherData.Buffer().Data(), - searcherData.Buffer().Size() - ); - - for (const auto& sample : samples) { - const auto matches = searcher.SearchMatches(sample); - - size_t matchesNum = 0; - THashSet<TString> processedPatterns; - for (const auto& pattern : patterns) { - if (pattern.Empty() || processedPatterns.contains(pattern)) { - continue; - } - for (size_t start = 0; start + pattern.Size() <= sample.Size(); ++start) { - matchesNum += (pattern == sample.substr(start, pattern.Size())); - } - processedPatterns.insert(pattern); - } - UNIT_ASSERT_VALUES_EQUAL(matchesNum, matches.size()); - - - TSet<std::pair<size_t, ui32>> foundMatches; - for (const auto& match : matches) { - std::pair<size_t, ui32> matchParams(match.End, match.Data); - UNIT_ASSERT(!foundMatches.contains(matchParams)); - foundMatches.insert(matchParams); - - const auto& pattern = patterns[match.Data]; - UNIT_ASSERT_VALUES_EQUAL( - sample.substr(match.End - pattern.size() + 1, pattern.size()), - pattern - ); - } - } -} - -void TCompactTrieTest::TestPatternSearcherSimple() { - TestPatternSearcherOnDataset( - { // patterns - "abcd", - "abc", - "ab", - "a", - "" - }, - { // samples - "abcde", - "abcd", - "abc", - "ab", - "a", - "" - } - ); - TestPatternSearcherOnDataset( - { // patterns - "a" - "ab", - "abcd", - }, - { // samples - "abcde", - "abcd", - "abc", - "ab", - "a", - "" - } - ); - TestPatternSearcherOnDataset( - { // patterns - "aaaa", - "aaa", - "aa", - "a", - }, - { // samples - "aaaaaaaaaaaa" - } - ); - TestPatternSearcherOnDataset( - { // patterns - "aa", "ab", "ac", "ad", "ae", "af", - "ba", "bb", "bc", "bd", "be", "bf", - "ca", "cb", "cc", "cd", "ce", "cf", - "da", "db", "dc", "dd", "de", "df", - "ea", "eb", "ec", "ed", "ee", "ef", - "fa", "fb", "fc", "fd", "fe", "ff" - }, - { // samples - "dcabafeebfdcbacddacadbaabecdbaeffecdbfabcdcabcfaefecdfebacfedacefbdcacfeb", - "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefancdefancdef", - "fedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcba", - "", - "a", "b", "c", "d", "e", "f", - "aa", "ab", "ac", "ad", "ae", "af", - "ba", "bb", "bc", "bd", "be", "bf", - "ca", "cb", "cc", "cd", "ce", "cf", - "da", "db", "dc", "dd", "de", "df", - "ea", "eb", "ec", "ed", "ee", "ef", - "fa", "fb", "fc", "fd", "fe", "ff" - } - ); -} - -static char RandChar( - TFastRng<ui64>& rng, - int maxChar -) { - return static_cast<char>(rng.GenRand() % (maxChar + 1)); -} - -static TString RandStr( - TFastRng<ui64>& rng, - size_t maxLength, - int maxChar, - bool nonEmpty = false -) { - Y_ASSERT(maxLength > 0); - - size_t length; - if (nonEmpty) { - length = rng.GenRand() % maxLength + 1; - } else { - length = rng.GenRand() % (maxLength + 1); - } - - TString result; - while (result.size() < length) { - result += RandChar(rng, maxChar); - } - - return result; -} - -void TCompactTrieTest::TestPatternSearcherRandom( - size_t patternsNum, - size_t patternMaxLength, - size_t strMaxLength, - int maxChar, - TFastRng<ui64>& rng -) { - auto patternToSearch = RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true); - - TVector<TString> patterns = {patternToSearch}; - while (patterns.size() < patternsNum) { - patterns.push_back(RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true)); - } - - auto filler = RandStr(rng, strMaxLength - patternToSearch.Size() + 1, maxChar); - size_t leftFillerSize = rng.GenRand() % (filler.size() + 1); - auto leftFiller = filler.substr(0, leftFillerSize); - auto rightFiller = filler.substr(leftFillerSize, filler.size() - leftFillerSize); - auto sample = leftFiller + patternToSearch + rightFiller; - - TestPatternSearcherOnDataset(patterns, {sample}); -} - -void TCompactTrieTest::TestPatternSearcherRandom() { - TFastRng<ui64> rng(0); - for (size_t patternMaxLen : {1, 2, 10}) { - for (size_t strMaxLen : TVector<size_t>{patternMaxLen, 2 * patternMaxLen, 10}) { - for (int maxChar : {0, 1, 5, 255}) { - for (size_t patternsNum : {1, 10}) { - for (size_t testIdx = 0; testIdx < 3; ++testIdx) { - TestPatternSearcherRandom( - patternsNum, - patternMaxLen, - strMaxLen, - maxChar, - rng - ); - } - } - } - } - } -} + +void TCompactTrieTest::TestPatternSearcherEmpty() { + TCompactPatternSearcherBuilder<char, ui32> builder; + + TBufferOutput searcherData; + builder.Save(searcherData); + + TCompactPatternSearcher<char, ui32> searcher( + searcherData.Buffer().Data(), + searcherData.Buffer().Size() + ); + + UNIT_ASSERT(searcher.SearchMatches("a").empty()); + UNIT_ASSERT(searcher.SearchMatches("").empty()); + UNIT_ASSERT(searcher.SearchMatches("abc").empty()); +} + +void TCompactTrieTest::TestPatternSearcherOnDataset( + const TVector<TString>& patterns, + const TVector<TString>& samples +) { + TCompactPatternSearcherBuilder<char, ui32> builder; + + for (size_t patternIdx = 0; patternIdx < patterns.size(); ++patternIdx) { + builder.Add(patterns[patternIdx], patternIdx); + } + + TBufferOutput searcherData; + builder.Save(searcherData); + + TCompactPatternSearcher<char, ui32> searcher( + searcherData.Buffer().Data(), + searcherData.Buffer().Size() + ); + + for (const auto& sample : samples) { + const auto matches = searcher.SearchMatches(sample); + + size_t matchesNum = 0; + THashSet<TString> processedPatterns; + for (const auto& pattern : patterns) { + if (pattern.Empty() || processedPatterns.contains(pattern)) { + continue; + } + for (size_t start = 0; start + pattern.Size() <= sample.Size(); ++start) { + matchesNum += (pattern == sample.substr(start, pattern.Size())); + } + processedPatterns.insert(pattern); + } + UNIT_ASSERT_VALUES_EQUAL(matchesNum, matches.size()); + + + TSet<std::pair<size_t, ui32>> foundMatches; + for (const auto& match : matches) { + std::pair<size_t, ui32> matchParams(match.End, match.Data); + UNIT_ASSERT(!foundMatches.contains(matchParams)); + foundMatches.insert(matchParams); + + const auto& pattern = patterns[match.Data]; + UNIT_ASSERT_VALUES_EQUAL( + sample.substr(match.End - pattern.size() + 1, pattern.size()), + pattern + ); + } + } +} + +void TCompactTrieTest::TestPatternSearcherSimple() { + TestPatternSearcherOnDataset( + { // patterns + "abcd", + "abc", + "ab", + "a", + "" + }, + { // samples + "abcde", + "abcd", + "abc", + "ab", + "a", + "" + } + ); + TestPatternSearcherOnDataset( + { // patterns + "a" + "ab", + "abcd", + }, + { // samples + "abcde", + "abcd", + "abc", + "ab", + "a", + "" + } + ); + TestPatternSearcherOnDataset( + { // patterns + "aaaa", + "aaa", + "aa", + "a", + }, + { // samples + "aaaaaaaaaaaa" + } + ); + TestPatternSearcherOnDataset( + { // patterns + "aa", "ab", "ac", "ad", "ae", "af", + "ba", "bb", "bc", "bd", "be", "bf", + "ca", "cb", "cc", "cd", "ce", "cf", + "da", "db", "dc", "dd", "de", "df", + "ea", "eb", "ec", "ed", "ee", "ef", + "fa", "fb", "fc", "fd", "fe", "ff" + }, + { // samples + "dcabafeebfdcbacddacadbaabecdbaeffecdbfabcdcabcfaefecdfebacfedacefbdcacfeb", + "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefancdefancdef", + "fedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcba", + "", + "a", "b", "c", "d", "e", "f", + "aa", "ab", "ac", "ad", "ae", "af", + "ba", "bb", "bc", "bd", "be", "bf", + "ca", "cb", "cc", "cd", "ce", "cf", + "da", "db", "dc", "dd", "de", "df", + "ea", "eb", "ec", "ed", "ee", "ef", + "fa", "fb", "fc", "fd", "fe", "ff" + } + ); +} + +static char RandChar( + TFastRng<ui64>& rng, + int maxChar +) { + return static_cast<char>(rng.GenRand() % (maxChar + 1)); +} + +static TString RandStr( + TFastRng<ui64>& rng, + size_t maxLength, + int maxChar, + bool nonEmpty = false +) { + Y_ASSERT(maxLength > 0); + + size_t length; + if (nonEmpty) { + length = rng.GenRand() % maxLength + 1; + } else { + length = rng.GenRand() % (maxLength + 1); + } + + TString result; + while (result.size() < length) { + result += RandChar(rng, maxChar); + } + + return result; +} + +void TCompactTrieTest::TestPatternSearcherRandom( + size_t patternsNum, + size_t patternMaxLength, + size_t strMaxLength, + int maxChar, + TFastRng<ui64>& rng +) { + auto patternToSearch = RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true); + + TVector<TString> patterns = {patternToSearch}; + while (patterns.size() < patternsNum) { + patterns.push_back(RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true)); + } + + auto filler = RandStr(rng, strMaxLength - patternToSearch.Size() + 1, maxChar); + size_t leftFillerSize = rng.GenRand() % (filler.size() + 1); + auto leftFiller = filler.substr(0, leftFillerSize); + auto rightFiller = filler.substr(leftFillerSize, filler.size() - leftFillerSize); + auto sample = leftFiller + patternToSearch + rightFiller; + + TestPatternSearcherOnDataset(patterns, {sample}); +} + +void TCompactTrieTest::TestPatternSearcherRandom() { + TFastRng<ui64> rng(0); + for (size_t patternMaxLen : {1, 2, 10}) { + for (size_t strMaxLen : TVector<size_t>{patternMaxLen, 2 * patternMaxLen, 10}) { + for (int maxChar : {0, 1, 5, 255}) { + for (size_t patternsNum : {1, 10}) { + for (size_t testIdx = 0; testIdx < 3; ++testIdx) { + TestPatternSearcherRandom( + patternsNum, + patternMaxLen, + strMaxLen, + maxChar, + rng + ); + } + } + } + } + } +} diff --git a/library/cpp/containers/comptrie/pattern_searcher.h b/library/cpp/containers/comptrie/pattern_searcher.h index caab51dc1c..d0a0ca37a8 100644 --- a/library/cpp/containers/comptrie/pattern_searcher.h +++ b/library/cpp/containers/comptrie/pattern_searcher.h @@ -1,606 +1,606 @@ -#pragma once - +#pragma once + #include "comptrie_builder.h" #include "comptrie_trie.h" #include "comptrie_impl.h" #include <library/cpp/packers/packers.h> - -#include <util/system/yassert.h> -#include <util/generic/vector.h> -#include <util/generic/deque.h> -#include <util/stream/str.h> - -// Aho-Corasick algorithm implementation using CompactTrie implementation of Sedgewick's T-trie - -namespace NCompactTrie { - struct TSuffixLink { - ui64 NextSuffixOffset; - ui64 NextSuffixWithDataOffset; - - TSuffixLink(ui64 nextSuffixOffset = 0, ui64 nextSuffixWithDataOffset = 0) - : NextSuffixOffset(nextSuffixOffset) - , NextSuffixWithDataOffset(nextSuffixWithDataOffset) - { - } - }; - - const size_t FLAGS_SIZE = sizeof(char); - const size_t SYMBOL_SIZE = sizeof(char); -}; - -template <class T = char, class D = ui64, class S = TCompactTriePacker<D>> -class TCompactPatternSearcherBuilder : protected TCompactTrieBuilder<T, D, S> { -public: - typedef T TSymbol; - typedef D TData; - typedef S TPacker; - - typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey; - typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf; - - typedef TCompactTrieBuilder<T, D, S> TBase; - -public: - TCompactPatternSearcherBuilder() { + +#include <util/system/yassert.h> +#include <util/generic/vector.h> +#include <util/generic/deque.h> +#include <util/stream/str.h> + +// Aho-Corasick algorithm implementation using CompactTrie implementation of Sedgewick's T-trie + +namespace NCompactTrie { + struct TSuffixLink { + ui64 NextSuffixOffset; + ui64 NextSuffixWithDataOffset; + + TSuffixLink(ui64 nextSuffixOffset = 0, ui64 nextSuffixWithDataOffset = 0) + : NextSuffixOffset(nextSuffixOffset) + , NextSuffixWithDataOffset(nextSuffixWithDataOffset) + { + } + }; + + const size_t FLAGS_SIZE = sizeof(char); + const size_t SYMBOL_SIZE = sizeof(char); +}; + +template <class T = char, class D = ui64, class S = TCompactTriePacker<D>> +class TCompactPatternSearcherBuilder : protected TCompactTrieBuilder<T, D, S> { +public: + typedef T TSymbol; + typedef D TData; + typedef S TPacker; + + typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey; + typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf; + + typedef TCompactTrieBuilder<T, D, S> TBase; + +public: + TCompactPatternSearcherBuilder() { TBase::Impl = MakeHolder<TCompactPatternSearcherBuilderImpl>(); - } - - bool Add(const TSymbol* key, size_t keyLength, const TData& value) { - return TBase::Impl->AddEntry(key, keyLength, value); - } - bool Add(const TKeyBuf& key, const TData& value) { - return Add(key.data(), key.size(), value); - } - - bool Find(const TSymbol* key, size_t keyLength, TData* value) const { - return TBase::Impl->FindEntry(key, keyLength, value); - } - bool Find(const TKeyBuf& key, TData* value = nullptr) const { - return Find(key.data(), key.size(), value); - } - - size_t Save(IOutputStream& os) const { - size_t trieSize = TBase::Impl->MeasureByteSize(); - TBufferOutput serializedTrie(trieSize); - TBase::Impl->Save(serializedTrie); - - auto serializedTrieBuffer = serializedTrie.Buffer(); - CalculateSuffixLinks( - serializedTrieBuffer.Data(), - serializedTrieBuffer.Data() + serializedTrieBuffer.Size() - ); - - os.Write(serializedTrieBuffer.Data(), serializedTrieBuffer.Size()); - return trieSize; - } - - TBlob Save() const { - TBufferStream buffer; - Save(buffer); - return TBlob::FromStream(buffer); - } - - size_t SaveToFile(const TString& fileName) const { - TFileOutput out(fileName); - return Save(out); - } - - size_t MeasureByteSize() const { - return TBase::Impl->MeasureByteSize(); - } - -private: - void CalculateSuffixLinks(char* trieStart, const char* trieEnd) const; - -protected: - class TCompactPatternSearcherBuilderImpl : public TBase::TCompactTrieBuilderImpl { - public: - typedef typename TBase::TCompactTrieBuilderImpl TImplBase; - - TCompactPatternSearcherBuilderImpl( - TCompactTrieBuilderFlags flags = CTBF_NONE, - TPacker packer = TPacker(), - IAllocator* alloc = TDefaultAllocator::Instance() - ) : TImplBase(flags, packer, alloc) { - } - - ui64 ArcMeasure( - const typename TImplBase::TArc* arc, - size_t leftSize, - size_t rightSize - ) const override { - using namespace NCompactTrie; - - size_t coreSize = SYMBOL_SIZE + FLAGS_SIZE + - sizeof(TSuffixLink) + - this->NodeMeasureLeafValue(arc->Node); - size_t treeSize = this->NodeMeasureSubtree(arc->Node); - - if (arc->Label.Length() > 0) - treeSize += (SYMBOL_SIZE + FLAGS_SIZE + sizeof(TSuffixLink)) * - (arc->Label.Length() - 1); - - // Triple measurements are needed because the space needed to store the offset - // shall be added to the offset itself. Hence three iterations. - size_t leftOffsetSize = 0; - size_t rightOffsetSize = 0; - for (size_t iteration = 0; iteration < 3; ++iteration) { - leftOffsetSize = leftSize ? MeasureOffset( - coreSize + treeSize + leftOffsetSize + rightOffsetSize) : 0; - rightOffsetSize = rightSize ? MeasureOffset( - coreSize + treeSize + leftSize + leftOffsetSize + rightOffsetSize) : 0; - } - - coreSize += leftOffsetSize + rightOffsetSize; - arc->LeftOffset = leftSize ? coreSize + treeSize : 0; - arc->RightOffset = rightSize ? coreSize + treeSize + leftSize : 0; - - return coreSize + treeSize + leftSize + rightSize; - } - - ui64 ArcSaveSelf(const typename TImplBase::TArc* arc, IOutputStream& os) const override { - using namespace NCompactTrie; - - ui64 written = 0; - - size_t leftOffsetSize = MeasureOffset(arc->LeftOffset); - size_t rightOffsetSize = MeasureOffset(arc->RightOffset); - - size_t labelLen = arc->Label.Length(); - - for (size_t labelPos = 0; labelPos < labelLen; ++labelPos) { - char flags = 0; - - if (labelPos == 0) { - flags |= (leftOffsetSize << MT_LEFTSHIFT); - flags |= (rightOffsetSize << MT_RIGHTSHIFT); - } - - if (labelPos == labelLen - 1) { - if (arc->Node->IsFinal()) - flags |= MT_FINAL; - if (!arc->Node->IsLast()) - flags |= MT_NEXT; - } else { - flags |= MT_NEXT; - } - - os.Write(&flags, 1); - os.Write(&arc->Label.AsCharPtr()[labelPos], 1); - written += 2; - - TSuffixLink suffixlink; - os.Write(&suffixlink, sizeof(TSuffixLink)); - written += sizeof(TSuffixLink); - - if (labelPos == 0) { - written += ArcSaveOffset(arc->LeftOffset, os); - written += ArcSaveOffset(arc->RightOffset, os); - } - } - - written += this->NodeSaveLeafValue(arc->Node, os); - return written; - } - }; -}; - - -template <class T> -struct TPatternMatch { - ui64 End; - T Data; - - TPatternMatch(ui64 end, const T& data) - : End(end) - , Data(data) - { - } -}; - - -template <class T = char, class D = ui64, class S = TCompactTriePacker<D>> -class TCompactPatternSearcher { -public: - typedef T TSymbol; - typedef D TData; - typedef S TPacker; - - typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey; - typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf; - - typedef TCompactTrie<TSymbol, TData, TPacker> TTrie; -public: - TCompactPatternSearcher() - { - } - - explicit TCompactPatternSearcher(const TBlob& data) - : Trie(data) - { - } - - TCompactPatternSearcher(const char* data, size_t size) - : Trie(data, size) - { - } - - TVector<TPatternMatch<TData>> SearchMatches(const TSymbol* text, size_t textSize) const; - TVector<TPatternMatch<TData>> SearchMatches(const TKeyBuf& text) const { - return SearchMatches(text.data(), text.size()); - } -private: - TTrie Trie; -}; - -//////////////////// -// Implementation // -//////////////////// - -namespace { - -template <class TData, class TPacker> -char ReadNode( - char* nodeStart, - char*& leftSibling, - char*& rightSibling, - char*& directChild, - NCompactTrie::TSuffixLink*& suffixLink, - TPacker packer = TPacker() -) { - char* dataPos = nodeStart; - char flags = *(dataPos++); - - Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed - - char label = *(dataPos++); - - suffixLink = (NCompactTrie::TSuffixLink*)dataPos; - dataPos += sizeof(NCompactTrie::TSuffixLink); - - { // Left branch - size_t offsetLength = NCompactTrie::LeftOffsetLen(flags); - size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength); - leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr; - - dataPos += offsetLength; - } - - - { // Right branch - size_t offsetLength = NCompactTrie::RightOffsetLen(flags); - size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength); - rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr; - - dataPos += offsetLength; - } - - directChild = nullptr; - if (flags & NCompactTrie::MT_NEXT) { - directChild = dataPos; - if (flags & NCompactTrie::MT_FINAL) { - directChild += packer.SkipLeaf(directChild); - } - } - - return label; -} - -template <class TData, class TPacker> -char ReadNodeConst( - const char* nodeStart, - const char*& leftSibling, - const char*& rightSibling, - const char*& directChild, - const char*& data, - NCompactTrie::TSuffixLink& suffixLink, - TPacker packer = TPacker() -) { - const char* dataPos = nodeStart; - char flags = *(dataPos++); - - Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed - - char label = *(dataPos++); - - suffixLink = *((NCompactTrie::TSuffixLink*)dataPos); - dataPos += sizeof(NCompactTrie::TSuffixLink); - - { // Left branch - size_t offsetLength = NCompactTrie::LeftOffsetLen(flags); - size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength); - leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr; - - dataPos += offsetLength; - } - - - { // Right branch - size_t offsetLength = NCompactTrie::RightOffsetLen(flags); - size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength); - rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr; - - dataPos += offsetLength; - } - - data = nullptr; - if (flags & NCompactTrie::MT_FINAL) { - data = dataPos; - } - directChild = nullptr; - if (flags & NCompactTrie::MT_NEXT) { - directChild = dataPos; - if (flags & NCompactTrie::MT_FINAL) { - directChild += packer.SkipLeaf(directChild); - } - } - - return label; -} - -Y_FORCE_INLINE bool Advance( - const char*& dataPos, - const char* const dataEnd, - char label -) { - if (dataPos == nullptr) { - return false; - } - - while (dataPos < dataEnd) { - size_t offsetLength, offset; - const char* startPos = dataPos; - char flags = *(dataPos++); - char symbol = *(dataPos++); - dataPos += sizeof(NCompactTrie::TSuffixLink); - - // Left branch - offsetLength = NCompactTrie::LeftOffsetLen(flags); - if ((unsigned char)label < (unsigned char)symbol) { - offset = NCompactTrie::UnpackOffset(dataPos, offsetLength); - if (!offset) - break; - - dataPos = startPos + offset; - continue; - } - - dataPos += offsetLength; - - // Right branch - offsetLength = NCompactTrie::RightOffsetLen(flags); - if ((unsigned char)label > (unsigned char)symbol) { - offset = NCompactTrie::UnpackOffset(dataPos, offsetLength); - if (!offset) - break; - - dataPos = startPos + offset; - continue; - } - - dataPos = startPos; - return true; - } - - // if we got here, we're past the dataend - bail out ASAP - dataPos = nullptr; - return false; -} - -} // anonymous - -template <class T, class D, class S> -void TCompactPatternSearcherBuilder<T, D, S>::CalculateSuffixLinks( - char* trieStart, - const char* trieEnd -) const { - struct TBfsElement { - char* Node; - const char* Parent; - - TBfsElement(char* node, const char* parent) - : Node(node) - , Parent(parent) - { - } - }; - - TDeque<TBfsElement> bfsQueue; - if (trieStart && trieStart != trieEnd) { - bfsQueue.emplace_back(trieStart, nullptr); - } - - while (!bfsQueue.empty()) { - auto front = bfsQueue.front(); - char* node = front.Node; - const char* parent = front.Parent; - bfsQueue.pop_front(); - - char* leftSibling; - char* rightSibling; - char* directChild; - NCompactTrie::TSuffixLink* suffixLink; - - char label = ReadNode<TData, TPacker>( - node, - leftSibling, - rightSibling, - directChild, - suffixLink - ); - - const char* suffix; - - if (parent == nullptr) { - suffix = node; - } else { - const char* parentOfSuffix = parent; - const char* temp; - do { - NCompactTrie::TSuffixLink parentOfSuffixSuffixLink; - - ReadNodeConst<TData, TPacker>( - parentOfSuffix, - /*left*/temp, - /*right*/temp, - /*direct*/temp, - /*data*/temp, - parentOfSuffixSuffixLink - ); - if (parentOfSuffixSuffixLink.NextSuffixOffset == 0) { - suffix = trieStart; - if (!Advance(suffix, trieEnd, label)) { - suffix = node; - } - break; - } - parentOfSuffix += parentOfSuffixSuffixLink.NextSuffixOffset; - - NCompactTrie::TSuffixLink tempSuffixLink; - ReadNodeConst<TData, TPacker>( - parentOfSuffix, - /*left*/temp, - /*right*/temp, - /*direct*/suffix, - /*data*/temp, - tempSuffixLink - ); - - if (suffix == nullptr) { - continue; - } - } while (!Advance(suffix, trieEnd, label)); - } - - suffixLink->NextSuffixOffset = suffix - node; - - NCompactTrie::TSuffixLink suffixSuffixLink; - const char* suffixData; - const char* temp; - ReadNodeConst<TData, TPacker>( - suffix, - /*left*/temp, - /*right*/temp, - /*direct*/temp, - suffixData, - suffixSuffixLink - ); - suffixLink->NextSuffixWithDataOffset = suffix - node; - if (suffixData == nullptr) { - suffixLink->NextSuffixWithDataOffset += suffixSuffixLink.NextSuffixWithDataOffset; - } - - if (directChild) { - bfsQueue.emplace_back(directChild, node); - } - - if (leftSibling) { - bfsQueue.emplace_front(leftSibling, parent); - } - - if (rightSibling) { - bfsQueue.emplace_front(rightSibling, parent); - } - } -} - - -template<class T, class D, class S> -TVector<TPatternMatch<D>> TCompactPatternSearcher<T, D, S>::SearchMatches( - const TSymbol* text, - size_t textSize -) const { - const char* temp; - NCompactTrie::TSuffixLink tempSuffixLink; - - const auto& trieData = Trie.Data(); - const char* trieStart = trieData.AsCharPtr(); - size_t dataSize = trieData.Length(); - const char* trieEnd = trieStart + dataSize; - - const char* lastNode = nullptr; - const char* currentSubtree = trieStart; - - TVector<TPatternMatch<TData>> matches; - - for (const TSymbol* position = text; position < text + textSize; ++position) { - TSymbol symbol = *position; - for (i64 i = (i64)NCompactTrie::ExtraBits<TSymbol>(); i >= 0; i -= 8) { - char label = (char)(symbol >> i); - - // Find first suffix extendable by label - while (true) { - const char* nextLastNode = currentSubtree; - if (Advance(nextLastNode, trieEnd, label)) { - lastNode = nextLastNode; - ReadNodeConst<TData, TPacker>( - lastNode, - /*left*/temp, - /*right*/temp, - currentSubtree, - /*data*/temp, - tempSuffixLink - ); - break; - } else { - if (lastNode == nullptr) { - break; - } - } - - NCompactTrie::TSuffixLink suffixLink; - ReadNodeConst<TData, TPacker>( - lastNode, - /*left*/temp, - /*right*/temp, - /*direct*/temp, - /*data*/temp, - suffixLink - ); - if (suffixLink.NextSuffixOffset == 0) { - lastNode = nullptr; - currentSubtree = trieStart; - continue; - } - lastNode += suffixLink.NextSuffixOffset; - ReadNodeConst<TData, TPacker>( - lastNode, - /*left*/temp, - /*right*/temp, - currentSubtree, - /*data*/temp, - tempSuffixLink - ); - } - - // Iterate through all suffixes - const char* suffix = lastNode; - while (suffix != nullptr) { - const char* nodeData; - NCompactTrie::TSuffixLink suffixLink; - ReadNodeConst<TData, TPacker>( - suffix, - /*left*/temp, - /*right*/temp, - /*direct*/temp, - nodeData, - suffixLink - ); - if (nodeData != nullptr) { - TData data; - Trie.GetPacker().UnpackLeaf(nodeData, data); - matches.emplace_back( - position - text, - data - ); - } - if (suffixLink.NextSuffixOffset == 0) { - break; - } - suffix += suffixLink.NextSuffixWithDataOffset; - } - } - } - - return matches; -} + } + + bool Add(const TSymbol* key, size_t keyLength, const TData& value) { + return TBase::Impl->AddEntry(key, keyLength, value); + } + bool Add(const TKeyBuf& key, const TData& value) { + return Add(key.data(), key.size(), value); + } + + bool Find(const TSymbol* key, size_t keyLength, TData* value) const { + return TBase::Impl->FindEntry(key, keyLength, value); + } + bool Find(const TKeyBuf& key, TData* value = nullptr) const { + return Find(key.data(), key.size(), value); + } + + size_t Save(IOutputStream& os) const { + size_t trieSize = TBase::Impl->MeasureByteSize(); + TBufferOutput serializedTrie(trieSize); + TBase::Impl->Save(serializedTrie); + + auto serializedTrieBuffer = serializedTrie.Buffer(); + CalculateSuffixLinks( + serializedTrieBuffer.Data(), + serializedTrieBuffer.Data() + serializedTrieBuffer.Size() + ); + + os.Write(serializedTrieBuffer.Data(), serializedTrieBuffer.Size()); + return trieSize; + } + + TBlob Save() const { + TBufferStream buffer; + Save(buffer); + return TBlob::FromStream(buffer); + } + + size_t SaveToFile(const TString& fileName) const { + TFileOutput out(fileName); + return Save(out); + } + + size_t MeasureByteSize() const { + return TBase::Impl->MeasureByteSize(); + } + +private: + void CalculateSuffixLinks(char* trieStart, const char* trieEnd) const; + +protected: + class TCompactPatternSearcherBuilderImpl : public TBase::TCompactTrieBuilderImpl { + public: + typedef typename TBase::TCompactTrieBuilderImpl TImplBase; + + TCompactPatternSearcherBuilderImpl( + TCompactTrieBuilderFlags flags = CTBF_NONE, + TPacker packer = TPacker(), + IAllocator* alloc = TDefaultAllocator::Instance() + ) : TImplBase(flags, packer, alloc) { + } + + ui64 ArcMeasure( + const typename TImplBase::TArc* arc, + size_t leftSize, + size_t rightSize + ) const override { + using namespace NCompactTrie; + + size_t coreSize = SYMBOL_SIZE + FLAGS_SIZE + + sizeof(TSuffixLink) + + this->NodeMeasureLeafValue(arc->Node); + size_t treeSize = this->NodeMeasureSubtree(arc->Node); + + if (arc->Label.Length() > 0) + treeSize += (SYMBOL_SIZE + FLAGS_SIZE + sizeof(TSuffixLink)) * + (arc->Label.Length() - 1); + + // Triple measurements are needed because the space needed to store the offset + // shall be added to the offset itself. Hence three iterations. + size_t leftOffsetSize = 0; + size_t rightOffsetSize = 0; + for (size_t iteration = 0; iteration < 3; ++iteration) { + leftOffsetSize = leftSize ? MeasureOffset( + coreSize + treeSize + leftOffsetSize + rightOffsetSize) : 0; + rightOffsetSize = rightSize ? MeasureOffset( + coreSize + treeSize + leftSize + leftOffsetSize + rightOffsetSize) : 0; + } + + coreSize += leftOffsetSize + rightOffsetSize; + arc->LeftOffset = leftSize ? coreSize + treeSize : 0; + arc->RightOffset = rightSize ? coreSize + treeSize + leftSize : 0; + + return coreSize + treeSize + leftSize + rightSize; + } + + ui64 ArcSaveSelf(const typename TImplBase::TArc* arc, IOutputStream& os) const override { + using namespace NCompactTrie; + + ui64 written = 0; + + size_t leftOffsetSize = MeasureOffset(arc->LeftOffset); + size_t rightOffsetSize = MeasureOffset(arc->RightOffset); + + size_t labelLen = arc->Label.Length(); + + for (size_t labelPos = 0; labelPos < labelLen; ++labelPos) { + char flags = 0; + + if (labelPos == 0) { + flags |= (leftOffsetSize << MT_LEFTSHIFT); + flags |= (rightOffsetSize << MT_RIGHTSHIFT); + } + + if (labelPos == labelLen - 1) { + if (arc->Node->IsFinal()) + flags |= MT_FINAL; + if (!arc->Node->IsLast()) + flags |= MT_NEXT; + } else { + flags |= MT_NEXT; + } + + os.Write(&flags, 1); + os.Write(&arc->Label.AsCharPtr()[labelPos], 1); + written += 2; + + TSuffixLink suffixlink; + os.Write(&suffixlink, sizeof(TSuffixLink)); + written += sizeof(TSuffixLink); + + if (labelPos == 0) { + written += ArcSaveOffset(arc->LeftOffset, os); + written += ArcSaveOffset(arc->RightOffset, os); + } + } + + written += this->NodeSaveLeafValue(arc->Node, os); + return written; + } + }; +}; + + +template <class T> +struct TPatternMatch { + ui64 End; + T Data; + + TPatternMatch(ui64 end, const T& data) + : End(end) + , Data(data) + { + } +}; + + +template <class T = char, class D = ui64, class S = TCompactTriePacker<D>> +class TCompactPatternSearcher { +public: + typedef T TSymbol; + typedef D TData; + typedef S TPacker; + + typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey; + typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf; + + typedef TCompactTrie<TSymbol, TData, TPacker> TTrie; +public: + TCompactPatternSearcher() + { + } + + explicit TCompactPatternSearcher(const TBlob& data) + : Trie(data) + { + } + + TCompactPatternSearcher(const char* data, size_t size) + : Trie(data, size) + { + } + + TVector<TPatternMatch<TData>> SearchMatches(const TSymbol* text, size_t textSize) const; + TVector<TPatternMatch<TData>> SearchMatches(const TKeyBuf& text) const { + return SearchMatches(text.data(), text.size()); + } +private: + TTrie Trie; +}; + +//////////////////// +// Implementation // +//////////////////// + +namespace { + +template <class TData, class TPacker> +char ReadNode( + char* nodeStart, + char*& leftSibling, + char*& rightSibling, + char*& directChild, + NCompactTrie::TSuffixLink*& suffixLink, + TPacker packer = TPacker() +) { + char* dataPos = nodeStart; + char flags = *(dataPos++); + + Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed + + char label = *(dataPos++); + + suffixLink = (NCompactTrie::TSuffixLink*)dataPos; + dataPos += sizeof(NCompactTrie::TSuffixLink); + + { // Left branch + size_t offsetLength = NCompactTrie::LeftOffsetLen(flags); + size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength); + leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr; + + dataPos += offsetLength; + } + + + { // Right branch + size_t offsetLength = NCompactTrie::RightOffsetLen(flags); + size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength); + rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr; + + dataPos += offsetLength; + } + + directChild = nullptr; + if (flags & NCompactTrie::MT_NEXT) { + directChild = dataPos; + if (flags & NCompactTrie::MT_FINAL) { + directChild += packer.SkipLeaf(directChild); + } + } + + return label; +} + +template <class TData, class TPacker> +char ReadNodeConst( + const char* nodeStart, + const char*& leftSibling, + const char*& rightSibling, + const char*& directChild, + const char*& data, + NCompactTrie::TSuffixLink& suffixLink, + TPacker packer = TPacker() +) { + const char* dataPos = nodeStart; + char flags = *(dataPos++); + + Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed + + char label = *(dataPos++); + + suffixLink = *((NCompactTrie::TSuffixLink*)dataPos); + dataPos += sizeof(NCompactTrie::TSuffixLink); + + { // Left branch + size_t offsetLength = NCompactTrie::LeftOffsetLen(flags); + size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength); + leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr; + + dataPos += offsetLength; + } + + + { // Right branch + size_t offsetLength = NCompactTrie::RightOffsetLen(flags); + size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength); + rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr; + + dataPos += offsetLength; + } + + data = nullptr; + if (flags & NCompactTrie::MT_FINAL) { + data = dataPos; + } + directChild = nullptr; + if (flags & NCompactTrie::MT_NEXT) { + directChild = dataPos; + if (flags & NCompactTrie::MT_FINAL) { + directChild += packer.SkipLeaf(directChild); + } + } + + return label; +} + +Y_FORCE_INLINE bool Advance( + const char*& dataPos, + const char* const dataEnd, + char label +) { + if (dataPos == nullptr) { + return false; + } + + while (dataPos < dataEnd) { + size_t offsetLength, offset; + const char* startPos = dataPos; + char flags = *(dataPos++); + char symbol = *(dataPos++); + dataPos += sizeof(NCompactTrie::TSuffixLink); + + // Left branch + offsetLength = NCompactTrie::LeftOffsetLen(flags); + if ((unsigned char)label < (unsigned char)symbol) { + offset = NCompactTrie::UnpackOffset(dataPos, offsetLength); + if (!offset) + break; + + dataPos = startPos + offset; + continue; + } + + dataPos += offsetLength; + + // Right branch + offsetLength = NCompactTrie::RightOffsetLen(flags); + if ((unsigned char)label > (unsigned char)symbol) { + offset = NCompactTrie::UnpackOffset(dataPos, offsetLength); + if (!offset) + break; + + dataPos = startPos + offset; + continue; + } + + dataPos = startPos; + return true; + } + + // if we got here, we're past the dataend - bail out ASAP + dataPos = nullptr; + return false; +} + +} // anonymous + +template <class T, class D, class S> +void TCompactPatternSearcherBuilder<T, D, S>::CalculateSuffixLinks( + char* trieStart, + const char* trieEnd +) const { + struct TBfsElement { + char* Node; + const char* Parent; + + TBfsElement(char* node, const char* parent) + : Node(node) + , Parent(parent) + { + } + }; + + TDeque<TBfsElement> bfsQueue; + if (trieStart && trieStart != trieEnd) { + bfsQueue.emplace_back(trieStart, nullptr); + } + + while (!bfsQueue.empty()) { + auto front = bfsQueue.front(); + char* node = front.Node; + const char* parent = front.Parent; + bfsQueue.pop_front(); + + char* leftSibling; + char* rightSibling; + char* directChild; + NCompactTrie::TSuffixLink* suffixLink; + + char label = ReadNode<TData, TPacker>( + node, + leftSibling, + rightSibling, + directChild, + suffixLink + ); + + const char* suffix; + + if (parent == nullptr) { + suffix = node; + } else { + const char* parentOfSuffix = parent; + const char* temp; + do { + NCompactTrie::TSuffixLink parentOfSuffixSuffixLink; + + ReadNodeConst<TData, TPacker>( + parentOfSuffix, + /*left*/temp, + /*right*/temp, + /*direct*/temp, + /*data*/temp, + parentOfSuffixSuffixLink + ); + if (parentOfSuffixSuffixLink.NextSuffixOffset == 0) { + suffix = trieStart; + if (!Advance(suffix, trieEnd, label)) { + suffix = node; + } + break; + } + parentOfSuffix += parentOfSuffixSuffixLink.NextSuffixOffset; + + NCompactTrie::TSuffixLink tempSuffixLink; + ReadNodeConst<TData, TPacker>( + parentOfSuffix, + /*left*/temp, + /*right*/temp, + /*direct*/suffix, + /*data*/temp, + tempSuffixLink + ); + + if (suffix == nullptr) { + continue; + } + } while (!Advance(suffix, trieEnd, label)); + } + + suffixLink->NextSuffixOffset = suffix - node; + + NCompactTrie::TSuffixLink suffixSuffixLink; + const char* suffixData; + const char* temp; + ReadNodeConst<TData, TPacker>( + suffix, + /*left*/temp, + /*right*/temp, + /*direct*/temp, + suffixData, + suffixSuffixLink + ); + suffixLink->NextSuffixWithDataOffset = suffix - node; + if (suffixData == nullptr) { + suffixLink->NextSuffixWithDataOffset += suffixSuffixLink.NextSuffixWithDataOffset; + } + + if (directChild) { + bfsQueue.emplace_back(directChild, node); + } + + if (leftSibling) { + bfsQueue.emplace_front(leftSibling, parent); + } + + if (rightSibling) { + bfsQueue.emplace_front(rightSibling, parent); + } + } +} + + +template<class T, class D, class S> +TVector<TPatternMatch<D>> TCompactPatternSearcher<T, D, S>::SearchMatches( + const TSymbol* text, + size_t textSize +) const { + const char* temp; + NCompactTrie::TSuffixLink tempSuffixLink; + + const auto& trieData = Trie.Data(); + const char* trieStart = trieData.AsCharPtr(); + size_t dataSize = trieData.Length(); + const char* trieEnd = trieStart + dataSize; + + const char* lastNode = nullptr; + const char* currentSubtree = trieStart; + + TVector<TPatternMatch<TData>> matches; + + for (const TSymbol* position = text; position < text + textSize; ++position) { + TSymbol symbol = *position; + for (i64 i = (i64)NCompactTrie::ExtraBits<TSymbol>(); i >= 0; i -= 8) { + char label = (char)(symbol >> i); + + // Find first suffix extendable by label + while (true) { + const char* nextLastNode = currentSubtree; + if (Advance(nextLastNode, trieEnd, label)) { + lastNode = nextLastNode; + ReadNodeConst<TData, TPacker>( + lastNode, + /*left*/temp, + /*right*/temp, + currentSubtree, + /*data*/temp, + tempSuffixLink + ); + break; + } else { + if (lastNode == nullptr) { + break; + } + } + + NCompactTrie::TSuffixLink suffixLink; + ReadNodeConst<TData, TPacker>( + lastNode, + /*left*/temp, + /*right*/temp, + /*direct*/temp, + /*data*/temp, + suffixLink + ); + if (suffixLink.NextSuffixOffset == 0) { + lastNode = nullptr; + currentSubtree = trieStart; + continue; + } + lastNode += suffixLink.NextSuffixOffset; + ReadNodeConst<TData, TPacker>( + lastNode, + /*left*/temp, + /*right*/temp, + currentSubtree, + /*data*/temp, + tempSuffixLink + ); + } + + // Iterate through all suffixes + const char* suffix = lastNode; + while (suffix != nullptr) { + const char* nodeData; + NCompactTrie::TSuffixLink suffixLink; + ReadNodeConst<TData, TPacker>( + suffix, + /*left*/temp, + /*right*/temp, + /*direct*/temp, + nodeData, + suffixLink + ); + if (nodeData != nullptr) { + TData data; + Trie.GetPacker().UnpackLeaf(nodeData, data); + matches.emplace_back( + position - text, + data + ); + } + if (suffixLink.NextSuffixOffset == 0) { + break; + } + suffix += suffixLink.NextSuffixWithDataOffset; + } + } + } + + return matches; +} diff --git a/library/cpp/containers/comptrie/search_iterator.h b/library/cpp/containers/comptrie/search_iterator.h index 247f7e5936..90a831c268 100644 --- a/library/cpp/containers/comptrie/search_iterator.h +++ b/library/cpp/containers/comptrie/search_iterator.h @@ -1,12 +1,12 @@ #pragma once #include "comptrie_trie.h" -#include "first_symbol_iterator.h" - -#include <util/str_stl.h> -#include <util/digest/numeric.h> -#include <util/digest/multi.h> +#include "first_symbol_iterator.h" +#include <util/str_stl.h> +#include <util/digest/numeric.h> +#include <util/digest/multi.h> + // Iterator for incremental searching. // All Advance() methods shift the iterator using specifed key/char. // The subsequent Advance() call starts searching from the previous state. @@ -23,26 +23,26 @@ public: using TSymbol = typename TTrie::TSymbol; using TKeyBuf = typename TTrie::TKeyBuf; - TSearchIterator() = default; - + TSearchIterator() = default; + explicit TSearchIterator(const TTrie& trie) : Trie(&trie) , DataPos(trie.DataHolder.AsCharPtr()) , DataEnd(DataPos + trie.DataHolder.Length()) - , ValuePos(trie.EmptyValue) - { - } - - explicit TSearchIterator(const TTrie& trie, const TTrie& subTrie) - : Trie(&trie) - , DataPos(subTrie.Data().AsCharPtr()) - , DataEnd(trie.DataHolder.AsCharPtr() + trie.DataHolder.Length()) - , ValuePos(subTrie.EmptyValue) + , ValuePos(trie.EmptyValue) { } + explicit TSearchIterator(const TTrie& trie, const TTrie& subTrie) + : Trie(&trie) + , DataPos(subTrie.Data().AsCharPtr()) + , DataEnd(trie.DataHolder.AsCharPtr() + trie.DataHolder.Length()) + , ValuePos(subTrie.EmptyValue) + { + } + bool operator==(const TSearchIterator& other) const { - Y_ASSERT(Trie && other.Trie); + Y_ASSERT(Trie && other.Trie); return Trie == other.Trie && DataPos == other.DataPos && DataEnd == other.DataEnd && @@ -53,7 +53,7 @@ public: } inline bool Advance(TSymbol label) { - Y_ASSERT(Trie); + Y_ASSERT(Trie); if (DataPos == nullptr || DataPos >= DataEnd) { return false; } @@ -64,14 +64,14 @@ public: } bool Advance(const TSymbol* key, size_t keylen); bool GetValue(TData* value = nullptr) const; - bool HasValue() const; - inline size_t GetHash() const; + bool HasValue() const; + inline size_t GetHash() const; private: - const TTrie* Trie = nullptr; - const char* DataPos = nullptr; - const char* DataEnd = nullptr; - const char* ValuePos = nullptr; + const TTrie* Trie = nullptr; + const char* DataPos = nullptr; + const char* DataEnd = nullptr; + const char* ValuePos = nullptr; }; template <class TTrie> @@ -79,18 +79,18 @@ inline TSearchIterator<TTrie> MakeSearchIterator(const TTrie& trie) { return TSearchIterator<TTrie>(trie); } -template <class TTrie> -struct THash<TSearchIterator<TTrie>> { - inline size_t operator()(const TSearchIterator<TTrie>& item) { - return item.GetHash(); - } -}; - +template <class TTrie> +struct THash<TSearchIterator<TTrie>> { + inline size_t operator()(const TSearchIterator<TTrie>& item) { + return item.GetHash(); + } +}; + //---------------------------------------------------------------------------- template <class TTrie> bool TSearchIterator<TTrie>::Advance(const TSymbol* key, size_t keylen) { - Y_ASSERT(Trie); + Y_ASSERT(Trie); if (!key || DataPos == nullptr || DataPos >= DataEnd) { return false; } @@ -112,7 +112,7 @@ bool TSearchIterator<TTrie>::Advance(const TSymbol* key, size_t keylen) { template <class TTrie> bool TSearchIterator<TTrie>::GetValue(TData* value) const { - Y_ASSERT(Trie); + Y_ASSERT(Trie); bool result = false; if (value) { if (ValuePos) { @@ -122,19 +122,19 @@ bool TSearchIterator<TTrie>::GetValue(TData* value) const { } return result; } - -template <class TTrie> -bool TSearchIterator<TTrie>::HasValue() const { - Y_ASSERT(Trie); - return ValuePos; -} - -template <class TTrie> -inline size_t TSearchIterator<TTrie>::GetHash() const { - Y_ASSERT(Trie); + +template <class TTrie> +bool TSearchIterator<TTrie>::HasValue() const { + Y_ASSERT(Trie); + return ValuePos; +} + +template <class TTrie> +inline size_t TSearchIterator<TTrie>::GetHash() const { + Y_ASSERT(Trie); return MultiHash( static_cast<const void*>(Trie), static_cast<const void*>(DataPos), static_cast<const void*>(DataEnd), static_cast<const void*>(ValuePos)); -} +} |