aboutsummaryrefslogtreecommitdiffstats
path: root/library
diff options
context:
space:
mode:
authorPashaSmirnov <leue@yandex.ru>2022-02-10 16:50:38 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:50:38 +0300
commite440fc473cc191fe94becca370781e148709007a (patch)
tree94e9670fbe940a9d7e53dc7775b45b3e4ede0187 /library
parent16aec54b5c7e5d07400c5ee9122ee34839bff587 (diff)
downloadydb-e440fc473cc191fe94becca370781e148709007a.tar.gz
Restoring authorship annotation for PashaSmirnov <leue@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library')
-rw-r--r--library/cpp/containers/comptrie/README.md40
-rw-r--r--library/cpp/containers/comptrie/benchmark/main.cpp502
-rw-r--r--library/cpp/containers/comptrie/benchmark/ya.make24
-rw-r--r--library/cpp/containers/comptrie/comptrie_builder.inl6
-rw-r--r--library/cpp/containers/comptrie/comptrie_ut.cpp460
-rw-r--r--library/cpp/containers/comptrie/pattern_searcher.h1202
-rw-r--r--library/cpp/containers/comptrie/search_iterator.h88
7 files changed, 1161 insertions, 1161 deletions
diff --git a/library/cpp/containers/comptrie/README.md b/library/cpp/containers/comptrie/README.md
index 43c298e2c8..64c40dc974 100644
--- a/library/cpp/containers/comptrie/README.md
+++ b/library/cpp/containers/comptrie/README.md
@@ -210,23 +210,23 @@ function itself instead of examining the flags.
Note that the ε-link flags do not use the MT_SIZEMASK <<
MT_LEFTSHIFT` bits, which allows us to start using ε-links for some other purpose.
-
-Pattern Searcher
-================
-
-This is an implementation of Aho-Corasick algorithm on compact trie structure.
-In order to create a pattern searcher one must fill a TCompactPatternSearcherBuilder
-with patterns and call SaveAsPatternSearcher or SaveToFileAsPatternSearcher.
-Then TCompactPatternSearcher must be created from the builder output.
-
-### Implementation details ###
-
-Aho-Corasick algorithm stores a suffix link in each node.
-A suffix link of a node is the offset (relative to this node) of the largest suffix
-of a string this node represents which is present in a trie.
-Current implementation also stores a shortcut link to the largest suffix
-for which the corresponding node in a trie is a final node.
-These two links are stored as NCompactTrie::TSuffixLink structure of two 64-bit
-integers.
-In a trie layout these links are stored for each node right after the two bytes
-containing service flags and a symbol.
+
+Pattern Searcher
+================
+
+This is an implementation of Aho-Corasick algorithm on compact trie structure.
+In order to create a pattern searcher one must fill a TCompactPatternSearcherBuilder
+with patterns and call SaveAsPatternSearcher or SaveToFileAsPatternSearcher.
+Then TCompactPatternSearcher must be created from the builder output.
+
+### Implementation details ###
+
+Aho-Corasick algorithm stores a suffix link in each node.
+A suffix link of a node is the offset (relative to this node) of the largest suffix
+of a string this node represents which is present in a trie.
+Current implementation also stores a shortcut link to the largest suffix
+for which the corresponding node in a trie is a final node.
+These two links are stored as NCompactTrie::TSuffixLink structure of two 64-bit
+integers.
+In a trie layout these links are stored for each node right after the two bytes
+containing service flags and a symbol.
diff --git a/library/cpp/containers/comptrie/benchmark/main.cpp b/library/cpp/containers/comptrie/benchmark/main.cpp
index 6e42dad18a..2d024eb2f2 100644
--- a/library/cpp/containers/comptrie/benchmark/main.cpp
+++ b/library/cpp/containers/comptrie/benchmark/main.cpp
@@ -1,260 +1,260 @@
#include <library/cpp/testing/benchmark/bench.h>
-
+
#include <library/cpp/containers/comptrie/comptrie_trie.h>
#include <library/cpp/containers/comptrie/comptrie_builder.h>
#include <library/cpp/containers/comptrie/search_iterator.h>
#include <library/cpp/containers/comptrie/pattern_searcher.h>
-
+
#include <library/cpp/on_disk/aho_corasick/writer.h>
#include <library/cpp/on_disk/aho_corasick/reader.h>
#include <library/cpp/on_disk/aho_corasick/helpers.h>
-
+
#include <library/cpp/containers/dense_hash/dense_hash.h>
-
-#include <util/stream/file.h>
-#include <util/generic/algorithm.h>
-#include <util/random/fast.h>
-#include <util/random/shuffle.h>
-
-/////////////////
-// COMMON DATA //
-/////////////////
-
-const size_t MAX_PATTERN_LENGTH = 11;
-
-TVector<TString> letters = {
- "а", "б", "в", "г", "д", "е", "ё", "ж", "з", "и", "й",
- "к", "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф",
- "х", "ц", "ч", "ж", "щ", "ъ", "ы", "ь", "э", "ю", "я"
-};
-
-TString GenerateOneString(
- TFastRng<ui64>& rng,
- size_t maxLength,
- const TVector<TString>& sequences
-) {
- size_t length = rng.GenRand() % maxLength + 1;
- TString result;
- while (result.size() < length) {
- result += sequences[rng.GenRand() % sequences.size()];
- }
- return result;
-}
-
-TVector<TString> GenerateStrings(
- TFastRng<ui64>& rng,
- size_t num,
- size_t maxLength,
- const TVector<TString>& sequences
-) {
- TVector<TString> strings;
- while (strings.size() < num) {
- strings.push_back(GenerateOneString(rng, maxLength, sequences));
- }
- return strings;
-}
-
-struct TDatasetInstance {
- TDatasetInstance(const TVector<TString>& sequences) {
- TFastRng<ui64> rng(0);
-
- TVector<TString> prefixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences);
- prefixes.push_back("");
-
- TVector<TString> roots = GenerateStrings(rng, /*num*/1000, /*maxLength*/5, sequences);
-
- TVector<TString> suffixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences);
- suffixes.push_back("");
-
- TVector<TString> dictionary;
- for (const auto& root : roots) {
- for (const auto& prefix : prefixes) {
- for (const auto& suffix : suffixes) {
- dictionary.push_back(prefix + root + suffix);
- Y_ASSERT(dictionary.back().size() < MAX_PATTERN_LENGTH);
- }
- }
- }
- Shuffle(dictionary.begin(), dictionary.end());
-
- Patterns.assign(dictionary.begin(), dictionary.begin() + 10'000);
-
- for (size_t sampleIdx = 0; sampleIdx < /*samplesNum*/1'000'000; ++sampleIdx) {
- Samples.emplace_back();
- size_t wordsNum = rng.GenRand() % 10;
- for (size_t wordIdx = 0; wordIdx < wordsNum; ++wordIdx) {
- if (wordIdx > 0) {
- Samples.back() += " ";
- }
- Samples.back() += dictionary[rng.GenRand() % dictionary.size()];
- }
- }
- };
-
- TString GetSample(size_t iteration) const {
- TFastRng<ui64> rng(iteration);
- return Samples[rng.GenRand() % Samples.size()];
- }
-
-
- TVector<TString> Patterns;
- TVector<TString> Samples;
-};
-
-static const TDatasetInstance dataset(letters);
-
-//////////////////////////
-// NEW PATTERN SEARCHER //
-//////////////////////////
-
-struct TPatternSearcherInstance {
- TPatternSearcherInstance() {
- TCompactPatternSearcherBuilder<char, ui32> builder;
-
- for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
- builder.Add(dataset.Patterns[patternId], patternId);
- }
-
- TBufferOutput buffer;
- builder.Save(buffer);
-
- Instance.Reset(
- new TCompactPatternSearcher<char, ui32>(
- buffer.Buffer().Data(),
- buffer.Buffer().Size()
- )
- );
- }
-
- THolder<TCompactPatternSearcher<char, ui32>> Instance;
-};
-
-static const TPatternSearcherInstance patternSearcherInstance;
-
-Y_CPU_BENCHMARK(PatternSearcher, iface) {
- TVector<TVector<std::pair<ui32, ui32>>> result;
- for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
- result.emplace_back();
- TString testString = dataset.GetSample(iteration);
- auto matches = patternSearcherInstance.Instance->SearchMatches(testString);
- for (auto& match : matches) {
- result.back().emplace_back(match.End, match.Data);
- }
- }
-}
-
-//////////////////////
-// OLD AHO CORASICK //
-//////////////////////
-
-struct TAhoCorasickInstance {
- TAhoCorasickInstance() {
- TAhoCorasickBuilder<TString, ui32> builder;
-
- for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
- builder.AddString(dataset.Patterns[patternId], patternId);
- }
-
- TBufferOutput buffer;
- builder.SaveToStream(&buffer);
-
- Instance.Reset(new TDefaultMappedAhoCorasick(TBlob::FromBuffer(buffer.Buffer())));
- };
-
- THolder<TDefaultMappedAhoCorasick> Instance;
-};
-
-static const TAhoCorasickInstance ahoCorasickInstance;
-
-Y_CPU_BENCHMARK(AhoCorasick, iface) {
- TVector<TDeque<std::pair<ui32, ui32>>> result;
- for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
- result.emplace_back();
- TString testString = dataset.GetSample(iteration);
- auto matches = ahoCorasickInstance.Instance->AhoSearch(testString);
- result.push_back(matches);
- }
-}
-
-////////////////////////////////
-// COMPTRIE + SIMPLE MATCHING //
-////////////////////////////////
-
-struct TCompactTrieInstance {
- TCompactTrieInstance() {
- TCompactTrieBuilder<char, ui32> builder;
-
- for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
- builder.Add(dataset.Patterns[patternId], patternId);
- }
-
-
- TBufferOutput buffer;
- CompactTrieMinimizeAndMakeFastLayout(buffer, builder);
-
- Instance.Reset(new TCompactTrie<char, ui32>(
- buffer.Buffer().Data(),
- buffer.Buffer().Size()
- ));
- }
-
- THolder<TCompactTrie<char, ui32>> Instance;
-};
-
-static const TCompactTrieInstance compactTrieInstance;
-
-Y_CPU_BENCHMARK(ComptrieSimple, iface) {
- TVector<TVector<std::pair<ui32, ui32>>> result;
- for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
- result.emplace_back();
- TString testString = dataset.GetSample(iteration);
- for (ui32 startPos = 0; startPos < testString.size(); ++startPos) {
- TSearchIterator<TCompactTrie<char, ui32>> iter(*(compactTrieInstance.Instance));
- for (ui32 position = startPos; position < testString.size(); ++position) {
- if (!iter.Advance(testString[position])) {
- break;
- }
- ui32 answer;
- if (iter.GetValue(&answer)) {
- result.back().emplace_back(position, answer);
- }
- }
- }
- }
-}
-
-////////////////
-// DENSE_HASH //
-////////////////
-
-struct TDenseHashInstance {
- TDenseHashInstance() {
- for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
- Instance[dataset.Patterns[patternId]] = patternId;
- }
- }
-
- TDenseHash<TString, ui32> Instance;
-};
-
-static const TDenseHashInstance denseHashInstance;
-
-Y_CPU_BENCHMARK(DenseHash, iface) {
- TVector<TVector<std::pair<ui32, ui32>>> result;
- for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
- result.emplace_back();
- TString testString = dataset.GetSample(iteration);
- for (size_t start = 0; start < testString.size(); ++start) {
- for (
- size_t length = 1;
- length <= MAX_PATTERN_LENGTH && start + length <= testString.size();
- ++length
- ) {
- auto value = denseHashInstance.Instance.find(testString.substr(start, length));
- if (value != denseHashInstance.Instance.end()) {
- result.back().emplace_back(start + length - 1, value->second);
- }
- }
- }
- }
-}
+
+#include <util/stream/file.h>
+#include <util/generic/algorithm.h>
+#include <util/random/fast.h>
+#include <util/random/shuffle.h>
+
+/////////////////
+// COMMON DATA //
+/////////////////
+
+const size_t MAX_PATTERN_LENGTH = 11;
+
+TVector<TString> letters = {
+ "а", "б", "в", "г", "д", "е", "ё", "ж", "з", "и", "й",
+ "к", "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф",
+ "х", "ц", "ч", "ж", "щ", "ъ", "ы", "ь", "э", "ю", "я"
+};
+
+TString GenerateOneString(
+ TFastRng<ui64>& rng,
+ size_t maxLength,
+ const TVector<TString>& sequences
+) {
+ size_t length = rng.GenRand() % maxLength + 1;
+ TString result;
+ while (result.size() < length) {
+ result += sequences[rng.GenRand() % sequences.size()];
+ }
+ return result;
+}
+
+TVector<TString> GenerateStrings(
+ TFastRng<ui64>& rng,
+ size_t num,
+ size_t maxLength,
+ const TVector<TString>& sequences
+) {
+ TVector<TString> strings;
+ while (strings.size() < num) {
+ strings.push_back(GenerateOneString(rng, maxLength, sequences));
+ }
+ return strings;
+}
+
+struct TDatasetInstance {
+ TDatasetInstance(const TVector<TString>& sequences) {
+ TFastRng<ui64> rng(0);
+
+ TVector<TString> prefixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences);
+ prefixes.push_back("");
+
+ TVector<TString> roots = GenerateStrings(rng, /*num*/1000, /*maxLength*/5, sequences);
+
+ TVector<TString> suffixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences);
+ suffixes.push_back("");
+
+ TVector<TString> dictionary;
+ for (const auto& root : roots) {
+ for (const auto& prefix : prefixes) {
+ for (const auto& suffix : suffixes) {
+ dictionary.push_back(prefix + root + suffix);
+ Y_ASSERT(dictionary.back().size() < MAX_PATTERN_LENGTH);
+ }
+ }
+ }
+ Shuffle(dictionary.begin(), dictionary.end());
+
+ Patterns.assign(dictionary.begin(), dictionary.begin() + 10'000);
+
+ for (size_t sampleIdx = 0; sampleIdx < /*samplesNum*/1'000'000; ++sampleIdx) {
+ Samples.emplace_back();
+ size_t wordsNum = rng.GenRand() % 10;
+ for (size_t wordIdx = 0; wordIdx < wordsNum; ++wordIdx) {
+ if (wordIdx > 0) {
+ Samples.back() += " ";
+ }
+ Samples.back() += dictionary[rng.GenRand() % dictionary.size()];
+ }
+ }
+ };
+
+ TString GetSample(size_t iteration) const {
+ TFastRng<ui64> rng(iteration);
+ return Samples[rng.GenRand() % Samples.size()];
+ }
+
+
+ TVector<TString> Patterns;
+ TVector<TString> Samples;
+};
+
+static const TDatasetInstance dataset(letters);
+
+//////////////////////////
+// NEW PATTERN SEARCHER //
+//////////////////////////
+
+struct TPatternSearcherInstance {
+ TPatternSearcherInstance() {
+ TCompactPatternSearcherBuilder<char, ui32> builder;
+
+ for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
+ builder.Add(dataset.Patterns[patternId], patternId);
+ }
+
+ TBufferOutput buffer;
+ builder.Save(buffer);
+
+ Instance.Reset(
+ new TCompactPatternSearcher<char, ui32>(
+ buffer.Buffer().Data(),
+ buffer.Buffer().Size()
+ )
+ );
+ }
+
+ THolder<TCompactPatternSearcher<char, ui32>> Instance;
+};
+
+static const TPatternSearcherInstance patternSearcherInstance;
+
+Y_CPU_BENCHMARK(PatternSearcher, iface) {
+ TVector<TVector<std::pair<ui32, ui32>>> result;
+ for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
+ result.emplace_back();
+ TString testString = dataset.GetSample(iteration);
+ auto matches = patternSearcherInstance.Instance->SearchMatches(testString);
+ for (auto& match : matches) {
+ result.back().emplace_back(match.End, match.Data);
+ }
+ }
+}
+
+//////////////////////
+// OLD AHO CORASICK //
+//////////////////////
+
+struct TAhoCorasickInstance {
+ TAhoCorasickInstance() {
+ TAhoCorasickBuilder<TString, ui32> builder;
+
+ for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
+ builder.AddString(dataset.Patterns[patternId], patternId);
+ }
+
+ TBufferOutput buffer;
+ builder.SaveToStream(&buffer);
+
+ Instance.Reset(new TDefaultMappedAhoCorasick(TBlob::FromBuffer(buffer.Buffer())));
+ };
+
+ THolder<TDefaultMappedAhoCorasick> Instance;
+};
+
+static const TAhoCorasickInstance ahoCorasickInstance;
+
+Y_CPU_BENCHMARK(AhoCorasick, iface) {
+ TVector<TDeque<std::pair<ui32, ui32>>> result;
+ for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
+ result.emplace_back();
+ TString testString = dataset.GetSample(iteration);
+ auto matches = ahoCorasickInstance.Instance->AhoSearch(testString);
+ result.push_back(matches);
+ }
+}
+
+////////////////////////////////
+// COMPTRIE + SIMPLE MATCHING //
+////////////////////////////////
+
+struct TCompactTrieInstance {
+ TCompactTrieInstance() {
+ TCompactTrieBuilder<char, ui32> builder;
+
+ for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
+ builder.Add(dataset.Patterns[patternId], patternId);
+ }
+
+
+ TBufferOutput buffer;
+ CompactTrieMinimizeAndMakeFastLayout(buffer, builder);
+
+ Instance.Reset(new TCompactTrie<char, ui32>(
+ buffer.Buffer().Data(),
+ buffer.Buffer().Size()
+ ));
+ }
+
+ THolder<TCompactTrie<char, ui32>> Instance;
+};
+
+static const TCompactTrieInstance compactTrieInstance;
+
+Y_CPU_BENCHMARK(ComptrieSimple, iface) {
+ TVector<TVector<std::pair<ui32, ui32>>> result;
+ for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
+ result.emplace_back();
+ TString testString = dataset.GetSample(iteration);
+ for (ui32 startPos = 0; startPos < testString.size(); ++startPos) {
+ TSearchIterator<TCompactTrie<char, ui32>> iter(*(compactTrieInstance.Instance));
+ for (ui32 position = startPos; position < testString.size(); ++position) {
+ if (!iter.Advance(testString[position])) {
+ break;
+ }
+ ui32 answer;
+ if (iter.GetValue(&answer)) {
+ result.back().emplace_back(position, answer);
+ }
+ }
+ }
+ }
+}
+
+////////////////
+// DENSE_HASH //
+////////////////
+
+struct TDenseHashInstance {
+ TDenseHashInstance() {
+ for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
+ Instance[dataset.Patterns[patternId]] = patternId;
+ }
+ }
+
+ TDenseHash<TString, ui32> Instance;
+};
+
+static const TDenseHashInstance denseHashInstance;
+
+Y_CPU_BENCHMARK(DenseHash, iface) {
+ TVector<TVector<std::pair<ui32, ui32>>> result;
+ for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
+ result.emplace_back();
+ TString testString = dataset.GetSample(iteration);
+ for (size_t start = 0; start < testString.size(); ++start) {
+ for (
+ size_t length = 1;
+ length <= MAX_PATTERN_LENGTH && start + length <= testString.size();
+ ++length
+ ) {
+ auto value = denseHashInstance.Instance.find(testString.substr(start, length));
+ if (value != denseHashInstance.Instance.end()) {
+ result.back().emplace_back(start + length - 1, value->second);
+ }
+ }
+ }
+ }
+}
diff --git a/library/cpp/containers/comptrie/benchmark/ya.make b/library/cpp/containers/comptrie/benchmark/ya.make
index 16fa19530d..b0a1fc5b92 100644
--- a/library/cpp/containers/comptrie/benchmark/ya.make
+++ b/library/cpp/containers/comptrie/benchmark/ya.make
@@ -1,14 +1,14 @@
Y_BENCHMARK()
-
-OWNER(smirnovpavel)
-
-SRCS(
- main.cpp
-)
-
-PEERDIR(
+
+OWNER(smirnovpavel)
+
+SRCS(
+ main.cpp
+)
+
+PEERDIR(
library/cpp/containers/comptrie
- util
-)
-
-END()
+ util
+)
+
+END()
diff --git a/library/cpp/containers/comptrie/comptrie_builder.inl b/library/cpp/containers/comptrie/comptrie_builder.inl
index f273fa6571..6c5b142f8d 100644
--- a/library/cpp/containers/comptrie/comptrie_builder.inl
+++ b/library/cpp/containers/comptrie/comptrie_builder.inl
@@ -60,15 +60,15 @@ protected:
size_t NodeMeasureLeafValue(TNode* thiz) const;
ui64 NodeSaveLeafValue(TNode* thiz, IOutputStream& os) const;
- virtual ui64 ArcMeasure(const TArc* thiz, size_t leftsize, size_t rightsize) const;
+ virtual ui64 ArcMeasure(const TArc* thiz, size_t leftsize, size_t rightsize) const;
- virtual ui64 ArcSaveSelf(const TArc* thiz, IOutputStream& os) const;
+ virtual ui64 ArcSaveSelf(const TArc* thiz, IOutputStream& os) const;
ui64 ArcSave(const TArc* thiz, IOutputStream& os) const;
ui64 ArcSaveAndDestroy(const TArc* thiz, IOutputStream& os);
public:
TCompactTrieBuilderImpl(TCompactTrieBuilderFlags flags, TPacker packer, IAllocator* alloc);
- virtual ~TCompactTrieBuilderImpl();
+ virtual ~TCompactTrieBuilderImpl();
void DestroyNode(TNode* node);
void NodeReleasePayload(TNode* thiz);
diff --git a/library/cpp/containers/comptrie/comptrie_ut.cpp b/library/cpp/containers/comptrie/comptrie_ut.cpp
index 74bee09b5d..5b7b33e8c4 100644
--- a/library/cpp/containers/comptrie/comptrie_ut.cpp
+++ b/library/cpp/containers/comptrie/comptrie_ut.cpp
@@ -15,7 +15,7 @@
#include <util/folder/dirut.h>
#include <util/random/random.h>
-#include <util/random/fast.h>
+#include <util/random/fast.h>
#include <util/string/hex.h>
#include <util/string/cast.h>
@@ -24,7 +24,7 @@
#include "set.h"
#include "first_symbol_iterator.h"
#include "search_iterator.h"
-#include "pattern_searcher.h"
+#include "pattern_searcher.h"
#include <array>
#include <iterator>
@@ -109,10 +109,10 @@ private:
UNIT_TEST(TestBuilderFindLongestPrefix);
UNIT_TEST(TestBuilderFindLongestPrefixWithEmptyValue);
- UNIT_TEST(TestPatternSearcherEmpty);
- UNIT_TEST(TestPatternSearcherSimple);
- UNIT_TEST(TestPatternSearcherRandom);
-
+ UNIT_TEST(TestPatternSearcherEmpty);
+ UNIT_TEST(TestPatternSearcherSimple);
+ UNIT_TEST(TestPatternSearcherRandom);
+
UNIT_TEST_SUITE_END();
static const char* SampleData[];
@@ -246,21 +246,21 @@ public:
void TestBuilderFindLongestPrefix();
void TestBuilderFindLongestPrefix(size_t keysCount, double branchProbability, bool isPrefixGrouped, bool hasEmptyKey);
void TestBuilderFindLongestPrefixWithEmptyValue();
-
- void TestPatternSearcherOnDataset(
- const TVector<TString>& patterns,
- const TVector<TString>& samples
- );
- void TestPatternSearcherEmpty();
- void TestPatternSearcherSimple();
- void TestPatternSearcherRandom(
- size_t patternsNum,
- size_t patternMaxLength,
- size_t strMaxLength,
- int maxChar,
- TFastRng<ui64>& rng
- );
- void TestPatternSearcherRandom();
+
+ void TestPatternSearcherOnDataset(
+ const TVector<TString>& patterns,
+ const TVector<TString>& samples
+ );
+ void TestPatternSearcherEmpty();
+ void TestPatternSearcherSimple();
+ void TestPatternSearcherRandom(
+ size_t patternsNum,
+ size_t patternMaxLength,
+ size_t strMaxLength,
+ int maxChar,
+ TFastRng<ui64>& rng
+ );
+ void TestPatternSearcherRandom();
};
UNIT_TEST_SUITE_REGISTRATION(TCompactTrieTest);
@@ -1580,212 +1580,212 @@ void TCompactTrieTest::TestBuilderFindLongestPrefixWithEmptyValue() {
TBufferOutput buffer;
builder.Save(buffer);
}
-
-void TCompactTrieTest::TestPatternSearcherEmpty() {
- TCompactPatternSearcherBuilder<char, ui32> builder;
-
- TBufferOutput searcherData;
- builder.Save(searcherData);
-
- TCompactPatternSearcher<char, ui32> searcher(
- searcherData.Buffer().Data(),
- searcherData.Buffer().Size()
- );
-
- UNIT_ASSERT(searcher.SearchMatches("a").empty());
- UNIT_ASSERT(searcher.SearchMatches("").empty());
- UNIT_ASSERT(searcher.SearchMatches("abc").empty());
-}
-
-void TCompactTrieTest::TestPatternSearcherOnDataset(
- const TVector<TString>& patterns,
- const TVector<TString>& samples
-) {
- TCompactPatternSearcherBuilder<char, ui32> builder;
-
- for (size_t patternIdx = 0; patternIdx < patterns.size(); ++patternIdx) {
- builder.Add(patterns[patternIdx], patternIdx);
- }
-
- TBufferOutput searcherData;
- builder.Save(searcherData);
-
- TCompactPatternSearcher<char, ui32> searcher(
- searcherData.Buffer().Data(),
- searcherData.Buffer().Size()
- );
-
- for (const auto& sample : samples) {
- const auto matches = searcher.SearchMatches(sample);
-
- size_t matchesNum = 0;
- THashSet<TString> processedPatterns;
- for (const auto& pattern : patterns) {
- if (pattern.Empty() || processedPatterns.contains(pattern)) {
- continue;
- }
- for (size_t start = 0; start + pattern.Size() <= sample.Size(); ++start) {
- matchesNum += (pattern == sample.substr(start, pattern.Size()));
- }
- processedPatterns.insert(pattern);
- }
- UNIT_ASSERT_VALUES_EQUAL(matchesNum, matches.size());
-
-
- TSet<std::pair<size_t, ui32>> foundMatches;
- for (const auto& match : matches) {
- std::pair<size_t, ui32> matchParams(match.End, match.Data);
- UNIT_ASSERT(!foundMatches.contains(matchParams));
- foundMatches.insert(matchParams);
-
- const auto& pattern = patterns[match.Data];
- UNIT_ASSERT_VALUES_EQUAL(
- sample.substr(match.End - pattern.size() + 1, pattern.size()),
- pattern
- );
- }
- }
-}
-
-void TCompactTrieTest::TestPatternSearcherSimple() {
- TestPatternSearcherOnDataset(
- { // patterns
- "abcd",
- "abc",
- "ab",
- "a",
- ""
- },
- { // samples
- "abcde",
- "abcd",
- "abc",
- "ab",
- "a",
- ""
- }
- );
- TestPatternSearcherOnDataset(
- { // patterns
- "a"
- "ab",
- "abcd",
- },
- { // samples
- "abcde",
- "abcd",
- "abc",
- "ab",
- "a",
- ""
- }
- );
- TestPatternSearcherOnDataset(
- { // patterns
- "aaaa",
- "aaa",
- "aa",
- "a",
- },
- { // samples
- "aaaaaaaaaaaa"
- }
- );
- TestPatternSearcherOnDataset(
- { // patterns
- "aa", "ab", "ac", "ad", "ae", "af",
- "ba", "bb", "bc", "bd", "be", "bf",
- "ca", "cb", "cc", "cd", "ce", "cf",
- "da", "db", "dc", "dd", "de", "df",
- "ea", "eb", "ec", "ed", "ee", "ef",
- "fa", "fb", "fc", "fd", "fe", "ff"
- },
- { // samples
- "dcabafeebfdcbacddacadbaabecdbaeffecdbfabcdcabcfaefecdfebacfedacefbdcacfeb",
- "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefancdefancdef",
- "fedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcba",
- "",
- "a", "b", "c", "d", "e", "f",
- "aa", "ab", "ac", "ad", "ae", "af",
- "ba", "bb", "bc", "bd", "be", "bf",
- "ca", "cb", "cc", "cd", "ce", "cf",
- "da", "db", "dc", "dd", "de", "df",
- "ea", "eb", "ec", "ed", "ee", "ef",
- "fa", "fb", "fc", "fd", "fe", "ff"
- }
- );
-}
-
-static char RandChar(
- TFastRng<ui64>& rng,
- int maxChar
-) {
- return static_cast<char>(rng.GenRand() % (maxChar + 1));
-}
-
-static TString RandStr(
- TFastRng<ui64>& rng,
- size_t maxLength,
- int maxChar,
- bool nonEmpty = false
-) {
- Y_ASSERT(maxLength > 0);
-
- size_t length;
- if (nonEmpty) {
- length = rng.GenRand() % maxLength + 1;
- } else {
- length = rng.GenRand() % (maxLength + 1);
- }
-
- TString result;
- while (result.size() < length) {
- result += RandChar(rng, maxChar);
- }
-
- return result;
-}
-
-void TCompactTrieTest::TestPatternSearcherRandom(
- size_t patternsNum,
- size_t patternMaxLength,
- size_t strMaxLength,
- int maxChar,
- TFastRng<ui64>& rng
-) {
- auto patternToSearch = RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true);
-
- TVector<TString> patterns = {patternToSearch};
- while (patterns.size() < patternsNum) {
- patterns.push_back(RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true));
- }
-
- auto filler = RandStr(rng, strMaxLength - patternToSearch.Size() + 1, maxChar);
- size_t leftFillerSize = rng.GenRand() % (filler.size() + 1);
- auto leftFiller = filler.substr(0, leftFillerSize);
- auto rightFiller = filler.substr(leftFillerSize, filler.size() - leftFillerSize);
- auto sample = leftFiller + patternToSearch + rightFiller;
-
- TestPatternSearcherOnDataset(patterns, {sample});
-}
-
-void TCompactTrieTest::TestPatternSearcherRandom() {
- TFastRng<ui64> rng(0);
- for (size_t patternMaxLen : {1, 2, 10}) {
- for (size_t strMaxLen : TVector<size_t>{patternMaxLen, 2 * patternMaxLen, 10}) {
- for (int maxChar : {0, 1, 5, 255}) {
- for (size_t patternsNum : {1, 10}) {
- for (size_t testIdx = 0; testIdx < 3; ++testIdx) {
- TestPatternSearcherRandom(
- patternsNum,
- patternMaxLen,
- strMaxLen,
- maxChar,
- rng
- );
- }
- }
- }
- }
- }
-}
+
+void TCompactTrieTest::TestPatternSearcherEmpty() {
+ TCompactPatternSearcherBuilder<char, ui32> builder;
+
+ TBufferOutput searcherData;
+ builder.Save(searcherData);
+
+ TCompactPatternSearcher<char, ui32> searcher(
+ searcherData.Buffer().Data(),
+ searcherData.Buffer().Size()
+ );
+
+ UNIT_ASSERT(searcher.SearchMatches("a").empty());
+ UNIT_ASSERT(searcher.SearchMatches("").empty());
+ UNIT_ASSERT(searcher.SearchMatches("abc").empty());
+}
+
+void TCompactTrieTest::TestPatternSearcherOnDataset(
+ const TVector<TString>& patterns,
+ const TVector<TString>& samples
+) {
+ TCompactPatternSearcherBuilder<char, ui32> builder;
+
+ for (size_t patternIdx = 0; patternIdx < patterns.size(); ++patternIdx) {
+ builder.Add(patterns[patternIdx], patternIdx);
+ }
+
+ TBufferOutput searcherData;
+ builder.Save(searcherData);
+
+ TCompactPatternSearcher<char, ui32> searcher(
+ searcherData.Buffer().Data(),
+ searcherData.Buffer().Size()
+ );
+
+ for (const auto& sample : samples) {
+ const auto matches = searcher.SearchMatches(sample);
+
+ size_t matchesNum = 0;
+ THashSet<TString> processedPatterns;
+ for (const auto& pattern : patterns) {
+ if (pattern.Empty() || processedPatterns.contains(pattern)) {
+ continue;
+ }
+ for (size_t start = 0; start + pattern.Size() <= sample.Size(); ++start) {
+ matchesNum += (pattern == sample.substr(start, pattern.Size()));
+ }
+ processedPatterns.insert(pattern);
+ }
+ UNIT_ASSERT_VALUES_EQUAL(matchesNum, matches.size());
+
+
+ TSet<std::pair<size_t, ui32>> foundMatches;
+ for (const auto& match : matches) {
+ std::pair<size_t, ui32> matchParams(match.End, match.Data);
+ UNIT_ASSERT(!foundMatches.contains(matchParams));
+ foundMatches.insert(matchParams);
+
+ const auto& pattern = patterns[match.Data];
+ UNIT_ASSERT_VALUES_EQUAL(
+ sample.substr(match.End - pattern.size() + 1, pattern.size()),
+ pattern
+ );
+ }
+ }
+}
+
+void TCompactTrieTest::TestPatternSearcherSimple() {
+ TestPatternSearcherOnDataset(
+ { // patterns
+ "abcd",
+ "abc",
+ "ab",
+ "a",
+ ""
+ },
+ { // samples
+ "abcde",
+ "abcd",
+ "abc",
+ "ab",
+ "a",
+ ""
+ }
+ );
+ TestPatternSearcherOnDataset(
+ { // patterns
+ "a"
+ "ab",
+ "abcd",
+ },
+ { // samples
+ "abcde",
+ "abcd",
+ "abc",
+ "ab",
+ "a",
+ ""
+ }
+ );
+ TestPatternSearcherOnDataset(
+ { // patterns
+ "aaaa",
+ "aaa",
+ "aa",
+ "a",
+ },
+ { // samples
+ "aaaaaaaaaaaa"
+ }
+ );
+ TestPatternSearcherOnDataset(
+ { // patterns
+ "aa", "ab", "ac", "ad", "ae", "af",
+ "ba", "bb", "bc", "bd", "be", "bf",
+ "ca", "cb", "cc", "cd", "ce", "cf",
+ "da", "db", "dc", "dd", "de", "df",
+ "ea", "eb", "ec", "ed", "ee", "ef",
+ "fa", "fb", "fc", "fd", "fe", "ff"
+ },
+ { // samples
+ "dcabafeebfdcbacddacadbaabecdbaeffecdbfabcdcabcfaefecdfebacfedacefbdcacfeb",
+ "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefancdefancdef",
+ "fedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcba",
+ "",
+ "a", "b", "c", "d", "e", "f",
+ "aa", "ab", "ac", "ad", "ae", "af",
+ "ba", "bb", "bc", "bd", "be", "bf",
+ "ca", "cb", "cc", "cd", "ce", "cf",
+ "da", "db", "dc", "dd", "de", "df",
+ "ea", "eb", "ec", "ed", "ee", "ef",
+ "fa", "fb", "fc", "fd", "fe", "ff"
+ }
+ );
+}
+
+static char RandChar(
+ TFastRng<ui64>& rng,
+ int maxChar
+) {
+ return static_cast<char>(rng.GenRand() % (maxChar + 1));
+}
+
+static TString RandStr(
+ TFastRng<ui64>& rng,
+ size_t maxLength,
+ int maxChar,
+ bool nonEmpty = false
+) {
+ Y_ASSERT(maxLength > 0);
+
+ size_t length;
+ if (nonEmpty) {
+ length = rng.GenRand() % maxLength + 1;
+ } else {
+ length = rng.GenRand() % (maxLength + 1);
+ }
+
+ TString result;
+ while (result.size() < length) {
+ result += RandChar(rng, maxChar);
+ }
+
+ return result;
+}
+
+void TCompactTrieTest::TestPatternSearcherRandom(
+ size_t patternsNum,
+ size_t patternMaxLength,
+ size_t strMaxLength,
+ int maxChar,
+ TFastRng<ui64>& rng
+) {
+ auto patternToSearch = RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true);
+
+ TVector<TString> patterns = {patternToSearch};
+ while (patterns.size() < patternsNum) {
+ patterns.push_back(RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true));
+ }
+
+ auto filler = RandStr(rng, strMaxLength - patternToSearch.Size() + 1, maxChar);
+ size_t leftFillerSize = rng.GenRand() % (filler.size() + 1);
+ auto leftFiller = filler.substr(0, leftFillerSize);
+ auto rightFiller = filler.substr(leftFillerSize, filler.size() - leftFillerSize);
+ auto sample = leftFiller + patternToSearch + rightFiller;
+
+ TestPatternSearcherOnDataset(patterns, {sample});
+}
+
+void TCompactTrieTest::TestPatternSearcherRandom() {
+ TFastRng<ui64> rng(0);
+ for (size_t patternMaxLen : {1, 2, 10}) {
+ for (size_t strMaxLen : TVector<size_t>{patternMaxLen, 2 * patternMaxLen, 10}) {
+ for (int maxChar : {0, 1, 5, 255}) {
+ for (size_t patternsNum : {1, 10}) {
+ for (size_t testIdx = 0; testIdx < 3; ++testIdx) {
+ TestPatternSearcherRandom(
+ patternsNum,
+ patternMaxLen,
+ strMaxLen,
+ maxChar,
+ rng
+ );
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/library/cpp/containers/comptrie/pattern_searcher.h b/library/cpp/containers/comptrie/pattern_searcher.h
index caab51dc1c..d0a0ca37a8 100644
--- a/library/cpp/containers/comptrie/pattern_searcher.h
+++ b/library/cpp/containers/comptrie/pattern_searcher.h
@@ -1,606 +1,606 @@
-#pragma once
-
+#pragma once
+
#include "comptrie_builder.h"
#include "comptrie_trie.h"
#include "comptrie_impl.h"
#include <library/cpp/packers/packers.h>
-
-#include <util/system/yassert.h>
-#include <util/generic/vector.h>
-#include <util/generic/deque.h>
-#include <util/stream/str.h>
-
-// Aho-Corasick algorithm implementation using CompactTrie implementation of Sedgewick's T-trie
-
-namespace NCompactTrie {
- struct TSuffixLink {
- ui64 NextSuffixOffset;
- ui64 NextSuffixWithDataOffset;
-
- TSuffixLink(ui64 nextSuffixOffset = 0, ui64 nextSuffixWithDataOffset = 0)
- : NextSuffixOffset(nextSuffixOffset)
- , NextSuffixWithDataOffset(nextSuffixWithDataOffset)
- {
- }
- };
-
- const size_t FLAGS_SIZE = sizeof(char);
- const size_t SYMBOL_SIZE = sizeof(char);
-};
-
-template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
-class TCompactPatternSearcherBuilder : protected TCompactTrieBuilder<T, D, S> {
-public:
- typedef T TSymbol;
- typedef D TData;
- typedef S TPacker;
-
- typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
- typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
-
- typedef TCompactTrieBuilder<T, D, S> TBase;
-
-public:
- TCompactPatternSearcherBuilder() {
+
+#include <util/system/yassert.h>
+#include <util/generic/vector.h>
+#include <util/generic/deque.h>
+#include <util/stream/str.h>
+
+// Aho-Corasick algorithm implementation using CompactTrie implementation of Sedgewick's T-trie
+
+namespace NCompactTrie {
+ struct TSuffixLink {
+ ui64 NextSuffixOffset;
+ ui64 NextSuffixWithDataOffset;
+
+ TSuffixLink(ui64 nextSuffixOffset = 0, ui64 nextSuffixWithDataOffset = 0)
+ : NextSuffixOffset(nextSuffixOffset)
+ , NextSuffixWithDataOffset(nextSuffixWithDataOffset)
+ {
+ }
+ };
+
+ const size_t FLAGS_SIZE = sizeof(char);
+ const size_t SYMBOL_SIZE = sizeof(char);
+};
+
+template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
+class TCompactPatternSearcherBuilder : protected TCompactTrieBuilder<T, D, S> {
+public:
+ typedef T TSymbol;
+ typedef D TData;
+ typedef S TPacker;
+
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
+
+ typedef TCompactTrieBuilder<T, D, S> TBase;
+
+public:
+ TCompactPatternSearcherBuilder() {
TBase::Impl = MakeHolder<TCompactPatternSearcherBuilderImpl>();
- }
-
- bool Add(const TSymbol* key, size_t keyLength, const TData& value) {
- return TBase::Impl->AddEntry(key, keyLength, value);
- }
- bool Add(const TKeyBuf& key, const TData& value) {
- return Add(key.data(), key.size(), value);
- }
-
- bool Find(const TSymbol* key, size_t keyLength, TData* value) const {
- return TBase::Impl->FindEntry(key, keyLength, value);
- }
- bool Find(const TKeyBuf& key, TData* value = nullptr) const {
- return Find(key.data(), key.size(), value);
- }
-
- size_t Save(IOutputStream& os) const {
- size_t trieSize = TBase::Impl->MeasureByteSize();
- TBufferOutput serializedTrie(trieSize);
- TBase::Impl->Save(serializedTrie);
-
- auto serializedTrieBuffer = serializedTrie.Buffer();
- CalculateSuffixLinks(
- serializedTrieBuffer.Data(),
- serializedTrieBuffer.Data() + serializedTrieBuffer.Size()
- );
-
- os.Write(serializedTrieBuffer.Data(), serializedTrieBuffer.Size());
- return trieSize;
- }
-
- TBlob Save() const {
- TBufferStream buffer;
- Save(buffer);
- return TBlob::FromStream(buffer);
- }
-
- size_t SaveToFile(const TString& fileName) const {
- TFileOutput out(fileName);
- return Save(out);
- }
-
- size_t MeasureByteSize() const {
- return TBase::Impl->MeasureByteSize();
- }
-
-private:
- void CalculateSuffixLinks(char* trieStart, const char* trieEnd) const;
-
-protected:
- class TCompactPatternSearcherBuilderImpl : public TBase::TCompactTrieBuilderImpl {
- public:
- typedef typename TBase::TCompactTrieBuilderImpl TImplBase;
-
- TCompactPatternSearcherBuilderImpl(
- TCompactTrieBuilderFlags flags = CTBF_NONE,
- TPacker packer = TPacker(),
- IAllocator* alloc = TDefaultAllocator::Instance()
- ) : TImplBase(flags, packer, alloc) {
- }
-
- ui64 ArcMeasure(
- const typename TImplBase::TArc* arc,
- size_t leftSize,
- size_t rightSize
- ) const override {
- using namespace NCompactTrie;
-
- size_t coreSize = SYMBOL_SIZE + FLAGS_SIZE +
- sizeof(TSuffixLink) +
- this->NodeMeasureLeafValue(arc->Node);
- size_t treeSize = this->NodeMeasureSubtree(arc->Node);
-
- if (arc->Label.Length() > 0)
- treeSize += (SYMBOL_SIZE + FLAGS_SIZE + sizeof(TSuffixLink)) *
- (arc->Label.Length() - 1);
-
- // Triple measurements are needed because the space needed to store the offset
- // shall be added to the offset itself. Hence three iterations.
- size_t leftOffsetSize = 0;
- size_t rightOffsetSize = 0;
- for (size_t iteration = 0; iteration < 3; ++iteration) {
- leftOffsetSize = leftSize ? MeasureOffset(
- coreSize + treeSize + leftOffsetSize + rightOffsetSize) : 0;
- rightOffsetSize = rightSize ? MeasureOffset(
- coreSize + treeSize + leftSize + leftOffsetSize + rightOffsetSize) : 0;
- }
-
- coreSize += leftOffsetSize + rightOffsetSize;
- arc->LeftOffset = leftSize ? coreSize + treeSize : 0;
- arc->RightOffset = rightSize ? coreSize + treeSize + leftSize : 0;
-
- return coreSize + treeSize + leftSize + rightSize;
- }
-
- ui64 ArcSaveSelf(const typename TImplBase::TArc* arc, IOutputStream& os) const override {
- using namespace NCompactTrie;
-
- ui64 written = 0;
-
- size_t leftOffsetSize = MeasureOffset(arc->LeftOffset);
- size_t rightOffsetSize = MeasureOffset(arc->RightOffset);
-
- size_t labelLen = arc->Label.Length();
-
- for (size_t labelPos = 0; labelPos < labelLen; ++labelPos) {
- char flags = 0;
-
- if (labelPos == 0) {
- flags |= (leftOffsetSize << MT_LEFTSHIFT);
- flags |= (rightOffsetSize << MT_RIGHTSHIFT);
- }
-
- if (labelPos == labelLen - 1) {
- if (arc->Node->IsFinal())
- flags |= MT_FINAL;
- if (!arc->Node->IsLast())
- flags |= MT_NEXT;
- } else {
- flags |= MT_NEXT;
- }
-
- os.Write(&flags, 1);
- os.Write(&arc->Label.AsCharPtr()[labelPos], 1);
- written += 2;
-
- TSuffixLink suffixlink;
- os.Write(&suffixlink, sizeof(TSuffixLink));
- written += sizeof(TSuffixLink);
-
- if (labelPos == 0) {
- written += ArcSaveOffset(arc->LeftOffset, os);
- written += ArcSaveOffset(arc->RightOffset, os);
- }
- }
-
- written += this->NodeSaveLeafValue(arc->Node, os);
- return written;
- }
- };
-};
-
-
-template <class T>
-struct TPatternMatch {
- ui64 End;
- T Data;
-
- TPatternMatch(ui64 end, const T& data)
- : End(end)
- , Data(data)
- {
- }
-};
-
-
-template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
-class TCompactPatternSearcher {
-public:
- typedef T TSymbol;
- typedef D TData;
- typedef S TPacker;
-
- typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
- typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
-
- typedef TCompactTrie<TSymbol, TData, TPacker> TTrie;
-public:
- TCompactPatternSearcher()
- {
- }
-
- explicit TCompactPatternSearcher(const TBlob& data)
- : Trie(data)
- {
- }
-
- TCompactPatternSearcher(const char* data, size_t size)
- : Trie(data, size)
- {
- }
-
- TVector<TPatternMatch<TData>> SearchMatches(const TSymbol* text, size_t textSize) const;
- TVector<TPatternMatch<TData>> SearchMatches(const TKeyBuf& text) const {
- return SearchMatches(text.data(), text.size());
- }
-private:
- TTrie Trie;
-};
-
-////////////////////
-// Implementation //
-////////////////////
-
-namespace {
-
-template <class TData, class TPacker>
-char ReadNode(
- char* nodeStart,
- char*& leftSibling,
- char*& rightSibling,
- char*& directChild,
- NCompactTrie::TSuffixLink*& suffixLink,
- TPacker packer = TPacker()
-) {
- char* dataPos = nodeStart;
- char flags = *(dataPos++);
-
- Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed
-
- char label = *(dataPos++);
-
- suffixLink = (NCompactTrie::TSuffixLink*)dataPos;
- dataPos += sizeof(NCompactTrie::TSuffixLink);
-
- { // Left branch
- size_t offsetLength = NCompactTrie::LeftOffsetLen(flags);
- size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
- leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr;
-
- dataPos += offsetLength;
- }
-
-
- { // Right branch
- size_t offsetLength = NCompactTrie::RightOffsetLen(flags);
- size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
- rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr;
-
- dataPos += offsetLength;
- }
-
- directChild = nullptr;
- if (flags & NCompactTrie::MT_NEXT) {
- directChild = dataPos;
- if (flags & NCompactTrie::MT_FINAL) {
- directChild += packer.SkipLeaf(directChild);
- }
- }
-
- return label;
-}
-
-template <class TData, class TPacker>
-char ReadNodeConst(
- const char* nodeStart,
- const char*& leftSibling,
- const char*& rightSibling,
- const char*& directChild,
- const char*& data,
- NCompactTrie::TSuffixLink& suffixLink,
- TPacker packer = TPacker()
-) {
- const char* dataPos = nodeStart;
- char flags = *(dataPos++);
-
- Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed
-
- char label = *(dataPos++);
-
- suffixLink = *((NCompactTrie::TSuffixLink*)dataPos);
- dataPos += sizeof(NCompactTrie::TSuffixLink);
-
- { // Left branch
- size_t offsetLength = NCompactTrie::LeftOffsetLen(flags);
- size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
- leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr;
-
- dataPos += offsetLength;
- }
-
-
- { // Right branch
- size_t offsetLength = NCompactTrie::RightOffsetLen(flags);
- size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
- rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr;
-
- dataPos += offsetLength;
- }
-
- data = nullptr;
- if (flags & NCompactTrie::MT_FINAL) {
- data = dataPos;
- }
- directChild = nullptr;
- if (flags & NCompactTrie::MT_NEXT) {
- directChild = dataPos;
- if (flags & NCompactTrie::MT_FINAL) {
- directChild += packer.SkipLeaf(directChild);
- }
- }
-
- return label;
-}
-
-Y_FORCE_INLINE bool Advance(
- const char*& dataPos,
- const char* const dataEnd,
- char label
-) {
- if (dataPos == nullptr) {
- return false;
- }
-
- while (dataPos < dataEnd) {
- size_t offsetLength, offset;
- const char* startPos = dataPos;
- char flags = *(dataPos++);
- char symbol = *(dataPos++);
- dataPos += sizeof(NCompactTrie::TSuffixLink);
-
- // Left branch
- offsetLength = NCompactTrie::LeftOffsetLen(flags);
- if ((unsigned char)label < (unsigned char)symbol) {
- offset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
- if (!offset)
- break;
-
- dataPos = startPos + offset;
- continue;
- }
-
- dataPos += offsetLength;
-
- // Right branch
- offsetLength = NCompactTrie::RightOffsetLen(flags);
- if ((unsigned char)label > (unsigned char)symbol) {
- offset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
- if (!offset)
- break;
-
- dataPos = startPos + offset;
- continue;
- }
-
- dataPos = startPos;
- return true;
- }
-
- // if we got here, we're past the dataend - bail out ASAP
- dataPos = nullptr;
- return false;
-}
-
-} // anonymous
-
-template <class T, class D, class S>
-void TCompactPatternSearcherBuilder<T, D, S>::CalculateSuffixLinks(
- char* trieStart,
- const char* trieEnd
-) const {
- struct TBfsElement {
- char* Node;
- const char* Parent;
-
- TBfsElement(char* node, const char* parent)
- : Node(node)
- , Parent(parent)
- {
- }
- };
-
- TDeque<TBfsElement> bfsQueue;
- if (trieStart && trieStart != trieEnd) {
- bfsQueue.emplace_back(trieStart, nullptr);
- }
-
- while (!bfsQueue.empty()) {
- auto front = bfsQueue.front();
- char* node = front.Node;
- const char* parent = front.Parent;
- bfsQueue.pop_front();
-
- char* leftSibling;
- char* rightSibling;
- char* directChild;
- NCompactTrie::TSuffixLink* suffixLink;
-
- char label = ReadNode<TData, TPacker>(
- node,
- leftSibling,
- rightSibling,
- directChild,
- suffixLink
- );
-
- const char* suffix;
-
- if (parent == nullptr) {
- suffix = node;
- } else {
- const char* parentOfSuffix = parent;
- const char* temp;
- do {
- NCompactTrie::TSuffixLink parentOfSuffixSuffixLink;
-
- ReadNodeConst<TData, TPacker>(
- parentOfSuffix,
- /*left*/temp,
- /*right*/temp,
- /*direct*/temp,
- /*data*/temp,
- parentOfSuffixSuffixLink
- );
- if (parentOfSuffixSuffixLink.NextSuffixOffset == 0) {
- suffix = trieStart;
- if (!Advance(suffix, trieEnd, label)) {
- suffix = node;
- }
- break;
- }
- parentOfSuffix += parentOfSuffixSuffixLink.NextSuffixOffset;
-
- NCompactTrie::TSuffixLink tempSuffixLink;
- ReadNodeConst<TData, TPacker>(
- parentOfSuffix,
- /*left*/temp,
- /*right*/temp,
- /*direct*/suffix,
- /*data*/temp,
- tempSuffixLink
- );
-
- if (suffix == nullptr) {
- continue;
- }
- } while (!Advance(suffix, trieEnd, label));
- }
-
- suffixLink->NextSuffixOffset = suffix - node;
-
- NCompactTrie::TSuffixLink suffixSuffixLink;
- const char* suffixData;
- const char* temp;
- ReadNodeConst<TData, TPacker>(
- suffix,
- /*left*/temp,
- /*right*/temp,
- /*direct*/temp,
- suffixData,
- suffixSuffixLink
- );
- suffixLink->NextSuffixWithDataOffset = suffix - node;
- if (suffixData == nullptr) {
- suffixLink->NextSuffixWithDataOffset += suffixSuffixLink.NextSuffixWithDataOffset;
- }
-
- if (directChild) {
- bfsQueue.emplace_back(directChild, node);
- }
-
- if (leftSibling) {
- bfsQueue.emplace_front(leftSibling, parent);
- }
-
- if (rightSibling) {
- bfsQueue.emplace_front(rightSibling, parent);
- }
- }
-}
-
-
-template<class T, class D, class S>
-TVector<TPatternMatch<D>> TCompactPatternSearcher<T, D, S>::SearchMatches(
- const TSymbol* text,
- size_t textSize
-) const {
- const char* temp;
- NCompactTrie::TSuffixLink tempSuffixLink;
-
- const auto& trieData = Trie.Data();
- const char* trieStart = trieData.AsCharPtr();
- size_t dataSize = trieData.Length();
- const char* trieEnd = trieStart + dataSize;
-
- const char* lastNode = nullptr;
- const char* currentSubtree = trieStart;
-
- TVector<TPatternMatch<TData>> matches;
-
- for (const TSymbol* position = text; position < text + textSize; ++position) {
- TSymbol symbol = *position;
- for (i64 i = (i64)NCompactTrie::ExtraBits<TSymbol>(); i >= 0; i -= 8) {
- char label = (char)(symbol >> i);
-
- // Find first suffix extendable by label
- while (true) {
- const char* nextLastNode = currentSubtree;
- if (Advance(nextLastNode, trieEnd, label)) {
- lastNode = nextLastNode;
- ReadNodeConst<TData, TPacker>(
- lastNode,
- /*left*/temp,
- /*right*/temp,
- currentSubtree,
- /*data*/temp,
- tempSuffixLink
- );
- break;
- } else {
- if (lastNode == nullptr) {
- break;
- }
- }
-
- NCompactTrie::TSuffixLink suffixLink;
- ReadNodeConst<TData, TPacker>(
- lastNode,
- /*left*/temp,
- /*right*/temp,
- /*direct*/temp,
- /*data*/temp,
- suffixLink
- );
- if (suffixLink.NextSuffixOffset == 0) {
- lastNode = nullptr;
- currentSubtree = trieStart;
- continue;
- }
- lastNode += suffixLink.NextSuffixOffset;
- ReadNodeConst<TData, TPacker>(
- lastNode,
- /*left*/temp,
- /*right*/temp,
- currentSubtree,
- /*data*/temp,
- tempSuffixLink
- );
- }
-
- // Iterate through all suffixes
- const char* suffix = lastNode;
- while (suffix != nullptr) {
- const char* nodeData;
- NCompactTrie::TSuffixLink suffixLink;
- ReadNodeConst<TData, TPacker>(
- suffix,
- /*left*/temp,
- /*right*/temp,
- /*direct*/temp,
- nodeData,
- suffixLink
- );
- if (nodeData != nullptr) {
- TData data;
- Trie.GetPacker().UnpackLeaf(nodeData, data);
- matches.emplace_back(
- position - text,
- data
- );
- }
- if (suffixLink.NextSuffixOffset == 0) {
- break;
- }
- suffix += suffixLink.NextSuffixWithDataOffset;
- }
- }
- }
-
- return matches;
-}
+ }
+
+ bool Add(const TSymbol* key, size_t keyLength, const TData& value) {
+ return TBase::Impl->AddEntry(key, keyLength, value);
+ }
+ bool Add(const TKeyBuf& key, const TData& value) {
+ return Add(key.data(), key.size(), value);
+ }
+
+ bool Find(const TSymbol* key, size_t keyLength, TData* value) const {
+ return TBase::Impl->FindEntry(key, keyLength, value);
+ }
+ bool Find(const TKeyBuf& key, TData* value = nullptr) const {
+ return Find(key.data(), key.size(), value);
+ }
+
+ size_t Save(IOutputStream& os) const {
+ size_t trieSize = TBase::Impl->MeasureByteSize();
+ TBufferOutput serializedTrie(trieSize);
+ TBase::Impl->Save(serializedTrie);
+
+ auto serializedTrieBuffer = serializedTrie.Buffer();
+ CalculateSuffixLinks(
+ serializedTrieBuffer.Data(),
+ serializedTrieBuffer.Data() + serializedTrieBuffer.Size()
+ );
+
+ os.Write(serializedTrieBuffer.Data(), serializedTrieBuffer.Size());
+ return trieSize;
+ }
+
+ TBlob Save() const {
+ TBufferStream buffer;
+ Save(buffer);
+ return TBlob::FromStream(buffer);
+ }
+
+ size_t SaveToFile(const TString& fileName) const {
+ TFileOutput out(fileName);
+ return Save(out);
+ }
+
+ size_t MeasureByteSize() const {
+ return TBase::Impl->MeasureByteSize();
+ }
+
+private:
+ void CalculateSuffixLinks(char* trieStart, const char* trieEnd) const;
+
+protected:
+ class TCompactPatternSearcherBuilderImpl : public TBase::TCompactTrieBuilderImpl {
+ public:
+ typedef typename TBase::TCompactTrieBuilderImpl TImplBase;
+
+ TCompactPatternSearcherBuilderImpl(
+ TCompactTrieBuilderFlags flags = CTBF_NONE,
+ TPacker packer = TPacker(),
+ IAllocator* alloc = TDefaultAllocator::Instance()
+ ) : TImplBase(flags, packer, alloc) {
+ }
+
+ ui64 ArcMeasure(
+ const typename TImplBase::TArc* arc,
+ size_t leftSize,
+ size_t rightSize
+ ) const override {
+ using namespace NCompactTrie;
+
+ size_t coreSize = SYMBOL_SIZE + FLAGS_SIZE +
+ sizeof(TSuffixLink) +
+ this->NodeMeasureLeafValue(arc->Node);
+ size_t treeSize = this->NodeMeasureSubtree(arc->Node);
+
+ if (arc->Label.Length() > 0)
+ treeSize += (SYMBOL_SIZE + FLAGS_SIZE + sizeof(TSuffixLink)) *
+ (arc->Label.Length() - 1);
+
+ // Triple measurements are needed because the space needed to store the offset
+ // shall be added to the offset itself. Hence three iterations.
+ size_t leftOffsetSize = 0;
+ size_t rightOffsetSize = 0;
+ for (size_t iteration = 0; iteration < 3; ++iteration) {
+ leftOffsetSize = leftSize ? MeasureOffset(
+ coreSize + treeSize + leftOffsetSize + rightOffsetSize) : 0;
+ rightOffsetSize = rightSize ? MeasureOffset(
+ coreSize + treeSize + leftSize + leftOffsetSize + rightOffsetSize) : 0;
+ }
+
+ coreSize += leftOffsetSize + rightOffsetSize;
+ arc->LeftOffset = leftSize ? coreSize + treeSize : 0;
+ arc->RightOffset = rightSize ? coreSize + treeSize + leftSize : 0;
+
+ return coreSize + treeSize + leftSize + rightSize;
+ }
+
+ ui64 ArcSaveSelf(const typename TImplBase::TArc* arc, IOutputStream& os) const override {
+ using namespace NCompactTrie;
+
+ ui64 written = 0;
+
+ size_t leftOffsetSize = MeasureOffset(arc->LeftOffset);
+ size_t rightOffsetSize = MeasureOffset(arc->RightOffset);
+
+ size_t labelLen = arc->Label.Length();
+
+ for (size_t labelPos = 0; labelPos < labelLen; ++labelPos) {
+ char flags = 0;
+
+ if (labelPos == 0) {
+ flags |= (leftOffsetSize << MT_LEFTSHIFT);
+ flags |= (rightOffsetSize << MT_RIGHTSHIFT);
+ }
+
+ if (labelPos == labelLen - 1) {
+ if (arc->Node->IsFinal())
+ flags |= MT_FINAL;
+ if (!arc->Node->IsLast())
+ flags |= MT_NEXT;
+ } else {
+ flags |= MT_NEXT;
+ }
+
+ os.Write(&flags, 1);
+ os.Write(&arc->Label.AsCharPtr()[labelPos], 1);
+ written += 2;
+
+ TSuffixLink suffixlink;
+ os.Write(&suffixlink, sizeof(TSuffixLink));
+ written += sizeof(TSuffixLink);
+
+ if (labelPos == 0) {
+ written += ArcSaveOffset(arc->LeftOffset, os);
+ written += ArcSaveOffset(arc->RightOffset, os);
+ }
+ }
+
+ written += this->NodeSaveLeafValue(arc->Node, os);
+ return written;
+ }
+ };
+};
+
+
+template <class T>
+struct TPatternMatch {
+ ui64 End;
+ T Data;
+
+ TPatternMatch(ui64 end, const T& data)
+ : End(end)
+ , Data(data)
+ {
+ }
+};
+
+
+template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
+class TCompactPatternSearcher {
+public:
+ typedef T TSymbol;
+ typedef D TData;
+ typedef S TPacker;
+
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
+
+ typedef TCompactTrie<TSymbol, TData, TPacker> TTrie;
+public:
+ TCompactPatternSearcher()
+ {
+ }
+
+ explicit TCompactPatternSearcher(const TBlob& data)
+ : Trie(data)
+ {
+ }
+
+ TCompactPatternSearcher(const char* data, size_t size)
+ : Trie(data, size)
+ {
+ }
+
+ TVector<TPatternMatch<TData>> SearchMatches(const TSymbol* text, size_t textSize) const;
+ TVector<TPatternMatch<TData>> SearchMatches(const TKeyBuf& text) const {
+ return SearchMatches(text.data(), text.size());
+ }
+private:
+ TTrie Trie;
+};
+
+////////////////////
+// Implementation //
+////////////////////
+
+namespace {
+
+template <class TData, class TPacker>
+char ReadNode(
+ char* nodeStart,
+ char*& leftSibling,
+ char*& rightSibling,
+ char*& directChild,
+ NCompactTrie::TSuffixLink*& suffixLink,
+ TPacker packer = TPacker()
+) {
+ char* dataPos = nodeStart;
+ char flags = *(dataPos++);
+
+ Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed
+
+ char label = *(dataPos++);
+
+ suffixLink = (NCompactTrie::TSuffixLink*)dataPos;
+ dataPos += sizeof(NCompactTrie::TSuffixLink);
+
+ { // Left branch
+ size_t offsetLength = NCompactTrie::LeftOffsetLen(flags);
+ size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
+ leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr;
+
+ dataPos += offsetLength;
+ }
+
+
+ { // Right branch
+ size_t offsetLength = NCompactTrie::RightOffsetLen(flags);
+ size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
+ rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr;
+
+ dataPos += offsetLength;
+ }
+
+ directChild = nullptr;
+ if (flags & NCompactTrie::MT_NEXT) {
+ directChild = dataPos;
+ if (flags & NCompactTrie::MT_FINAL) {
+ directChild += packer.SkipLeaf(directChild);
+ }
+ }
+
+ return label;
+}
+
+template <class TData, class TPacker>
+char ReadNodeConst(
+ const char* nodeStart,
+ const char*& leftSibling,
+ const char*& rightSibling,
+ const char*& directChild,
+ const char*& data,
+ NCompactTrie::TSuffixLink& suffixLink,
+ TPacker packer = TPacker()
+) {
+ const char* dataPos = nodeStart;
+ char flags = *(dataPos++);
+
+ Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed
+
+ char label = *(dataPos++);
+
+ suffixLink = *((NCompactTrie::TSuffixLink*)dataPos);
+ dataPos += sizeof(NCompactTrie::TSuffixLink);
+
+ { // Left branch
+ size_t offsetLength = NCompactTrie::LeftOffsetLen(flags);
+ size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
+ leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr;
+
+ dataPos += offsetLength;
+ }
+
+
+ { // Right branch
+ size_t offsetLength = NCompactTrie::RightOffsetLen(flags);
+ size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
+ rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr;
+
+ dataPos += offsetLength;
+ }
+
+ data = nullptr;
+ if (flags & NCompactTrie::MT_FINAL) {
+ data = dataPos;
+ }
+ directChild = nullptr;
+ if (flags & NCompactTrie::MT_NEXT) {
+ directChild = dataPos;
+ if (flags & NCompactTrie::MT_FINAL) {
+ directChild += packer.SkipLeaf(directChild);
+ }
+ }
+
+ return label;
+}
+
+Y_FORCE_INLINE bool Advance(
+ const char*& dataPos,
+ const char* const dataEnd,
+ char label
+) {
+ if (dataPos == nullptr) {
+ return false;
+ }
+
+ while (dataPos < dataEnd) {
+ size_t offsetLength, offset;
+ const char* startPos = dataPos;
+ char flags = *(dataPos++);
+ char symbol = *(dataPos++);
+ dataPos += sizeof(NCompactTrie::TSuffixLink);
+
+ // Left branch
+ offsetLength = NCompactTrie::LeftOffsetLen(flags);
+ if ((unsigned char)label < (unsigned char)symbol) {
+ offset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
+ if (!offset)
+ break;
+
+ dataPos = startPos + offset;
+ continue;
+ }
+
+ dataPos += offsetLength;
+
+ // Right branch
+ offsetLength = NCompactTrie::RightOffsetLen(flags);
+ if ((unsigned char)label > (unsigned char)symbol) {
+ offset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
+ if (!offset)
+ break;
+
+ dataPos = startPos + offset;
+ continue;
+ }
+
+ dataPos = startPos;
+ return true;
+ }
+
+ // if we got here, we're past the dataend - bail out ASAP
+ dataPos = nullptr;
+ return false;
+}
+
+} // anonymous
+
+template <class T, class D, class S>
+void TCompactPatternSearcherBuilder<T, D, S>::CalculateSuffixLinks(
+ char* trieStart,
+ const char* trieEnd
+) const {
+ struct TBfsElement {
+ char* Node;
+ const char* Parent;
+
+ TBfsElement(char* node, const char* parent)
+ : Node(node)
+ , Parent(parent)
+ {
+ }
+ };
+
+ TDeque<TBfsElement> bfsQueue;
+ if (trieStart && trieStart != trieEnd) {
+ bfsQueue.emplace_back(trieStart, nullptr);
+ }
+
+ while (!bfsQueue.empty()) {
+ auto front = bfsQueue.front();
+ char* node = front.Node;
+ const char* parent = front.Parent;
+ bfsQueue.pop_front();
+
+ char* leftSibling;
+ char* rightSibling;
+ char* directChild;
+ NCompactTrie::TSuffixLink* suffixLink;
+
+ char label = ReadNode<TData, TPacker>(
+ node,
+ leftSibling,
+ rightSibling,
+ directChild,
+ suffixLink
+ );
+
+ const char* suffix;
+
+ if (parent == nullptr) {
+ suffix = node;
+ } else {
+ const char* parentOfSuffix = parent;
+ const char* temp;
+ do {
+ NCompactTrie::TSuffixLink parentOfSuffixSuffixLink;
+
+ ReadNodeConst<TData, TPacker>(
+ parentOfSuffix,
+ /*left*/temp,
+ /*right*/temp,
+ /*direct*/temp,
+ /*data*/temp,
+ parentOfSuffixSuffixLink
+ );
+ if (parentOfSuffixSuffixLink.NextSuffixOffset == 0) {
+ suffix = trieStart;
+ if (!Advance(suffix, trieEnd, label)) {
+ suffix = node;
+ }
+ break;
+ }
+ parentOfSuffix += parentOfSuffixSuffixLink.NextSuffixOffset;
+
+ NCompactTrie::TSuffixLink tempSuffixLink;
+ ReadNodeConst<TData, TPacker>(
+ parentOfSuffix,
+ /*left*/temp,
+ /*right*/temp,
+ /*direct*/suffix,
+ /*data*/temp,
+ tempSuffixLink
+ );
+
+ if (suffix == nullptr) {
+ continue;
+ }
+ } while (!Advance(suffix, trieEnd, label));
+ }
+
+ suffixLink->NextSuffixOffset = suffix - node;
+
+ NCompactTrie::TSuffixLink suffixSuffixLink;
+ const char* suffixData;
+ const char* temp;
+ ReadNodeConst<TData, TPacker>(
+ suffix,
+ /*left*/temp,
+ /*right*/temp,
+ /*direct*/temp,
+ suffixData,
+ suffixSuffixLink
+ );
+ suffixLink->NextSuffixWithDataOffset = suffix - node;
+ if (suffixData == nullptr) {
+ suffixLink->NextSuffixWithDataOffset += suffixSuffixLink.NextSuffixWithDataOffset;
+ }
+
+ if (directChild) {
+ bfsQueue.emplace_back(directChild, node);
+ }
+
+ if (leftSibling) {
+ bfsQueue.emplace_front(leftSibling, parent);
+ }
+
+ if (rightSibling) {
+ bfsQueue.emplace_front(rightSibling, parent);
+ }
+ }
+}
+
+
+template<class T, class D, class S>
+TVector<TPatternMatch<D>> TCompactPatternSearcher<T, D, S>::SearchMatches(
+ const TSymbol* text,
+ size_t textSize
+) const {
+ const char* temp;
+ NCompactTrie::TSuffixLink tempSuffixLink;
+
+ const auto& trieData = Trie.Data();
+ const char* trieStart = trieData.AsCharPtr();
+ size_t dataSize = trieData.Length();
+ const char* trieEnd = trieStart + dataSize;
+
+ const char* lastNode = nullptr;
+ const char* currentSubtree = trieStart;
+
+ TVector<TPatternMatch<TData>> matches;
+
+ for (const TSymbol* position = text; position < text + textSize; ++position) {
+ TSymbol symbol = *position;
+ for (i64 i = (i64)NCompactTrie::ExtraBits<TSymbol>(); i >= 0; i -= 8) {
+ char label = (char)(symbol >> i);
+
+ // Find first suffix extendable by label
+ while (true) {
+ const char* nextLastNode = currentSubtree;
+ if (Advance(nextLastNode, trieEnd, label)) {
+ lastNode = nextLastNode;
+ ReadNodeConst<TData, TPacker>(
+ lastNode,
+ /*left*/temp,
+ /*right*/temp,
+ currentSubtree,
+ /*data*/temp,
+ tempSuffixLink
+ );
+ break;
+ } else {
+ if (lastNode == nullptr) {
+ break;
+ }
+ }
+
+ NCompactTrie::TSuffixLink suffixLink;
+ ReadNodeConst<TData, TPacker>(
+ lastNode,
+ /*left*/temp,
+ /*right*/temp,
+ /*direct*/temp,
+ /*data*/temp,
+ suffixLink
+ );
+ if (suffixLink.NextSuffixOffset == 0) {
+ lastNode = nullptr;
+ currentSubtree = trieStart;
+ continue;
+ }
+ lastNode += suffixLink.NextSuffixOffset;
+ ReadNodeConst<TData, TPacker>(
+ lastNode,
+ /*left*/temp,
+ /*right*/temp,
+ currentSubtree,
+ /*data*/temp,
+ tempSuffixLink
+ );
+ }
+
+ // Iterate through all suffixes
+ const char* suffix = lastNode;
+ while (suffix != nullptr) {
+ const char* nodeData;
+ NCompactTrie::TSuffixLink suffixLink;
+ ReadNodeConst<TData, TPacker>(
+ suffix,
+ /*left*/temp,
+ /*right*/temp,
+ /*direct*/temp,
+ nodeData,
+ suffixLink
+ );
+ if (nodeData != nullptr) {
+ TData data;
+ Trie.GetPacker().UnpackLeaf(nodeData, data);
+ matches.emplace_back(
+ position - text,
+ data
+ );
+ }
+ if (suffixLink.NextSuffixOffset == 0) {
+ break;
+ }
+ suffix += suffixLink.NextSuffixWithDataOffset;
+ }
+ }
+ }
+
+ return matches;
+}
diff --git a/library/cpp/containers/comptrie/search_iterator.h b/library/cpp/containers/comptrie/search_iterator.h
index 247f7e5936..90a831c268 100644
--- a/library/cpp/containers/comptrie/search_iterator.h
+++ b/library/cpp/containers/comptrie/search_iterator.h
@@ -1,12 +1,12 @@
#pragma once
#include "comptrie_trie.h"
-#include "first_symbol_iterator.h"
-
-#include <util/str_stl.h>
-#include <util/digest/numeric.h>
-#include <util/digest/multi.h>
+#include "first_symbol_iterator.h"
+#include <util/str_stl.h>
+#include <util/digest/numeric.h>
+#include <util/digest/multi.h>
+
// Iterator for incremental searching.
// All Advance() methods shift the iterator using specifed key/char.
// The subsequent Advance() call starts searching from the previous state.
@@ -23,26 +23,26 @@ public:
using TSymbol = typename TTrie::TSymbol;
using TKeyBuf = typename TTrie::TKeyBuf;
- TSearchIterator() = default;
-
+ TSearchIterator() = default;
+
explicit TSearchIterator(const TTrie& trie)
: Trie(&trie)
, DataPos(trie.DataHolder.AsCharPtr())
, DataEnd(DataPos + trie.DataHolder.Length())
- , ValuePos(trie.EmptyValue)
- {
- }
-
- explicit TSearchIterator(const TTrie& trie, const TTrie& subTrie)
- : Trie(&trie)
- , DataPos(subTrie.Data().AsCharPtr())
- , DataEnd(trie.DataHolder.AsCharPtr() + trie.DataHolder.Length())
- , ValuePos(subTrie.EmptyValue)
+ , ValuePos(trie.EmptyValue)
{
}
+ explicit TSearchIterator(const TTrie& trie, const TTrie& subTrie)
+ : Trie(&trie)
+ , DataPos(subTrie.Data().AsCharPtr())
+ , DataEnd(trie.DataHolder.AsCharPtr() + trie.DataHolder.Length())
+ , ValuePos(subTrie.EmptyValue)
+ {
+ }
+
bool operator==(const TSearchIterator& other) const {
- Y_ASSERT(Trie && other.Trie);
+ Y_ASSERT(Trie && other.Trie);
return Trie == other.Trie &&
DataPos == other.DataPos &&
DataEnd == other.DataEnd &&
@@ -53,7 +53,7 @@ public:
}
inline bool Advance(TSymbol label) {
- Y_ASSERT(Trie);
+ Y_ASSERT(Trie);
if (DataPos == nullptr || DataPos >= DataEnd) {
return false;
}
@@ -64,14 +64,14 @@ public:
}
bool Advance(const TSymbol* key, size_t keylen);
bool GetValue(TData* value = nullptr) const;
- bool HasValue() const;
- inline size_t GetHash() const;
+ bool HasValue() const;
+ inline size_t GetHash() const;
private:
- const TTrie* Trie = nullptr;
- const char* DataPos = nullptr;
- const char* DataEnd = nullptr;
- const char* ValuePos = nullptr;
+ const TTrie* Trie = nullptr;
+ const char* DataPos = nullptr;
+ const char* DataEnd = nullptr;
+ const char* ValuePos = nullptr;
};
template <class TTrie>
@@ -79,18 +79,18 @@ inline TSearchIterator<TTrie> MakeSearchIterator(const TTrie& trie) {
return TSearchIterator<TTrie>(trie);
}
-template <class TTrie>
-struct THash<TSearchIterator<TTrie>> {
- inline size_t operator()(const TSearchIterator<TTrie>& item) {
- return item.GetHash();
- }
-};
-
+template <class TTrie>
+struct THash<TSearchIterator<TTrie>> {
+ inline size_t operator()(const TSearchIterator<TTrie>& item) {
+ return item.GetHash();
+ }
+};
+
//----------------------------------------------------------------------------
template <class TTrie>
bool TSearchIterator<TTrie>::Advance(const TSymbol* key, size_t keylen) {
- Y_ASSERT(Trie);
+ Y_ASSERT(Trie);
if (!key || DataPos == nullptr || DataPos >= DataEnd) {
return false;
}
@@ -112,7 +112,7 @@ bool TSearchIterator<TTrie>::Advance(const TSymbol* key, size_t keylen) {
template <class TTrie>
bool TSearchIterator<TTrie>::GetValue(TData* value) const {
- Y_ASSERT(Trie);
+ Y_ASSERT(Trie);
bool result = false;
if (value) {
if (ValuePos) {
@@ -122,19 +122,19 @@ bool TSearchIterator<TTrie>::GetValue(TData* value) const {
}
return result;
}
-
-template <class TTrie>
-bool TSearchIterator<TTrie>::HasValue() const {
- Y_ASSERT(Trie);
- return ValuePos;
-}
-
-template <class TTrie>
-inline size_t TSearchIterator<TTrie>::GetHash() const {
- Y_ASSERT(Trie);
+
+template <class TTrie>
+bool TSearchIterator<TTrie>::HasValue() const {
+ Y_ASSERT(Trie);
+ return ValuePos;
+}
+
+template <class TTrie>
+inline size_t TSearchIterator<TTrie>::GetHash() const {
+ Y_ASSERT(Trie);
return MultiHash(
static_cast<const void*>(Trie),
static_cast<const void*>(DataPos),
static_cast<const void*>(DataEnd),
static_cast<const void*>(ValuePos));
-}
+}