aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/containers/comptrie
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/containers/comptrie
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/containers/comptrie')
-rw-r--r--library/cpp/containers/comptrie/README.md232
-rw-r--r--library/cpp/containers/comptrie/array_with_size.h67
-rw-r--r--library/cpp/containers/comptrie/benchmark/main.cpp260
-rw-r--r--library/cpp/containers/comptrie/benchmark/ya.make14
-rw-r--r--library/cpp/containers/comptrie/chunked_helpers_trie.h218
-rw-r--r--library/cpp/containers/comptrie/comptrie.cpp8
-rw-r--r--library/cpp/containers/comptrie/comptrie.h4
-rw-r--r--library/cpp/containers/comptrie/comptrie_builder.cpp1
-rw-r--r--library/cpp/containers/comptrie/comptrie_builder.h159
-rw-r--r--library/cpp/containers/comptrie/comptrie_builder.inl1121
-rw-r--r--library/cpp/containers/comptrie/comptrie_impl.cpp39
-rw-r--r--library/cpp/containers/comptrie/comptrie_impl.h221
-rw-r--r--library/cpp/containers/comptrie/comptrie_packer.h21
-rw-r--r--library/cpp/containers/comptrie/comptrie_trie.h663
-rw-r--r--library/cpp/containers/comptrie/comptrie_ut.cpp1791
-rw-r--r--library/cpp/containers/comptrie/first_symbol_iterator.h61
-rw-r--r--library/cpp/containers/comptrie/key_selector.h29
-rw-r--r--library/cpp/containers/comptrie/leaf_skipper.h56
-rw-r--r--library/cpp/containers/comptrie/loader/loader.cpp1
-rw-r--r--library/cpp/containers/comptrie/loader/loader.h22
-rw-r--r--library/cpp/containers/comptrie/loader/loader_ut.cpp30
-rw-r--r--library/cpp/containers/comptrie/loader/ut/dummy.triebin0 -> 25 bytes
-rw-r--r--library/cpp/containers/comptrie/loader/ut/ya.make18
-rw-r--r--library/cpp/containers/comptrie/loader/ya.make15
-rw-r--r--library/cpp/containers/comptrie/make_fast_layout.cpp467
-rw-r--r--library/cpp/containers/comptrie/make_fast_layout.h20
-rw-r--r--library/cpp/containers/comptrie/minimize.cpp359
-rw-r--r--library/cpp/containers/comptrie/minimize.h29
-rw-r--r--library/cpp/containers/comptrie/node.cpp79
-rw-r--r--library/cpp/containers/comptrie/node.h80
-rw-r--r--library/cpp/containers/comptrie/opaque_trie_iterator.cpp231
-rw-r--r--library/cpp/containers/comptrie/opaque_trie_iterator.h266
-rw-r--r--library/cpp/containers/comptrie/pattern_searcher.h606
-rw-r--r--library/cpp/containers/comptrie/prefix_iterator.cpp1
-rw-r--r--library/cpp/containers/comptrie/prefix_iterator.h88
-rw-r--r--library/cpp/containers/comptrie/protopacker.h29
-rw-r--r--library/cpp/containers/comptrie/search_iterator.cpp1
-rw-r--r--library/cpp/containers/comptrie/search_iterator.h140
-rw-r--r--library/cpp/containers/comptrie/set.h40
-rw-r--r--library/cpp/containers/comptrie/ut/ya.make9
-rw-r--r--library/cpp/containers/comptrie/write_trie_backwards.cpp110
-rw-r--r--library/cpp/containers/comptrie/write_trie_backwards.h23
-rw-r--r--library/cpp/containers/comptrie/writeable_node.cpp96
-rw-r--r--library/cpp/containers/comptrie/writeable_node.h26
-rw-r--r--library/cpp/containers/comptrie/ya.make35
45 files changed, 7786 insertions, 0 deletions
diff --git a/library/cpp/containers/comptrie/README.md b/library/cpp/containers/comptrie/README.md
new file mode 100644
index 0000000000..43c298e2c8
--- /dev/null
+++ b/library/cpp/containers/comptrie/README.md
@@ -0,0 +1,232 @@
+Compact trie
+=============
+
+The comptrie library is a fast and very tightly packed
+implementation of a prefix tree (Sedgewick's T-trie, that is a ternary tree,
+see https://www.cs.princeton.edu/~rs/strings/paper.pdf,
+https://www.cs.upc.edu/~ps/downloads/tst/tst.html). It contains tools for creating, optimizing, and serializing trees, accessing by key, and performing
+various searches. Because it is template-based and performance-oriented, a significant
+part of the library consists of inline functions, so if you don't need all the
+features of the library, consider including a more specific header file instead of the top-level
+comptrie.h file.
+
+Description of the data structure
+---------------------------------
+
+A prefix tree is an implementation of the map data structure
+for cases when keys are sequences of characters. The nodes on this tree
+contain characters and values. The key that corresponds to a
+value is a sequence of all the characters on nodes along the path from the root to the
+node with this value. It follows that a tree node can have as many children as
+there are different characters, which is quite a lot. For compact storage and
+quick access, the children are ordered as a binary search tree, which should be
+balanced if possible (characters have a defined order that must be
+preserved). Thus, our tree is ternary: each node has a reference to a subtree
+of all children and two references to subtrees of siblings. One of these subtrees contains the
+younger siblings, and the other contains the elder siblings.
+
+The library implements tree optimization by merging identical subtrees, which means
+the tree becomes a DAG (Directed Acyclic Graph –
+an oriented graph without oriented cycles).
+
+The main class TCompactTrie is defined in comptrie_trie.h and is templatized:
+- The first parameter of the template is the character type. It should be an
+integer type, which means that arithmetical operations must be defined for it.
+- The second parameter of the template is the value type.
+- The third parameter is the packer class, which packs values in order to quickly and compactly
+serialize the value type to a continuous memory buffer, deserialize it
+back, and quickly determine its size using the pointer to the beginning of this
+memory buffer. Good packers have already been written for most types, and they are available in
+library/cpp/packers. For more information, please refer to the documentation for these packers.
+
+The set.h file defines a modification for cases when keys must be stored
+without values.
+
+When a tree is built from scratch, the value corresponding to an empty key is
+assigned to a single-character key '\0'. So in a tree with the 'char' character type,
+the empty key and the '\0' key are bound together. For a subtree received from
+a call to FindTails, this restriction no longer exists.
+
+Creating trees
+--------------
+
+Building a tree from a list of key-value pairs is performed by the
+TCompactTrieBuilder class described in the comptrie_builder.h file.
+
+This class allows you to add words to a tree one at a time, merge a complete
+subtree, and also use an unfinished tree as a map.
+
+An important optimization is the prefix-grouped mode when you need to add keys
+in a certain order (for details, see the comments in the header file). The resulting tree is compactly packed while keys are being added, and the memory consumption is approximately the same as for
+the completed tree. For the default mode, compact stacking is turned on at the
+very end, and the data consumes quite a lot of memory up until that point.
+
+Optimizing trees
+----------------
+
+After a tree is created, there are two optimizing operations that can be applied:
+ - Minimization to a DAG by merging equal subtrees.
+ - Fast memory layout.
+The functions that implement these operations are declared in the comptrie_builder.h file. The first
+optimization is implemented by the CompactTrieMinimize function, and the second is implemented by
+CompactTrieMakeFastLayout. You can perform both at once by calling the
+CompactTrieMinimizeAndMakeFastLayout function.
+
+### Minimization ###
+
+Minimization to a DAG requires quite a lot of time and a large amount of
+memory to store an array of subtrees, but it can reduce the size of the tree several
+times over (an example is a language model that has many low-frequency
+phrases with repeated last words and frequency counts). However, if you know
+in advance that there are no duplicate values in the tree, you don't need to waste time on it, since the minimization
+won't have any effect on the tree.
+
+### Fast memory layout ###
+
+The second optimization function results in fewer cache misses, but it causes the
+tree to grow in size. Our experience has shown a 5% gain
+in speed for some tries. The algorithm consumes about three times more memory than
+the amount required for the source tree. So if the machine has enough memory to
+assemble a tree, it does not neccessarily mean that it has enough memory to run
+the algorithm. To learn about the theory behind this algorithm, read the comments before the declaration of the CompactTrieMinimize function.
+
+Serializing trees
+-----------------
+
+The tree resides in memory as a sequence of nodes. Links to other nodes are always
+counted relative to the position of the current node. This allows you to save a
+tree to disk as it is and then re-load it using mmap(). The TCompactTrie class has the
+TBlob constructor for reading a tree from disk. The TCompactTrieBuilder class has
+Save/SaveToFile methods for writing a built tree to a stream or a file.
+
+Accessing trees
+---------------
+
+As a rule, all methods that accept a key as input have two variants:
+- One takes the key in the format: pointer to the beginning of the key, length.
+- The other takes a high-level type like TStringBuf.
+
+You can get a value for a key in the tree – TCompactTrie::Find returns
+false if there is no key, and TCompactTrie::Get throws an exception. You can use FindPrefix methods to find the longest key prefix in a tree and get the corresponding value for it.
+You can also use a single FindPhrases request to get values for all the beginnings of
+a phrase with a given word delimiter.
+
+An important operation that distinguishes a tree from a simple map is implemented in the FindTails method,
+which allows you to obtain a subtree consisting of all possible extensions of the
+given prefix.
+
+Iterators for trees
+-------------------
+
+First of all, there is a typical map iterator over all key-value pairs called
+TConstIterator. A tree has three methods that return it: Begin, End, and
+UpperBound. The latter takes a key as input and returns an iterator to the
+smallest key that is not smaller than the input key.
+
+The rest of the iterators are not so widely used, and thus are located in
+separate files.
+
+TPrefixIterator is defined in the prefix_iterator.h file. It allows
+iterations over all the prefixes of this key available in the tree.
+
+TSearchIterator is defined in the search_iterator.h file. It allows you to enter
+a key in a tree one character at a time and see where it ends up. The following character can
+be selected depending on the current result. You can also copy the iterator and
+proceed on two different paths. You can actually achieve the same result with
+repeated use of the FindTails method, but the authors of this iterator claim
+that they obtained a performance gain with it.
+
+Appendix. Memory implementation details
+---------------------------------------
+
+*If you are not going to modify the library, then you do not need to read further.*
+
+First, if the character type has a size larger than 1 byte, then all keys that use these characters are converted to byte strings in the big-endian way. This
+means that character bytes are written in a string from the most significant
+to the least significant from left to right. Thus it is reduced to the case when
+the character in use is 'char'.
+
+The tree resides in memory as a series of consecutive nodes. The nodes can have different
+sizes, so the only way to identify the boundaries of nodes is by passing the entire
+tree.
+
+### Node structure ###
+
+The structure of a node, as can be understood from thoughtfully reading the
+LeapByte function in Comptrie_impl.h, is the following:
+- The first byte is for service flags.
+- The second byte is a character (unless it is the ε-link type of node
+ described below, which has from 1 to 7 bytes of offset distance from the
+ beginning of this node to the content node, and nothing else).
+
+Thus, the size of any node is at least 2 bytes. All other elements of a node
+are optional. Next there is from 0 to 7 bytes of the packed offset from the beginning
+of this node to the beginning of the root node of a subtree with the younger
+siblings. It is followed by 0 to 7 bytes of the packed offset from the beginning of this
+node to the beginning of the root node of a subtree with the elder siblings.
+Next comes the packed value in this node. Its size is not limited, but you may
+recall that the packer allows you to quickly determine this size using a pointer
+to the beginning of the packed value. Then, if the service flags indicate
+that the tree has children, there is a root node of the subtree of children.
+
+The packed offset is restricted to 7 bytes, and this gives us a limit on the largest
+possible size of a tree. You need to study the packer code to understand
+the exact limit.
+
+All packed offsets are nonnegative, meaning that roots of subtrees with
+siblings and the node pointed to by the ε-link must be located
+strictly to the right of the current node in memory. This does not allow placement of
+finite state machines with oriented cycles in the comptrie. But it does allow you to
+effectively stack the comptrie from right to left.
+
+### Service flags ###
+
+The byte of service flags contains (as shown by the constants at the beginning of
+the comptrie_impl.h file):
+- 1 bit of MT_NEXT, indicating whether this node has children.
+- 1 bit of MT_FINAL, indicating if there is a value in this node.
+- 3 bits of MT_SIZEMASK, indicating the size of the packed offset to a subtree
+ with elder siblings.
+- 3 bits of MT_SIZEMASK << MT_LEFTSHIFT, indicating the size of the packed
+ offset to a subtree with younger siblings.
+If one of these subtrees is not present, then the size of the corresponding
+packed offset is 0, and vice versa.
+
+### ε-links ###
+
+These nodes only occur if we optimized a tree into a DAG and got two nodes with
+merged subtrees of children. Since the offset to the subtree of children can't be
+specified and the root of this subtree should lie just after the value, we have
+to add a node of the ε-link type, which contains the offset to the root subtree of
+children and nothing more. This applies to all nodes that have equal subtrees of children,
+except the rightmost node. The size of this offset is set in 3 bits of MT_SIZEMASK
+flags for a node.
+
+As the implementation of the IsEpsilonLink function in
+comptrie_impl.h demonstrates, the ε-link differs from other nodes in that it does not have the MT_NEXT flag or the MT_FINAL
+ flag, so it can always be
+identified by the flags. Of course, the best programming practice is to call the
+function itself instead of examining the flags.
+
+Note that the ε-link flags do not use the MT_SIZEMASK <<
+MT_LEFTSHIFT` bits, which allows us to start using ε-links for some other purpose.
+
+Pattern Searcher
+================
+
+This is an implementation of Aho-Corasick algorithm on compact trie structure.
+In order to create a pattern searcher one must fill a TCompactPatternSearcherBuilder
+with patterns and call SaveAsPatternSearcher or SaveToFileAsPatternSearcher.
+Then TCompactPatternSearcher must be created from the builder output.
+
+### Implementation details ###
+
+Aho-Corasick algorithm stores a suffix link in each node.
+A suffix link of a node is the offset (relative to this node) of the largest suffix
+of a string this node represents which is present in a trie.
+Current implementation also stores a shortcut link to the largest suffix
+for which the corresponding node in a trie is a final node.
+These two links are stored as NCompactTrie::TSuffixLink structure of two 64-bit
+integers.
+In a trie layout these links are stored for each node right after the two bytes
+containing service flags and a symbol.
diff --git a/library/cpp/containers/comptrie/array_with_size.h b/library/cpp/containers/comptrie/array_with_size.h
new file mode 100644
index 0000000000..36e61c7410
--- /dev/null
+++ b/library/cpp/containers/comptrie/array_with_size.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <util/generic/ptr.h>
+#include <util/generic/noncopyable.h>
+#include <util/generic/utility.h>
+#include <util/system/sys_alloc.h>
+
+template <typename T>
+class TArrayWithSizeHolder : TNonCopyable {
+ typedef TArrayWithSizeHolder<T> TThis;
+
+ T* Data;
+
+public:
+ TArrayWithSizeHolder()
+ : Data(nullptr)
+ {
+ }
+
+ ~TArrayWithSizeHolder() {
+ if (!Data)
+ return;
+ for (size_t i = 0; i < Size(); ++i) {
+ try {
+ Data[i].~T();
+ } catch (...) {
+ }
+ }
+ y_deallocate(((size_t*)Data) - 1);
+ }
+
+ void Swap(TThis& copy) {
+ DoSwap(Data, copy.Data);
+ }
+
+ void Resize(size_t newSize) {
+ if (newSize == Size())
+ return;
+ TThis copy;
+ copy.Data = (T*)(((size_t*)y_allocate(sizeof(size_t) + sizeof(T) * newSize)) + 1);
+ // does not handle constructor exceptions properly
+ for (size_t i = 0; i < Min(Size(), newSize); ++i) {
+ new (copy.Data + i) T(Data[i]);
+ }
+ for (size_t i = Min(Size(), newSize); i < newSize; ++i) {
+ new (copy.Data + i) T;
+ }
+ ((size_t*)copy.Data)[-1] = newSize;
+ Swap(copy);
+ }
+
+ size_t Size() const {
+ return Data ? ((size_t*)Data)[-1] : 0;
+ }
+
+ bool Empty() const {
+ return Size() == 0;
+ }
+
+ T* Get() {
+ return Data;
+ }
+
+ const T* Get() const {
+ return Data;
+ }
+};
diff --git a/library/cpp/containers/comptrie/benchmark/main.cpp b/library/cpp/containers/comptrie/benchmark/main.cpp
new file mode 100644
index 0000000000..6e42dad18a
--- /dev/null
+++ b/library/cpp/containers/comptrie/benchmark/main.cpp
@@ -0,0 +1,260 @@
+#include <library/cpp/testing/benchmark/bench.h>
+
+#include <library/cpp/containers/comptrie/comptrie_trie.h>
+#include <library/cpp/containers/comptrie/comptrie_builder.h>
+#include <library/cpp/containers/comptrie/search_iterator.h>
+#include <library/cpp/containers/comptrie/pattern_searcher.h>
+
+#include <library/cpp/on_disk/aho_corasick/writer.h>
+#include <library/cpp/on_disk/aho_corasick/reader.h>
+#include <library/cpp/on_disk/aho_corasick/helpers.h>
+
+#include <library/cpp/containers/dense_hash/dense_hash.h>
+
+#include <util/stream/file.h>
+#include <util/generic/algorithm.h>
+#include <util/random/fast.h>
+#include <util/random/shuffle.h>
+
+/////////////////
+// COMMON DATA //
+/////////////////
+
+const size_t MAX_PATTERN_LENGTH = 11;
+
+TVector<TString> letters = {
+ "а", "б", "в", "г", "д", "е", "ё", "ж", "з", "и", "й",
+ "к", "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф",
+ "х", "ц", "ч", "ж", "щ", "ъ", "ы", "ь", "э", "ю", "я"
+};
+
+TString GenerateOneString(
+ TFastRng<ui64>& rng,
+ size_t maxLength,
+ const TVector<TString>& sequences
+) {
+ size_t length = rng.GenRand() % maxLength + 1;
+ TString result;
+ while (result.size() < length) {
+ result += sequences[rng.GenRand() % sequences.size()];
+ }
+ return result;
+}
+
+TVector<TString> GenerateStrings(
+ TFastRng<ui64>& rng,
+ size_t num,
+ size_t maxLength,
+ const TVector<TString>& sequences
+) {
+ TVector<TString> strings;
+ while (strings.size() < num) {
+ strings.push_back(GenerateOneString(rng, maxLength, sequences));
+ }
+ return strings;
+}
+
+struct TDatasetInstance {
+ TDatasetInstance(const TVector<TString>& sequences) {
+ TFastRng<ui64> rng(0);
+
+ TVector<TString> prefixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences);
+ prefixes.push_back("");
+
+ TVector<TString> roots = GenerateStrings(rng, /*num*/1000, /*maxLength*/5, sequences);
+
+ TVector<TString> suffixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences);
+ suffixes.push_back("");
+
+ TVector<TString> dictionary;
+ for (const auto& root : roots) {
+ for (const auto& prefix : prefixes) {
+ for (const auto& suffix : suffixes) {
+ dictionary.push_back(prefix + root + suffix);
+ Y_ASSERT(dictionary.back().size() < MAX_PATTERN_LENGTH);
+ }
+ }
+ }
+ Shuffle(dictionary.begin(), dictionary.end());
+
+ Patterns.assign(dictionary.begin(), dictionary.begin() + 10'000);
+
+ for (size_t sampleIdx = 0; sampleIdx < /*samplesNum*/1'000'000; ++sampleIdx) {
+ Samples.emplace_back();
+ size_t wordsNum = rng.GenRand() % 10;
+ for (size_t wordIdx = 0; wordIdx < wordsNum; ++wordIdx) {
+ if (wordIdx > 0) {
+ Samples.back() += " ";
+ }
+ Samples.back() += dictionary[rng.GenRand() % dictionary.size()];
+ }
+ }
+ };
+
+ TString GetSample(size_t iteration) const {
+ TFastRng<ui64> rng(iteration);
+ return Samples[rng.GenRand() % Samples.size()];
+ }
+
+
+ TVector<TString> Patterns;
+ TVector<TString> Samples;
+};
+
+static const TDatasetInstance dataset(letters);
+
+//////////////////////////
+// NEW PATTERN SEARCHER //
+//////////////////////////
+
+struct TPatternSearcherInstance {
+ TPatternSearcherInstance() {
+ TCompactPatternSearcherBuilder<char, ui32> builder;
+
+ for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
+ builder.Add(dataset.Patterns[patternId], patternId);
+ }
+
+ TBufferOutput buffer;
+ builder.Save(buffer);
+
+ Instance.Reset(
+ new TCompactPatternSearcher<char, ui32>(
+ buffer.Buffer().Data(),
+ buffer.Buffer().Size()
+ )
+ );
+ }
+
+ THolder<TCompactPatternSearcher<char, ui32>> Instance;
+};
+
+static const TPatternSearcherInstance patternSearcherInstance;
+
+Y_CPU_BENCHMARK(PatternSearcher, iface) {
+ TVector<TVector<std::pair<ui32, ui32>>> result;
+ for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
+ result.emplace_back();
+ TString testString = dataset.GetSample(iteration);
+ auto matches = patternSearcherInstance.Instance->SearchMatches(testString);
+ for (auto& match : matches) {
+ result.back().emplace_back(match.End, match.Data);
+ }
+ }
+}
+
+//////////////////////
+// OLD AHO CORASICK //
+//////////////////////
+
+struct TAhoCorasickInstance {
+ TAhoCorasickInstance() {
+ TAhoCorasickBuilder<TString, ui32> builder;
+
+ for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
+ builder.AddString(dataset.Patterns[patternId], patternId);
+ }
+
+ TBufferOutput buffer;
+ builder.SaveToStream(&buffer);
+
+ Instance.Reset(new TDefaultMappedAhoCorasick(TBlob::FromBuffer(buffer.Buffer())));
+ };
+
+ THolder<TDefaultMappedAhoCorasick> Instance;
+};
+
+static const TAhoCorasickInstance ahoCorasickInstance;
+
+Y_CPU_BENCHMARK(AhoCorasick, iface) {
+ TVector<TDeque<std::pair<ui32, ui32>>> result;
+ for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
+ result.emplace_back();
+ TString testString = dataset.GetSample(iteration);
+ auto matches = ahoCorasickInstance.Instance->AhoSearch(testString);
+ result.push_back(matches);
+ }
+}
+
+////////////////////////////////
+// COMPTRIE + SIMPLE MATCHING //
+////////////////////////////////
+
+struct TCompactTrieInstance {
+ TCompactTrieInstance() {
+ TCompactTrieBuilder<char, ui32> builder;
+
+ for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
+ builder.Add(dataset.Patterns[patternId], patternId);
+ }
+
+
+ TBufferOutput buffer;
+ CompactTrieMinimizeAndMakeFastLayout(buffer, builder);
+
+ Instance.Reset(new TCompactTrie<char, ui32>(
+ buffer.Buffer().Data(),
+ buffer.Buffer().Size()
+ ));
+ }
+
+ THolder<TCompactTrie<char, ui32>> Instance;
+};
+
+static const TCompactTrieInstance compactTrieInstance;
+
+Y_CPU_BENCHMARK(ComptrieSimple, iface) {
+ TVector<TVector<std::pair<ui32, ui32>>> result;
+ for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
+ result.emplace_back();
+ TString testString = dataset.GetSample(iteration);
+ for (ui32 startPos = 0; startPos < testString.size(); ++startPos) {
+ TSearchIterator<TCompactTrie<char, ui32>> iter(*(compactTrieInstance.Instance));
+ for (ui32 position = startPos; position < testString.size(); ++position) {
+ if (!iter.Advance(testString[position])) {
+ break;
+ }
+ ui32 answer;
+ if (iter.GetValue(&answer)) {
+ result.back().emplace_back(position, answer);
+ }
+ }
+ }
+ }
+}
+
+////////////////
+// DENSE_HASH //
+////////////////
+
+struct TDenseHashInstance {
+ TDenseHashInstance() {
+ for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
+ Instance[dataset.Patterns[patternId]] = patternId;
+ }
+ }
+
+ TDenseHash<TString, ui32> Instance;
+};
+
+static const TDenseHashInstance denseHashInstance;
+
+Y_CPU_BENCHMARK(DenseHash, iface) {
+ TVector<TVector<std::pair<ui32, ui32>>> result;
+ for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
+ result.emplace_back();
+ TString testString = dataset.GetSample(iteration);
+ for (size_t start = 0; start < testString.size(); ++start) {
+ for (
+ size_t length = 1;
+ length <= MAX_PATTERN_LENGTH && start + length <= testString.size();
+ ++length
+ ) {
+ auto value = denseHashInstance.Instance.find(testString.substr(start, length));
+ if (value != denseHashInstance.Instance.end()) {
+ result.back().emplace_back(start + length - 1, value->second);
+ }
+ }
+ }
+ }
+}
diff --git a/library/cpp/containers/comptrie/benchmark/ya.make b/library/cpp/containers/comptrie/benchmark/ya.make
new file mode 100644
index 0000000000..16fa19530d
--- /dev/null
+++ b/library/cpp/containers/comptrie/benchmark/ya.make
@@ -0,0 +1,14 @@
+Y_BENCHMARK()
+
+OWNER(smirnovpavel)
+
+SRCS(
+ main.cpp
+)
+
+PEERDIR(
+ library/cpp/containers/comptrie
+ util
+)
+
+END()
diff --git a/library/cpp/containers/comptrie/chunked_helpers_trie.h b/library/cpp/containers/comptrie/chunked_helpers_trie.h
new file mode 100644
index 0000000000..cfa35f5ba2
--- /dev/null
+++ b/library/cpp/containers/comptrie/chunked_helpers_trie.h
@@ -0,0 +1,218 @@
+#pragma once
+
+#include <library/cpp/on_disk/chunks/chunked_helpers.h>
+
+#include "comptrie.h"
+
+class TTrieSet {
+private:
+ TCompactTrie<char> Trie;
+
+public:
+ TTrieSet(const TBlob& blob)
+ : Trie(blob)
+ {
+ }
+
+ bool Has(const char* key) const {
+ return Trie.Find(key, strlen(key));
+ }
+
+ bool FindLongestPrefix(const char* key, size_t keylen, size_t* prefixLen) {
+ return Trie.FindLongestPrefix(key, keylen, prefixLen);
+ }
+};
+
+template <bool sorted = false>
+class TTrieSetWriter {
+private:
+ TCompactTrieBuilder<char> Builder;
+
+public:
+ TTrieSetWriter(bool isSorted = sorted)
+ : Builder(isSorted ? CTBF_PREFIX_GROUPED : CTBF_NONE)
+ {
+ }
+
+ void Add(const char* key, size_t keylen) {
+ Builder.Add(key, keylen, 0);
+ assert(Has(((TString)key).substr(0, keylen).data()));
+ }
+
+ void Add(const char* key) {
+ Add(key, strlen(key));
+ }
+
+ bool Has(const char* key) const {
+ ui64 dummy;
+ return Builder.Find(key, strlen(key), &dummy);
+ }
+
+ void Save(IOutputStream& out) const {
+ Builder.Save(out);
+ }
+
+ void Clear() {
+ Builder.Clear();
+ }
+};
+
+template <bool isWriter, bool sorted = false>
+struct TTrieSetG;
+
+template <bool sorted>
+struct TTrieSetG<false, sorted> {
+ typedef TTrieSet T;
+};
+
+template <bool sorted>
+struct TTrieSetG<true, sorted> {
+ typedef TTrieSetWriter<sorted> T;
+};
+
+template <typename T>
+class TTrieMap {
+private:
+ TCompactTrie<char> Trie;
+ static_assert(sizeof(T) <= sizeof(ui64), "expect sizeof(T) <= sizeof(ui64)");
+
+public:
+ TTrieMap(const TBlob& blob)
+ : Trie(blob)
+ {
+ }
+
+ bool Get(const char* key, T* value) const {
+ ui64 trieValue;
+ if (Trie.Find(key, strlen(key), &trieValue)) {
+ *value = ReadUnaligned<T>(&trieValue);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ T Get(const char* key, T def = T()) const {
+ ui64 trieValue;
+ if (Trie.Find(key, strlen(key), &trieValue)) {
+ return ReadUnaligned<T>(&trieValue);
+ } else {
+ return def;
+ }
+ }
+
+ const TCompactTrie<char>& GetTrie() const {
+ return Trie;
+ }
+};
+
+template <typename T, bool sorted = false>
+class TTrieMapWriter {
+private:
+ typedef TCompactTrieBuilder<char> TBuilder;
+ TBuilder Builder;
+ static_assert(sizeof(T) <= sizeof(ui64), "expect sizeof(T) <= sizeof(ui64)");
+#ifndef NDEBUG
+ bool IsSorted;
+#endif
+
+public:
+ TTrieMapWriter(bool isSorted = sorted)
+ : Builder(isSorted ? CTBF_PREFIX_GROUPED : CTBF_NONE)
+#ifndef NDEBUG
+ , IsSorted(isSorted)
+#endif
+ {
+ }
+
+ void Add(const char* key, const T& value) {
+ ui64 intValue = 0;
+ memcpy(&intValue, &value, sizeof(T));
+ Builder.Add(key, strlen(key), intValue);
+#ifndef NDEBUG
+ /*
+ if (!IsSorted) {
+ T test;
+ assert(Get(key, &test) && value == test);
+ }
+ */
+#endif
+ }
+
+ void Add(const TString& s, const T& value) {
+ ui64 intValue = 0;
+ memcpy(&intValue, &value, sizeof(T));
+ Builder.Add(s.data(), s.size(), intValue);
+ }
+
+ bool Get(const char* key, T* value) const {
+ ui64 trieValue;
+ if (Builder.Find(key, strlen(key), &trieValue)) {
+ *value = ReadUnaligned<T>(&trieValue);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ T Get(const char* key, T def = (T)0) const {
+ ui64 trieValue;
+ if (Builder.Find(key, strlen(key), &trieValue)) {
+ return ReadUnaligned<T>(&trieValue);
+ } else {
+ return def;
+ }
+ }
+
+ void Save(IOutputStream& out, bool minimize = false) const {
+ if (minimize) {
+ CompactTrieMinimize<TBuilder>(out, Builder, false);
+ } else {
+ Builder.Save(out);
+ }
+ }
+
+ void Clear() {
+ Builder.Clear();
+ }
+};
+
+template <typename T>
+class TTrieSortedMapWriter {
+private:
+ typedef std::pair<TString, T> TValue;
+ typedef TVector<TValue> TValues;
+ TValues Values;
+
+public:
+ TTrieSortedMapWriter() = default;
+
+ void Add(const char* key, const T& value) {
+ Values.push_back(TValue(key, value));
+ }
+
+ void Save(IOutputStream& out) {
+ Sort(Values.begin(), Values.end());
+ TTrieMapWriter<T, true> writer;
+ for (typename TValues::const_iterator toValue = Values.begin(); toValue != Values.end(); ++toValue)
+ writer.Add(toValue->first.data(), toValue->second);
+ writer.Save(out);
+ }
+
+ void Clear() {
+ Values.clear();
+ }
+};
+
+template <typename X, bool isWriter, bool sorted = false>
+struct TTrieMapG;
+
+template <typename X, bool sorted>
+struct TTrieMapG<X, false, sorted> {
+ typedef TTrieMap<X> T;
+};
+
+template <typename X, bool sorted>
+struct TTrieMapG<X, true, sorted> {
+ typedef TTrieMapWriter<X, sorted> T;
+};
diff --git a/library/cpp/containers/comptrie/comptrie.cpp b/library/cpp/containers/comptrie/comptrie.cpp
new file mode 100644
index 0000000000..4556e5b571
--- /dev/null
+++ b/library/cpp/containers/comptrie/comptrie.cpp
@@ -0,0 +1,8 @@
+#include "comptrie_impl.h"
+#include "comptrie.h"
+#include "array_with_size.h"
+#include "comptrie_trie.h"
+#include "comptrie_builder.h"
+#include "protopacker.h"
+#include "set.h"
+#include "chunked_helpers_trie.h"
diff --git a/library/cpp/containers/comptrie/comptrie.h b/library/cpp/containers/comptrie/comptrie.h
new file mode 100644
index 0000000000..f77024327e
--- /dev/null
+++ b/library/cpp/containers/comptrie/comptrie.h
@@ -0,0 +1,4 @@
+#pragma once
+
+#include "comptrie_trie.h"
+#include "comptrie_builder.h"
diff --git a/library/cpp/containers/comptrie/comptrie_builder.cpp b/library/cpp/containers/comptrie/comptrie_builder.cpp
new file mode 100644
index 0000000000..28a1e41dd2
--- /dev/null
+++ b/library/cpp/containers/comptrie/comptrie_builder.cpp
@@ -0,0 +1 @@
+#include "comptrie_builder.h"
diff --git a/library/cpp/containers/comptrie/comptrie_builder.h b/library/cpp/containers/comptrie/comptrie_builder.h
new file mode 100644
index 0000000000..cf7d2e39a3
--- /dev/null
+++ b/library/cpp/containers/comptrie/comptrie_builder.h
@@ -0,0 +1,159 @@
+#pragma once
+
+#include "comptrie_packer.h"
+#include "minimize.h"
+#include "key_selector.h"
+
+#include <util/stream/file.h>
+
+// --------------------------------------------------------------------------------------
+// Data Builder
+// To build the data buffer, we first create an automaton in memory. The automaton
+// is created incrementally. It actually helps a lot to have the input data prefix-grouped
+// by key; otherwise, memory consumption becomes a tough issue.
+// NOTE: building and serializing the automaton may be lengthy, and takes lots of memory.
+
+// PREFIX_GROUPED means that if we, while constructing a trie, add to the builder two keys with the same prefix,
+// then all the keys that we add between these two also have the same prefix.
+// Actually in this mode the builder can accept even more freely ordered input,
+// but for input as above it is guaranteed to work.
+enum ECompactTrieBuilderFlags {
+ CTBF_NONE = 0,
+ CTBF_PREFIX_GROUPED = 1 << 0,
+ CTBF_VERBOSE = 1 << 1,
+ CTBF_UNIQUE = 1 << 2,
+};
+
+using TCompactTrieBuilderFlags = ECompactTrieBuilderFlags;
+
+inline TCompactTrieBuilderFlags operator|(TCompactTrieBuilderFlags first, TCompactTrieBuilderFlags second) {
+ return static_cast<TCompactTrieBuilderFlags>(static_cast<int>(first) | second);
+}
+
+inline TCompactTrieBuilderFlags& operator|=(TCompactTrieBuilderFlags& first, TCompactTrieBuilderFlags second) {
+ return first = first | second;
+}
+
+template <typename T>
+class TArrayWithSizeHolder;
+
+template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
+class TCompactTrieBuilder {
+public:
+ typedef T TSymbol;
+ typedef D TData;
+ typedef S TPacker;
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
+
+ explicit TCompactTrieBuilder(TCompactTrieBuilderFlags flags = CTBF_NONE, TPacker packer = TPacker(), IAllocator* alloc = TDefaultAllocator::Instance());
+
+ // All Add.. methods return true if it was a new key, false if the key already existed.
+
+ bool Add(const TSymbol* key, size_t keylen, const TData& value);
+ bool Add(const TKeyBuf& key, const TData& value) {
+ return Add(key.data(), key.size(), value);
+ }
+
+ // add already serialized data
+ bool AddPtr(const TSymbol* key, size_t keylen, const char* data);
+ bool AddPtr(const TKeyBuf& key, const char* data) {
+ return AddPtr(key.data(), key.size(), data);
+ }
+
+ bool AddSubtreeInFile(const TSymbol* key, size_t keylen, const TString& filename);
+ bool AddSubtreeInFile(const TKeyBuf& key, const TString& filename) {
+ return AddSubtreeInFile(key.data(), key.size(), filename);
+ }
+
+ bool AddSubtreeInBuffer(const TSymbol* key, size_t keylen, TArrayWithSizeHolder<char>&& buffer);
+ bool AddSubtreeInBuffer(const TKeyBuf& key, TArrayWithSizeHolder<char>&& buffer) {
+ return AddSubtreeInBuffer(key.data(), key.size(), std::move(buffer));
+ }
+
+ bool Find(const TSymbol* key, size_t keylen, TData* value) const;
+ bool Find(const TKeyBuf& key, TData* value = nullptr) const {
+ return Find(key.data(), key.size(), value);
+ }
+
+ bool FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixLen, TData* value = nullptr) const;
+ bool FindLongestPrefix(const TKeyBuf& key, size_t* prefixLen, TData* value = nullptr) const {
+ return FindLongestPrefix(key.data(), key.size(), prefixLen, value);
+ }
+
+ size_t Save(IOutputStream& os) const;
+ size_t SaveAndDestroy(IOutputStream& os);
+ size_t SaveToFile(const TString& fileName) const {
+ TFixedBufferFileOutput out(fileName);
+ return Save(out);
+ }
+
+ void Clear(); // Returns all memory to the system and resets the builder state.
+
+ size_t GetEntryCount() const;
+ size_t GetNodeCount() const;
+
+ // Exact output file size in bytes.
+ size_t MeasureByteSize() const {
+ return Impl->MeasureByteSize();
+ }
+
+protected:
+ class TCompactTrieBuilderImpl;
+ THolder<TCompactTrieBuilderImpl> Impl;
+};
+
+//----------------------------------------------------------------------------------------------------------------------
+// Minimize the trie. The result is equivalent to the original
+// trie, except that it takes less space (and has marginally lower
+// performance, because of eventual epsilon links).
+// The algorithm is as follows: starting from the largest pieces, we find
+// nodes that have identical continuations (Daciuk's right language),
+// and repack the trie. Repacking is done in-place, so memory is less
+// of an issue; however, it may take considerable time.
+
+// IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
+// Because of non-local structure and epsilon links, it won't work
+// as you expect it to, and can destroy the trie in the making.
+// If you want both minimization and fast layout, do the minimization first.
+
+template <class TPacker>
+size_t CompactTrieMinimize(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker(), NCompactTrie::EMinimizeMode mode = NCompactTrie::MM_DEFAULT);
+
+template <class TTrieBuilder>
+size_t CompactTrieMinimize(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);
+
+//----------------------------------------------------------------------------------------------------------------
+// Lay the trie in memory in such a way that there are less cache misses when jumping from root to leaf.
+// The trie becomes about 2% larger, but the access became about 25% faster in our experiments.
+// Can be called on minimized and non-minimized tries, in the first case in requires half a trie more memory.
+// Calling it the second time on the same trie does nothing.
+//
+// The algorithm is based on van Emde Boas layout as described in the yandex data school lectures on external memory algoritms
+// by Maxim Babenko and Ivan Puzyrevsky. The difference is that when we cut the tree into levels
+// two nodes connected by a forward link are put into the same level (because they usually lie near each other in the original tree).
+// The original paper (describing the layout in Section 2.1) is:
+// Michael A. Bender, Erik D. Demaine, Martin Farach-Colton. Cache-Oblivious B-Trees
+// SIAM Journal on Computing, volume 35, number 2, 2005, pages 341-358.
+// Available on the web: http://erikdemaine.org/papers/CacheObliviousBTrees_SICOMP/
+// Or: Michael A. Bender, Erik D. Demaine, and Martin Farach-Colton. Cache-Oblivious B-Trees
+// Proceedings of the 41st Annual Symposium
+// on Foundations of Computer Science (FOCS 2000), Redondo Beach, California, November 12-14, 2000, pages 399-409.
+// Available on the web: http://erikdemaine.org/papers/FOCS2000b/
+// (there is not much difference between these papers, actually).
+//
+template <class TPacker>
+size_t CompactTrieMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker());
+
+template <class TTrieBuilder>
+size_t CompactTrieMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);
+
+// Composition of minimization and fast layout
+template <class TPacker>
+size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker());
+
+template <class TTrieBuilder>
+size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);
+
+// Implementation details moved here.
+#include "comptrie_builder.inl"
diff --git a/library/cpp/containers/comptrie/comptrie_builder.inl b/library/cpp/containers/comptrie/comptrie_builder.inl
new file mode 100644
index 0000000000..f273fa6571
--- /dev/null
+++ b/library/cpp/containers/comptrie/comptrie_builder.inl
@@ -0,0 +1,1121 @@
+#pragma once
+
+#include "comptrie_impl.h"
+#include "comptrie_trie.h"
+#include "make_fast_layout.h"
+#include "array_with_size.h"
+
+#include <library/cpp/containers/compact_vector/compact_vector.h>
+
+#include <util/memory/alloc.h>
+#include <util/memory/blob.h>
+#include <util/memory/pool.h>
+#include <util/memory/tempbuf.h>
+#include <util/memory/smallobj.h>
+#include <util/generic/algorithm.h>
+#include <util/generic/buffer.h>
+#include <util/generic/strbuf.h>
+
+#include <util/system/align.h>
+#include <util/stream/buffer.h>
+
+#define CONSTEXPR_MAX2(a, b) (a) > (b) ? (a) : (b)
+#define CONSTEXPR_MAX3(a, b, c) CONSTEXPR_MAX2(CONSTEXPR_MAX2(a, b), c)
+
+// TCompactTrieBuilder::TCompactTrieBuilderImpl
+
+template <class T, class D, class S>
+class TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl {
+protected:
+ TMemoryPool Pool;
+ size_t PayloadSize;
+ THolder<TFixedSizeAllocator> NodeAllocator;
+ class TNode;
+ class TArc;
+ TNode* Root;
+ TCompactTrieBuilderFlags Flags;
+ size_t EntryCount;
+ size_t NodeCount;
+ TPacker Packer;
+
+ enum EPayload {
+ DATA_ABSENT,
+ DATA_INSIDE,
+ DATA_MALLOCED,
+ DATA_IN_MEMPOOL,
+ };
+
+protected:
+ void ConvertSymbolArrayToChar(const TSymbol* key, size_t keylen, TTempBuf& buf, size_t ckeylen) const;
+ void NodeLinkTo(TNode* thiz, const TBlob& label, TNode* node);
+ TNode* NodeForwardAdd(TNode* thiz, const char* label, size_t len, size_t& passed, size_t* nodeCount);
+ bool FindEntryImpl(const char* key, size_t keylen, TData* value) const;
+ bool FindLongestPrefixImpl(const char* keyptr, size_t keylen, size_t* prefixLen, TData* value) const;
+
+ size_t NodeMeasureSubtree(TNode* thiz) const;
+ ui64 NodeSaveSubtree(TNode* thiz, IOutputStream& os) const;
+ ui64 NodeSaveSubtreeAndDestroy(TNode* thiz, IOutputStream& osy);
+ void NodeBufferSubtree(TNode* thiz);
+
+ size_t NodeMeasureLeafValue(TNode* thiz) const;
+ ui64 NodeSaveLeafValue(TNode* thiz, IOutputStream& os) const;
+
+ virtual ui64 ArcMeasure(const TArc* thiz, size_t leftsize, size_t rightsize) const;
+
+ virtual ui64 ArcSaveSelf(const TArc* thiz, IOutputStream& os) const;
+ ui64 ArcSave(const TArc* thiz, IOutputStream& os) const;
+ ui64 ArcSaveAndDestroy(const TArc* thiz, IOutputStream& os);
+
+public:
+ TCompactTrieBuilderImpl(TCompactTrieBuilderFlags flags, TPacker packer, IAllocator* alloc);
+ virtual ~TCompactTrieBuilderImpl();
+
+ void DestroyNode(TNode* node);
+ void NodeReleasePayload(TNode* thiz);
+
+ char* AddEntryForData(const TSymbol* key, size_t keylen, size_t dataLen, bool& isNewAddition);
+ TNode* AddEntryForSomething(const TSymbol* key, size_t keylen, bool& isNewAddition);
+
+ bool AddEntry(const TSymbol* key, size_t keylen, const TData& value);
+ bool AddEntryPtr(const TSymbol* key, size_t keylen, const char* value);
+ bool AddSubtreeInFile(const TSymbol* key, size_t keylen, const TString& fileName);
+ bool AddSubtreeInBuffer(const TSymbol* key, size_t keylen, TArrayWithSizeHolder<char>&& buffer);
+ bool FindEntry(const TSymbol* key, size_t keylen, TData* value) const;
+ bool FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixlen, TData* value) const;
+
+ size_t Save(IOutputStream& os) const;
+ size_t SaveAndDestroy(IOutputStream& os);
+
+ void Clear();
+
+ // lies if some key was added at least twice
+ size_t GetEntryCount() const;
+ size_t GetNodeCount() const;
+
+ size_t MeasureByteSize() const {
+ return NodeMeasureSubtree(Root);
+ }
+};
+
+template <class T, class D, class S>
+class TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TArc {
+public:
+ TBlob Label;
+ TNode* Node;
+ mutable size_t LeftOffset;
+ mutable size_t RightOffset;
+
+ TArc(const TBlob& lbl, TNode* nd);
+};
+
+template <class T, class D, class S>
+class TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode {
+public:
+ typedef typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl TBuilderImpl;
+ typedef typename TBuilderImpl::TArc TArc;
+
+ struct ISubtree {
+ virtual ~ISubtree() = default;
+ virtual bool IsLast() const = 0;
+ virtual ui64 Measure(const TBuilderImpl* builder) const = 0;
+ virtual ui64 Save(const TBuilderImpl* builder, IOutputStream& os) const = 0;
+ virtual ui64 SaveAndDestroy(TBuilderImpl* builder, IOutputStream& os) = 0;
+ virtual void Destroy(TBuilderImpl*) { }
+
+ // Tries to find key in subtree.
+ // Returns next node to find the key in (to avoid recursive calls)
+ // If it has end result, writes it to @value and @result arguments and returns nullptr
+ virtual const TNode* Find(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const = 0;
+ virtual const TNode* FindLongestPrefix(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const = 0;
+ };
+
+ class TArcSet: public ISubtree, public TCompactVector<TArc> {
+ public:
+ typedef typename TCompactVector<TArc>::iterator iterator;
+ typedef typename TCompactVector<TArc>::const_iterator const_iterator;
+
+ TArcSet() {
+ Y_ASSERT(reinterpret_cast<ISubtree*>(this) == static_cast<void*>(this)); // This assumption is used in TNode::Subtree()
+ }
+
+ iterator Find(char ch);
+ const_iterator Find(char ch) const;
+ void Add(const TBlob& s, TNode* node);
+
+ bool IsLast() const override {
+ return this->Empty();
+ }
+
+ const TNode* Find(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const override;
+ const TNode* FindLongestPrefix(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const override {
+ return Find(key, value, result, packer);
+ }
+
+ ui64 Measure(const TBuilderImpl* builder) const override {
+ return MeasureRange(builder, 0, this->size());
+ }
+
+ ui64 MeasureRange(const TBuilderImpl* builder, size_t from, size_t to) const {
+ if (from >= to)
+ return 0;
+
+ size_t median = (from + to) / 2;
+ size_t leftsize = (size_t)MeasureRange(builder, from, median);
+ size_t rightsize = (size_t)MeasureRange(builder, median + 1, to);
+
+ return builder->ArcMeasure(&(*this)[median], leftsize, rightsize);
+ }
+
+ ui64 Save(const TBuilderImpl* builder, IOutputStream& os) const override {
+ return SaveRange(builder, 0, this->size(), os);
+ }
+
+ ui64 SaveAndDestroy(TBuilderImpl* builder, IOutputStream& os) override {
+ ui64 result = SaveRangeAndDestroy(builder, 0, this->size(), os);
+ Destroy(builder);
+ return result;
+ }
+
+ ui64 SaveRange(const TBuilderImpl* builder, size_t from, size_t to, IOutputStream& os) const {
+ if (from >= to)
+ return 0;
+
+ size_t median = (from + to) / 2;
+
+ ui64 written = builder->ArcSave(&(*this)[median], os);
+ written += SaveRange(builder, from, median, os);
+ written += SaveRange(builder, median + 1, to, os);
+ return written;
+ }
+
+ ui64 SaveRangeAndDestroy(TBuilderImpl* builder, size_t from, size_t to, IOutputStream& os) {
+ if (from >= to)
+ return 0;
+
+ size_t median = (from + to) / 2;
+
+ ui64 written = builder->ArcSaveAndDestroy(&(*this)[median], os);
+ written += SaveRangeAndDestroy(builder, from, median, os);
+ written += SaveRangeAndDestroy(builder, median + 1, to, os);
+ return written;
+ }
+
+ void Destroy(TBuilderImpl* builder) override {
+ // Delete all nodes down the stream.
+ for (iterator it = this->begin(); it != this->end(); ++it) {
+ builder->DestroyNode(it->Node);
+ }
+ this->clear();
+ }
+
+ ~TArcSet() override {
+ Y_ASSERT(this->empty());
+ }
+
+ };
+
+ struct TBufferedSubtree: public ISubtree {
+ TArrayWithSizeHolder<char> Buffer;
+
+ TBufferedSubtree() {
+ Y_ASSERT(reinterpret_cast<ISubtree*>(this) == static_cast<void*>(this)); // This assumption is used in TNode::Subtree()
+ }
+
+ bool IsLast() const override {
+ return Buffer.Empty();
+ }
+
+ const TNode* Find(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const override {
+ if (Buffer.Empty()) {
+ result = false;
+ return nullptr;
+ }
+
+ TCompactTrie<char, D, S> trie(Buffer.Get(), Buffer.Size(), packer);
+ result = trie.Find(key.data(), key.size(), value);
+
+ return nullptr;
+ }
+
+ const TNode* FindLongestPrefix(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const override {
+ if (Buffer.Empty()) {
+ result = false;
+ return nullptr;
+ }
+
+ TCompactTrie<char, D, S> trie(Buffer.Get(), Buffer.Size(), packer);
+ size_t prefixLen = 0;
+ result = trie.FindLongestPrefix(key.data(), key.size(), &prefixLen, value);
+ key = key.SubStr(prefixLen);
+
+ return nullptr;
+ }
+
+ ui64 Measure(const TBuilderImpl*) const override {
+ return Buffer.Size();
+ }
+
+ ui64 Save(const TBuilderImpl*, IOutputStream& os) const override {
+ os.Write(Buffer.Get(), Buffer.Size());
+ return Buffer.Size();
+ }
+
+ ui64 SaveAndDestroy(TBuilderImpl* builder, IOutputStream& os) override {
+ ui64 result = Save(builder, os);
+ TArrayWithSizeHolder<char>().Swap(Buffer);
+ return result;
+ }
+ };
+
+ struct TSubtreeInFile: public ISubtree {
+ struct TData {
+ TString FileName;
+ ui64 Size;
+ };
+ THolder<TData> Data;
+
+ TSubtreeInFile(const TString& fileName) {
+ // stupid API
+ TFile file(fileName, RdOnly);
+ i64 size = file.GetLength();
+ if (size < 0)
+ ythrow yexception() << "unable to get file " << fileName.Quote() << " size for unknown reason";
+ Data.Reset(new TData);
+ Data->FileName = fileName;
+ Data->Size = size;
+
+ Y_ASSERT(reinterpret_cast<ISubtree*>(this) == static_cast<void*>(this)); // This assumption is used in TNode::Subtree()
+ }
+
+ bool IsLast() const override {
+ return Data->Size == 0;
+ }
+
+ const TNode* Find(TStringBuf& key, typename TCompactTrieBuilder::TData* value, bool& result, const TPacker& packer) const override {
+ if (!Data) {
+ result = false;
+ return nullptr;
+ }
+
+ TCompactTrie<char, D, S> trie(TBlob::FromFile(Data->FileName), packer);
+ result = trie.Find(key.data(), key.size(), value);
+ return nullptr;
+ }
+
+ const TNode* FindLongestPrefix(TStringBuf& key, typename TCompactTrieBuilder::TData* value, bool& result, const TPacker& packer) const override {
+ if (!Data) {
+ result = false;
+ return nullptr;
+ }
+
+ TCompactTrie<char, D, S> trie(TBlob::FromFile(Data->FileName), packer);
+ size_t prefixLen = 0;
+ result = trie.FindLongestPrefix(key.data(), key.size(), &prefixLen, value);
+ key = key.SubStr(prefixLen);
+
+ return nullptr;
+ }
+
+ ui64 Measure(const TBuilderImpl*) const override {
+ return Data->Size;
+ }
+
+ ui64 Save(const TBuilderImpl*, IOutputStream& os) const override {
+ TUnbufferedFileInput is(Data->FileName);
+ ui64 written = TransferData(&is, &os);
+ if (written != Data->Size)
+ ythrow yexception() << "file " << Data->FileName.Quote() << " size changed";
+ return written;
+ }
+
+ ui64 SaveAndDestroy(TBuilderImpl* builder, IOutputStream& os) override {
+ return Save(builder, os);
+ }
+ };
+
+ union {
+ char ArcsData[CONSTEXPR_MAX3(sizeof(TArcSet), sizeof(TBufferedSubtree), sizeof(TSubtreeInFile))];
+ union {
+ void* Data1;
+ long long int Data2;
+ } Aligner;
+ };
+
+ inline ISubtree* Subtree() {
+ return reinterpret_cast<ISubtree*>(ArcsData);
+ }
+
+ inline const ISubtree* Subtree() const {
+ return reinterpret_cast<const ISubtree*>(ArcsData);
+ }
+
+ EPayload PayloadType;
+
+ inline const char* PayloadPtr() const {
+ return ((const char*) this) + sizeof(TNode);
+ }
+
+ inline char* PayloadPtr() {
+ return ((char*) this) + sizeof(TNode);
+ }
+
+ // *Payload()
+ inline const char*& PayloadAsPtr() const {
+ const char** payload = (const char**) PayloadPtr();
+ return *payload;
+ }
+
+ inline char*& PayloadAsPtr() {
+ char** payload = (char**) PayloadPtr();
+ return *payload;
+ }
+
+ inline const char* GetPayload() const {
+ switch (PayloadType) {
+ case DATA_INSIDE:
+ return PayloadPtr();
+ case DATA_MALLOCED:
+ case DATA_IN_MEMPOOL:
+ return PayloadAsPtr();
+ case DATA_ABSENT:
+ default:
+ abort();
+ }
+ }
+
+ inline char* GetPayload() {
+ const TNode* thiz = this;
+ return const_cast<char*>(thiz->GetPayload()); // const_cast is to avoid copy-paste style
+ }
+
+ bool IsFinal() const {
+ return PayloadType != DATA_ABSENT;
+ }
+
+ bool IsLast() const {
+ return Subtree()->IsLast();
+ }
+
+ inline void* operator new(size_t, TFixedSizeAllocator& pool) {
+ return pool.Allocate();
+ }
+
+ inline void operator delete(void* ptr, TFixedSizeAllocator& pool) noexcept {
+ pool.Release(ptr);
+ }
+
+ TNode()
+ : PayloadType(DATA_ABSENT)
+ {
+ new (Subtree()) TArcSet;
+ }
+
+ ~TNode() {
+ Subtree()->~ISubtree();
+ Y_ASSERT(PayloadType == DATA_ABSENT);
+ }
+
+};
+
+// TCompactTrieBuilder
+
+template <class T, class D, class S>
+TCompactTrieBuilder<T, D, S>::TCompactTrieBuilder(TCompactTrieBuilderFlags flags, TPacker packer, IAllocator* alloc)
+ : Impl(new TCompactTrieBuilderImpl(flags, packer, alloc))
+{
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::Add(const TSymbol* key, size_t keylen, const TData& value) {
+ return Impl->AddEntry(key, keylen, value);
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::AddPtr(const TSymbol* key, size_t keylen, const char* value) {
+ return Impl->AddEntryPtr(key, keylen, value);
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::AddSubtreeInFile(const TSymbol* key, size_t keylen, const TString& fileName) {
+ return Impl->AddSubtreeInFile(key, keylen, fileName);
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::AddSubtreeInBuffer(const TSymbol* key, size_t keylen, TArrayWithSizeHolder<char>&& buffer) {
+ return Impl->AddSubtreeInBuffer(key, keylen, std::move(buffer));
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::Find(const TSymbol* key, size_t keylen, TData* value) const {
+ return Impl->FindEntry(key, keylen, value);
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::FindLongestPrefix(
+ const TSymbol* key, size_t keylen, size_t* prefixlen, TData* value) const {
+ return Impl->FindLongestPrefix(key, keylen, prefixlen, value);
+}
+
+template <class T, class D, class S>
+size_t TCompactTrieBuilder<T, D, S>::Save(IOutputStream& os) const {
+ return Impl->Save(os);
+}
+
+template <class T, class D, class S>
+size_t TCompactTrieBuilder<T, D, S>::SaveAndDestroy(IOutputStream& os) {
+ return Impl->SaveAndDestroy(os);
+}
+
+template <class T, class D, class S>
+void TCompactTrieBuilder<T, D, S>::Clear() {
+ Impl->Clear();
+}
+
+template <class T, class D, class S>
+size_t TCompactTrieBuilder<T, D, S>::GetEntryCount() const {
+ return Impl->GetEntryCount();
+}
+
+template <class T, class D, class S>
+size_t TCompactTrieBuilder<T, D, S>::GetNodeCount() const {
+ return Impl->GetNodeCount();
+}
+
+// TCompactTrieBuilder::TCompactTrieBuilderImpl
+
+template <class T, class D, class S>
+TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TCompactTrieBuilderImpl(TCompactTrieBuilderFlags flags, TPacker packer, IAllocator* alloc)
+ : Pool(1000000, TMemoryPool::TLinearGrow::Instance(), alloc)
+ , PayloadSize(sizeof(void*)) // XXX: find better value
+ , NodeAllocator(new TFixedSizeAllocator(sizeof(TNode) + PayloadSize, alloc))
+ , Flags(flags)
+ , EntryCount(0)
+ , NodeCount(1)
+ , Packer(packer)
+{
+ Root = new (*NodeAllocator) TNode;
+}
+
+template <class T, class D, class S>
+TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::~TCompactTrieBuilderImpl() {
+ DestroyNode(Root);
+}
+
+template <class T, class D, class S>
+void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::ConvertSymbolArrayToChar(
+ const TSymbol* key, size_t keylen, TTempBuf& buf, size_t buflen) const {
+ char* ckeyptr = buf.Data();
+
+ for (size_t i = 0; i < keylen; ++i) {
+ TSymbol label = key[i];
+ for (int j = (int)NCompactTrie::ExtraBits<TSymbol>(); j >= 0; j -= 8) {
+ Y_ASSERT(ckeyptr < buf.Data() + buflen);
+ *(ckeyptr++) = (char)(label >> j);
+ }
+ }
+
+ buf.Proceed(buflen);
+ Y_ASSERT(ckeyptr == buf.Data() + buf.Filled());
+}
+
+template <class T, class D, class S>
+void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::DestroyNode(TNode* thiz) {
+ thiz->Subtree()->Destroy(this);
+ NodeReleasePayload(thiz);
+ thiz->~TNode();
+ NodeAllocator->Release(thiz);
+}
+
+template <class T, class D, class S>
+void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeReleasePayload(TNode* thiz) {
+ switch (thiz->PayloadType) {
+ case DATA_ABSENT:
+ case DATA_INSIDE:
+ case DATA_IN_MEMPOOL:
+ break;
+ case DATA_MALLOCED:
+ delete[] thiz->PayloadAsPtr();
+ break;
+ default:
+ abort();
+ }
+ thiz->PayloadType = DATA_ABSENT;
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::AddEntry(
+ const TSymbol* key, size_t keylen, const TData& value) {
+ size_t datalen = Packer.MeasureLeaf(value);
+
+ bool isNewAddition = false;
+ char* place = AddEntryForData(key, keylen, datalen, isNewAddition);
+ Packer.PackLeaf(place, value, datalen);
+ return isNewAddition;
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::AddEntryPtr(
+ const TSymbol* key, size_t keylen, const char* value) {
+ size_t datalen = Packer.SkipLeaf(value);
+
+ bool isNewAddition = false;
+ char* place = AddEntryForData(key, keylen, datalen, isNewAddition);
+ memcpy(place, value, datalen);
+ return isNewAddition;
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::AddSubtreeInFile(
+ const TSymbol* key, size_t keylen, const TString& fileName) {
+ typedef typename TNode::ISubtree ISubtree;
+ typedef typename TNode::TSubtreeInFile TSubtreeInFile;
+
+ bool isNewAddition = false;
+ TNode* node = AddEntryForSomething(key, keylen, isNewAddition);
+ node->Subtree()->Destroy(this);
+ node->Subtree()->~ISubtree();
+
+ new (node->Subtree()) TSubtreeInFile(fileName);
+ return isNewAddition;
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::AddSubtreeInBuffer(
+ const TSymbol* key, size_t keylen, TArrayWithSizeHolder<char>&& buffer) {
+
+ typedef typename TNode::TBufferedSubtree TBufferedSubtree;
+
+ bool isNewAddition = false;
+ TNode* node = AddEntryForSomething(key, keylen, isNewAddition);
+ node->Subtree()->Destroy(this);
+ node->Subtree()->~ISubtree();
+
+ auto subtree = new (node->Subtree()) TBufferedSubtree();
+ subtree->Buffer.Swap(buffer);
+
+ return isNewAddition;
+}
+
+template <class T, class D, class S>
+typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode*
+ TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::AddEntryForSomething(
+ const TSymbol* key, size_t keylen, bool& isNewAddition) {
+ using namespace NCompactTrie;
+
+ EntryCount++;
+
+ if (Flags & CTBF_VERBOSE)
+ ShowProgress(EntryCount);
+
+ TNode* current = Root;
+ size_t passed;
+
+ // Special case of empty key: replace it by 1-byte "\0" key.
+ size_t ckeylen = keylen ? keylen * sizeof(TSymbol) : 1;
+ TTempBuf ckeybuf(ckeylen);
+ if (keylen == 0) {
+ ckeybuf.Append("\0", 1);
+ } else {
+ ConvertSymbolArrayToChar(key, keylen, ckeybuf, ckeylen);
+ }
+
+ char* ckey = ckeybuf.Data();
+
+ TNode* next;
+ while ((ckeylen > 0) && (next = NodeForwardAdd(current, ckey, ckeylen, passed, &NodeCount)) != nullptr) {
+ current = next;
+ ckeylen -= passed;
+ ckey += passed;
+ }
+
+ if (ckeylen != 0) {
+ //new leaf
+ NodeCount++;
+ TNode* leaf = new (*NodeAllocator) TNode();
+ NodeLinkTo(current, TBlob::Copy(ckey, ckeylen), leaf);
+ current = leaf;
+ }
+ isNewAddition = (current->PayloadType == DATA_ABSENT);
+ if ((Flags & CTBF_UNIQUE) && !isNewAddition)
+ ythrow yexception() << "Duplicate key";
+ return current;
+}
+
+template <class T, class D, class S>
+char* TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::AddEntryForData(const TSymbol* key, size_t keylen,
+ size_t datalen, bool& isNewAddition) {
+ TNode* current = AddEntryForSomething(key, keylen, isNewAddition);
+ NodeReleasePayload(current);
+ if (datalen <= PayloadSize) {
+ current->PayloadType = DATA_INSIDE;
+ } else if (Flags & CTBF_PREFIX_GROUPED) {
+ current->PayloadType = DATA_MALLOCED;
+ current->PayloadAsPtr() = new char[datalen];
+ } else {
+ current->PayloadType = DATA_IN_MEMPOOL;
+ current->PayloadAsPtr() = (char*) Pool.Allocate(datalen); // XXX: allocate unaligned
+ }
+ return current->GetPayload();
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::FindEntry(const TSymbol* key, size_t keylen, TData* value) const {
+ using namespace NCompactTrie;
+
+ if (!keylen) {
+ const char zero = '\0';
+ return FindEntryImpl(&zero, 1, value);
+ } else {
+ size_t ckeylen = keylen * sizeof(TSymbol);
+ TTempBuf ckeybuf(ckeylen);
+ ConvertSymbolArrayToChar(key, keylen, ckeybuf, ckeylen);
+ return FindEntryImpl(ckeybuf.Data(), ckeylen, value);
+ }
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::FindEntryImpl(const char* keyptr, size_t keylen, TData* value) const {
+ const TNode* node = Root;
+ bool result = false;
+ TStringBuf key(keyptr, keylen);
+ while (key && (node = node->Subtree()->Find(key, value, result, Packer))) {
+ }
+ return result;
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::FindLongestPrefix(
+ const TSymbol* key, size_t keylen, size_t* prefixlen, TData* value) const {
+ using namespace NCompactTrie;
+
+ if (!keylen) {
+ const char zero = '\0';
+ const bool ret = FindLongestPrefixImpl(&zero, 1, prefixlen, value);
+ if (ret && prefixlen)
+ *prefixlen = 0; // empty key found
+ return ret;
+ } else {
+ size_t ckeylen = keylen * sizeof(TSymbol);
+ TTempBuf ckeybuf(ckeylen);
+ ConvertSymbolArrayToChar(key, keylen, ckeybuf, ckeylen);
+ bool ret = FindLongestPrefixImpl(ckeybuf.Data(), ckeylen, prefixlen, value);
+ if (ret && prefixlen && *prefixlen == 1 && ckeybuf.Data()[0] == '\0')
+ *prefixlen = 0; // if we have found empty key, set prefixlen to zero
+ else if (!ret) // try to find value with empty key, because empty key is prefix of a every key
+ ret = FindLongestPrefix(nullptr, 0, prefixlen, value);
+
+ if (ret && prefixlen)
+ *prefixlen /= sizeof(TSymbol);
+
+ return ret;
+ }
+}
+
+template <class T, class D, class S>
+bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::FindLongestPrefixImpl(const char* keyptr, size_t keylen, size_t* prefixLen, TData* value) const {
+ const TNode* node = Root;
+ const TNode* lastFinalNode = nullptr;
+ bool endResult = false;
+ TStringBuf key(keyptr, keylen);
+ TStringBuf keyTail = key;
+ TStringBuf lastFinalKeyTail;
+ while (keyTail && (node = node->Subtree()->FindLongestPrefix(keyTail, value, endResult, Packer))) {
+ if (endResult) // no more ways to find prefix and prefix has been found
+ break;
+
+ if (node->IsFinal()) {
+ lastFinalNode = node;
+ lastFinalKeyTail = keyTail;
+ }
+ }
+ if (!endResult && lastFinalNode) {
+ if (value)
+ Packer.UnpackLeaf(lastFinalNode->GetPayload(), *value);
+ keyTail = lastFinalKeyTail;
+ endResult = true;
+ }
+ if (endResult && prefixLen)
+ *prefixLen = keyTail ? key.size() - keyTail.size() : key.size();
+ return endResult;
+}
+
+template <class T, class D, class S>
+void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::Clear() {
+ DestroyNode(Root);
+ Pool.Clear();
+ NodeAllocator.Reset(new TFixedSizeAllocator(sizeof(TNode) + PayloadSize, TDefaultAllocator::Instance()));
+ Root = new (*NodeAllocator) TNode;
+ EntryCount = 0;
+ NodeCount = 1;
+}
+
+template <class T, class D, class S>
+size_t TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::Save(IOutputStream& os) const {
+ const size_t len = NodeMeasureSubtree(Root);
+ if (len != NodeSaveSubtree(Root, os))
+ ythrow yexception() << "something wrong";
+
+ return len;
+}
+
+template <class T, class D, class S>
+size_t TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::SaveAndDestroy(IOutputStream& os) {
+ const size_t len = NodeMeasureSubtree(Root);
+ if (len != NodeSaveSubtreeAndDestroy(Root, os))
+ ythrow yexception() << "something wrong";
+
+ return len;
+}
+
+template <class T, class D, class S>
+size_t TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::GetEntryCount() const {
+ return EntryCount;
+}
+
+template <class T, class D, class S>
+size_t TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::GetNodeCount() const {
+ return NodeCount;
+}
+
+template <class T, class D, class S>
+typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode*
+ TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeForwardAdd(
+ TNode* thiz, const char* label, size_t len, size_t& passed, size_t* nodeCount) {
+ typename TNode::TArcSet* arcSet = dynamic_cast<typename TNode::TArcSet*>(thiz->Subtree());
+ if (!arcSet)
+ ythrow yexception() << "Bad input order - expected input strings to be prefix-grouped.";
+
+ typename TNode::TArcSet::iterator it = arcSet->Find(*label);
+
+ if (it != arcSet->end()) {
+ const char* arcLabel = it->Label.AsCharPtr();
+ size_t arcLabelLen = it->Label.Length();
+
+ for (passed = 0; (passed < len) && (passed < arcLabelLen) && (label[passed] == arcLabel[passed]); ++passed) {
+ //just count
+ }
+
+ if (passed < arcLabelLen) {
+ (*nodeCount)++;
+ TNode* node = new (*NodeAllocator) TNode();
+ NodeLinkTo(node, it->Label.SubBlob(passed, arcLabelLen), it->Node);
+
+ it->Node = node;
+ it->Label = it->Label.SubBlob(passed);
+ }
+
+ return it->Node;
+ }
+
+ return nullptr;
+}
+
+template <class T, class D, class S>
+void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeLinkTo(TNode* thiz, const TBlob& label, TNode* node) {
+ typename TNode::TArcSet* arcSet = dynamic_cast<typename TNode::TArcSet*>(thiz->Subtree());
+ if (!arcSet)
+ ythrow yexception() << "Bad input order - expected input strings to be prefix-grouped.";
+
+ // Buffer the node at the last arc
+ if ((Flags & CTBF_PREFIX_GROUPED) && !arcSet->empty())
+ NodeBufferSubtree(arcSet->back().Node);
+
+ arcSet->Add(label, node);
+}
+
+template <class T, class D, class S>
+size_t TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeMeasureSubtree(TNode* thiz) const {
+ return (size_t)thiz->Subtree()->Measure(this);
+}
+
+template <class T, class D, class S>
+ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeSaveSubtree(TNode* thiz, IOutputStream& os) const {
+ return thiz->Subtree()->Save(this, os);
+}
+
+template <class T, class D, class S>
+ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeSaveSubtreeAndDestroy(TNode* thiz, IOutputStream& os) {
+ return thiz->Subtree()->SaveAndDestroy(this, os);
+}
+
+template <class T, class D, class S>
+void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeBufferSubtree(TNode* thiz) {
+ typedef typename TNode::TArcSet TArcSet;
+
+ TArcSet* arcSet = dynamic_cast<TArcSet*>(thiz->Subtree());
+ if (!arcSet)
+ return;
+
+ size_t bufferLength = (size_t)arcSet->Measure(this);
+ TArrayWithSizeHolder<char> buffer;
+ buffer.Resize(bufferLength);
+
+ TMemoryOutput bufout(buffer.Get(), buffer.Size());
+
+ ui64 written = arcSet->Save(this, bufout);
+ Y_ASSERT(written == bufferLength);
+
+ arcSet->Destroy(this);
+ arcSet->~TArcSet();
+
+ typename TNode::TBufferedSubtree* bufferedArcSet = new (thiz->Subtree()) typename TNode::TBufferedSubtree;
+
+ bufferedArcSet->Buffer.Swap(buffer);
+}
+
+template <class T, class D, class S>
+size_t TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeMeasureLeafValue(TNode* thiz) const {
+ if (!thiz->IsFinal())
+ return 0;
+
+ return Packer.SkipLeaf(thiz->GetPayload());
+}
+
+template <class T, class D, class S>
+ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeSaveLeafValue(TNode* thiz, IOutputStream& os) const {
+ if (!thiz->IsFinal())
+ return 0;
+
+ size_t len = Packer.SkipLeaf(thiz->GetPayload());
+ os.Write(thiz->GetPayload(), len);
+ return len;
+}
+
+// TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode::TArc
+
+template <class T, class D, class S>
+TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TArc::TArc(const TBlob& lbl, TNode* nd)
+ : Label(lbl)
+ , Node(nd)
+ , LeftOffset(0)
+ , RightOffset(0)
+{}
+
+template <class T, class D, class S>
+ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::ArcMeasure(
+ const TArc* thiz, size_t leftsize, size_t rightsize) const {
+ using namespace NCompactTrie;
+
+ size_t coresize = 2 + NodeMeasureLeafValue(thiz->Node); // 2 == (char + flags)
+ size_t treesize = NodeMeasureSubtree(thiz->Node);
+
+ if (thiz->Label.Length() > 0)
+ treesize += 2 * (thiz->Label.Length() - 1);
+
+ // Triple measurements are needed because the space needed to store the offset
+ // shall be added to the offset itself. Hence three iterations.
+ size_t leftoffsetsize = leftsize ? MeasureOffset(coresize + treesize) : 0;
+ size_t rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize) : 0;
+ leftoffsetsize = leftsize ? MeasureOffset(coresize + treesize + leftoffsetsize + rightoffsetsize) : 0;
+ rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize + leftoffsetsize + rightoffsetsize) : 0;
+ leftoffsetsize = leftsize ? MeasureOffset(coresize + treesize + leftoffsetsize + rightoffsetsize) : 0;
+ rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize + leftoffsetsize + rightoffsetsize) : 0;
+
+ coresize += leftoffsetsize + rightoffsetsize;
+ thiz->LeftOffset = leftsize ? coresize + treesize : 0;
+ thiz->RightOffset = rightsize ? coresize + treesize + leftsize : 0;
+
+ return coresize + treesize + leftsize + rightsize;
+}
+
+template <class T, class D, class S>
+ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::ArcSaveSelf(const TArc* thiz, IOutputStream& os) const {
+ using namespace NCompactTrie;
+
+ ui64 written = 0;
+
+ size_t leftoffsetsize = MeasureOffset(thiz->LeftOffset);
+ size_t rightoffsetsize = MeasureOffset(thiz->RightOffset);
+
+ size_t labelLen = thiz->Label.Length();
+
+ for (size_t i = 0; i < labelLen; ++i) {
+ char flags = 0;
+
+ if (i == 0) {
+ flags |= (leftoffsetsize << MT_LEFTSHIFT);
+ flags |= (rightoffsetsize << MT_RIGHTSHIFT);
+ }
+
+ if (i == labelLen-1) {
+ if (thiz->Node->IsFinal())
+ flags |= MT_FINAL;
+
+ if (!thiz->Node->IsLast())
+ flags |= MT_NEXT;
+ } else {
+ flags |= MT_NEXT;
+ }
+
+ os.Write(&flags, 1);
+ os.Write(&thiz->Label.AsCharPtr()[i], 1);
+ written += 2;
+
+ if (i == 0) {
+ written += ArcSaveOffset(thiz->LeftOffset, os);
+ written += ArcSaveOffset(thiz->RightOffset, os);
+ }
+ }
+
+ written += NodeSaveLeafValue(thiz->Node, os);
+ return written;
+}
+
+template <class T, class D, class S>
+ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::ArcSave(const TArc* thiz, IOutputStream& os) const {
+ ui64 written = ArcSaveSelf(thiz, os);
+ written += NodeSaveSubtree(thiz->Node, os);
+ return written;
+}
+
+template <class T, class D, class S>
+ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::ArcSaveAndDestroy(const TArc* thiz, IOutputStream& os) {
+ ui64 written = ArcSaveSelf(thiz, os);
+ written += NodeSaveSubtreeAndDestroy(thiz->Node, os);
+ return written;
+}
+
+// TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode::TArcSet
+
+template <class T, class D, class S>
+typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::iterator
+ TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::Find(char ch) {
+ using namespace NCompTriePrivate;
+ iterator it = LowerBound(this->begin(), this->end(), ch, TCmp());
+
+ if (it != this->end() && it->Label[0] == (unsigned char)ch) {
+ return it;
+ }
+
+ return this->end();
+}
+
+template <class T, class D, class S>
+typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::const_iterator
+ TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::Find(char ch) const {
+ using namespace NCompTriePrivate;
+ const_iterator it = LowerBound(this->begin(), this->end(), ch, TCmp());
+
+ if (it != this->end() && it->Label[0] == (unsigned char)ch) {
+ return it;
+ }
+
+ return this->end();
+}
+
+template <class T, class D, class S>
+void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::Add(const TBlob& s, TNode* node) {
+ using namespace NCompTriePrivate;
+ this->insert(LowerBound(this->begin(), this->end(), s[0], TCmp()), TArc(s, node));
+}
+
+template <class T, class D, class S>
+const typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode*
+ TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::Find(
+ TStringBuf& key, TData* value, bool& result, const TPacker& packer) const {
+ result = false;
+ if (!key)
+ return nullptr;
+
+ const const_iterator it = Find(key[0]);
+ if (it != this->end()) {
+ const char* const arcLabel = it->Label.AsCharPtr();
+ const size_t arcLabelLen = it->Label.Length();
+ if (key.size() >= arcLabelLen && memcmp(key.data(), arcLabel, arcLabelLen) == 0) {
+ const TStringBuf srcKey = key;
+ key = key.SubStr(arcLabelLen);
+ const TNode* const node = it->Node;
+ if (srcKey.size() == arcLabelLen) {
+ // unpack value of it->Node, if it has value
+ if (!node->IsFinal())
+ return nullptr;
+
+ if (value)
+ packer.UnpackLeaf(node->GetPayload(), *value);
+
+ result = true;
+ return nullptr;
+ }
+
+ // find in subtree
+ return node;
+ }
+ }
+
+ return nullptr;
+}
+
+// Different
+
+//----------------------------------------------------------------------------------------------------------------------
+// Minimize the trie. The result is equivalent to the original
+// trie, except that it takes less space (and has marginally lower
+// performance, because of eventual epsilon links).
+// The algorithm is as follows: starting from the largest pieces, we find
+// nodes that have identical continuations (Daciuk's right language),
+// and repack the trie. Repacking is done in-place, so memory is less
+// of an issue; however, it may take considerable time.
+
+// IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
+// Because of non-local structure and epsilon links, it won't work
+// as you expect it to, and can destroy the trie in the making.
+
+template <class TPacker>
+size_t CompactTrieMinimize(IOutputStream& os, const char* data, size_t datalength, bool verbose /*= false*/, const TPacker& packer /*= TPacker()*/, NCompactTrie::EMinimizeMode mode) {
+ using namespace NCompactTrie;
+ return CompactTrieMinimizeImpl(os, data, datalength, verbose, &packer, mode);
+}
+
+template <class TTrieBuilder>
+size_t CompactTrieMinimize(IOutputStream& os, const TTrieBuilder& builder, bool verbose /*=false*/) {
+ TBufferStream buftmp;
+ size_t len = builder.Save(buftmp);
+ return CompactTrieMinimize<typename TTrieBuilder::TPacker>(os, buftmp.Buffer().Data(), len, verbose);
+}
+
+//----------------------------------------------------------------------------------------------------------------
+// Lay the trie in memory in such a way that there are less cache misses when jumping from root to leaf.
+// The trie becomes about 2% larger, but the access became about 25% faster in our experiments.
+// Can be called on minimized and non-minimized tries, in the first case in requires half a trie more memory.
+// Calling it the second time on the same trie does nothing.
+//
+// The algorithm is based on van Emde Boas layout as described in the yandex data school lectures on external memory algoritms
+// by Maxim Babenko and Ivan Puzyrevsky. The difference is that when we cut the tree into levels
+// two nodes connected by a forward link are put into the same level (because they usually lie near each other in the original tree).
+// The original paper (describing the layout in Section 2.1) is:
+// Michael A. Bender, Erik D. Demaine, Martin Farach-Colton. Cache-Oblivious B-Trees
+// SIAM Journal on Computing, volume 35, number 2, 2005, pages 341-358.
+// Available on the web: http://erikdemaine.org/papers/CacheObliviousBTrees_SICOMP/
+// Or: Michael A. Bender, Erik D. Demaine, and Martin Farach-Colton. Cache-Oblivious B-Trees
+// Proceedings of the 41st Annual Symposium
+// on Foundations of Computer Science (FOCS 2000), Redondo Beach, California, November 12-14, 2000, pages 399-409.
+// Available on the web: http://erikdemaine.org/papers/FOCS2000b/
+// (there is not much difference between these papers, actually).
+//
+template <class TPacker>
+size_t CompactTrieMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose /*= false*/, const TPacker& packer /*= TPacker()*/) {
+ using namespace NCompactTrie;
+ return CompactTrieMakeFastLayoutImpl(os, data, datalength, verbose, &packer);
+}
+
+template <class TTrieBuilder>
+size_t CompactTrieMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose /*=false*/) {
+ TBufferStream buftmp;
+ size_t len = builder.Save(buftmp);
+ return CompactTrieMakeFastLayout<typename TTrieBuilder::TPacker>(os, buftmp.Buffer().Data(), len, verbose);
+}
+
+template <class TPacker>
+size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose/*=false*/, const TPacker& packer/*= TPacker()*/) {
+ TBufferStream buftmp;
+ size_t len = CompactTrieMinimize(buftmp, data, datalength, verbose, packer);
+ return CompactTrieMakeFastLayout(os, buftmp.Buffer().Data(), len, verbose, packer);
+}
+
+template <class TTrieBuilder>
+size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose /*=false*/) {
+ TBufferStream buftmp;
+ size_t len = CompactTrieMinimize(buftmp, builder, verbose);
+ return CompactTrieMakeFastLayout<typename TTrieBuilder::TPacker>(os, buftmp.Buffer().Data(), len, verbose);
+}
+
diff --git a/library/cpp/containers/comptrie/comptrie_impl.cpp b/library/cpp/containers/comptrie/comptrie_impl.cpp
new file mode 100644
index 0000000000..a116ab6d1e
--- /dev/null
+++ b/library/cpp/containers/comptrie/comptrie_impl.cpp
@@ -0,0 +1,39 @@
+#include "comptrie_impl.h"
+
+#include <util/system/rusage.h>
+#include <util/stream/output.h>
+
+// Unpack the leaf value. The algorithm can store up to 8 full bytes in leafs.
+
+namespace NCompactTrie {
+ size_t MeasureOffset(size_t offset) {
+ int n = 0;
+
+ while (offset) {
+ offset >>= 8;
+ ++n;
+ }
+
+ return n;
+ }
+
+ size_t PackOffset(char* buffer, size_t offset) {
+ size_t len = MeasureOffset(offset);
+ size_t i = len;
+
+ while (i--) {
+ buffer[i] = (char)(offset & 0xFF);
+ offset >>= 8;
+ }
+
+ return len;
+ }
+
+ void ShowProgress(size_t n) {
+ if (n % 1000000 == 0)
+ Cerr << n << ", RSS=" << (TRusage::Get().MaxRss >> 20) << "mb" << Endl;
+ else if (n % 20000 == 0)
+ Cerr << ".";
+ }
+
+}
diff --git a/library/cpp/containers/comptrie/comptrie_impl.h b/library/cpp/containers/comptrie/comptrie_impl.h
new file mode 100644
index 0000000000..f41c38311a
--- /dev/null
+++ b/library/cpp/containers/comptrie/comptrie_impl.h
@@ -0,0 +1,221 @@
+#pragma once
+
+#include <util/stream/output.h>
+
+#ifndef COMPTRIE_DATA_CHECK
+#define COMPTRIE_DATA_CHECK 1
+#endif
+
+// NCompactTrie
+
+namespace NCompactTrie {
+ const char MT_FINAL = '\x80';
+ const char MT_NEXT = '\x40';
+ const char MT_SIZEMASK = '\x07';
+ const size_t MT_LEFTSHIFT = 3;
+ const size_t MT_RIGHTSHIFT = 0;
+
+ Y_FORCE_INLINE size_t UnpackOffset(const char* p, size_t len);
+ size_t MeasureOffset(size_t offset);
+ size_t PackOffset(char* buffer, size_t offset);
+ static inline ui64 ArcSaveOffset(size_t offset, IOutputStream& os);
+ Y_FORCE_INLINE char LeapByte(const char*& datapos, const char* dataend, char label);
+
+ template <class T>
+ inline static size_t ExtraBits() {
+ return (sizeof(T) - 1) * 8;
+ }
+
+ static inline bool IsEpsilonLink(const char flags) {
+ return !(flags & (MT_FINAL | MT_NEXT));
+ }
+
+ static inline void TraverseEpsilon(const char*& datapos) {
+ const char flags = *datapos;
+ if (!IsEpsilonLink(flags)) {
+ return;
+ }
+ const size_t offsetlength = flags & MT_SIZEMASK;
+ const size_t offset = UnpackOffset(datapos + 1, offsetlength);
+ Y_ASSERT(offset);
+ datapos += offset;
+ }
+
+ static inline size_t LeftOffsetLen(const char flags) {
+ return (flags >> MT_LEFTSHIFT) & MT_SIZEMASK;
+ }
+
+ static inline size_t RightOffsetLen(const char flags) {
+ return flags & MT_SIZEMASK;
+ }
+
+ void ShowProgress(size_t n); // just print dots
+}
+
+namespace NCompTriePrivate {
+ template <typename TChar>
+ struct TStringForChar {
+ };
+
+ template <>
+ struct TStringForChar<char> {
+ typedef TString TResult;
+ };
+
+ template <>
+ struct TStringForChar<wchar16> {
+ typedef TUtf16String TResult;
+ };
+
+ template <>
+ struct TStringForChar<wchar32> {
+ typedef TUtf32String TResult;
+ };
+
+}
+
+namespace NCompTriePrivate {
+ struct TCmp {
+ template <class T>
+ inline bool operator()(const T& l, const T& r) {
+ return (unsigned char)(l.Label[0]) < (unsigned char)(r.Label[0]);
+ }
+
+ template <class T>
+ inline bool operator()(const T& l, char r) {
+ return (unsigned char)(l.Label[0]) < (unsigned char)r;
+ }
+ };
+}
+
+namespace NCompactTrie {
+ static inline ui64 ArcSaveOffset(size_t offset, IOutputStream& os) {
+ using namespace NCompactTrie;
+
+ if (!offset)
+ return 0;
+
+ char buf[16];
+ size_t len = PackOffset(buf, offset);
+ os.Write(buf, len);
+ return len;
+ }
+
+ // Unpack the offset to the next node. The encoding scheme can store offsets
+ // up to 7 bytes; whether they fit into size_t is another issue.
+ Y_FORCE_INLINE size_t UnpackOffset(const char* p, size_t len) {
+ size_t result = 0;
+
+ while (len--)
+ result = ((result << 8) | (*(p++) & 0xFF));
+
+ return result;
+ }
+
+ // Auxiliary function: consumes one character from the input. Advances the data pointer
+ // to the position immediately preceding the value for the link just traversed (if any);
+ // returns flags associated with the link. If no arc with the required label is present,
+ // zeroes the data pointer.
+ Y_FORCE_INLINE char LeapByte(const char*& datapos, const char* dataend, char label) {
+ while (datapos < dataend) {
+ size_t offsetlength, offset;
+ const char* startpos = datapos;
+ char flags = *(datapos++);
+
+ if (IsEpsilonLink(flags)) {
+ // Epsilon link - jump to the specified offset without further checks.
+ // These links are created during minimization: original uncompressed
+ // tree does not need them. (If we find a way to package 3 offset lengths
+ // into 1 byte, we could get rid of them; but it looks like they do no harm.
+ Y_ASSERT(datapos < dataend);
+ offsetlength = flags & MT_SIZEMASK;
+ offset = UnpackOffset(datapos, offsetlength);
+ if (!offset)
+ break;
+ datapos = startpos + offset;
+
+ continue;
+ }
+
+ char ch = *(datapos++);
+
+ // Left branch
+ offsetlength = LeftOffsetLen(flags);
+ if ((unsigned char)label < (unsigned char)ch) {
+ offset = UnpackOffset(datapos, offsetlength);
+ if (!offset)
+ break;
+
+ datapos = startpos + offset;
+
+ continue;
+ }
+
+ datapos += offsetlength;
+
+ // Right branch
+ offsetlength = RightOffsetLen(flags);
+ if ((unsigned char)label > (unsigned char)ch) {
+ offset = UnpackOffset(datapos, offsetlength);
+
+ if (!offset)
+ break;
+
+ datapos = startpos + offset;
+
+ continue;
+ }
+
+ // Got a match; return position right before the contents for the label
+ datapos += offsetlength;
+ return flags;
+ }
+
+ // if we got here, we're past the dataend - bail out ASAP
+ datapos = nullptr;
+ return 0;
+ }
+
+ // Auxiliary function: consumes one (multibyte) symbol from the input.
+ // Advances the data pointer to the root of the subtrie beginning after the symbol,
+ // zeroes it if this subtrie is empty.
+ // If there is a value associated with the symbol, makes the value pointer point to it,
+ // otherwise sets it to nullptr.
+ // Returns true if the symbol was succesfully found in the trie, false otherwise.
+ template <typename TSymbol, class TPacker>
+ Y_FORCE_INLINE bool Advance(const char*& datapos, const char* const dataend, const char*& value,
+ TSymbol label, TPacker packer) {
+ Y_ASSERT(datapos < dataend);
+ char flags = MT_NEXT;
+ for (int i = (int)ExtraBits<TSymbol>(); i >= 0; i -= 8) {
+ flags = LeapByte(datapos, dataend, (char)(label >> i));
+ if (!datapos) {
+ return false; // no such arc
+ }
+
+ value = nullptr;
+
+ Y_ASSERT(datapos <= dataend);
+ if ((flags & MT_FINAL)) {
+ value = datapos;
+ datapos += packer.SkipLeaf(datapos);
+ }
+
+ if (!(flags & MT_NEXT)) {
+ if (i == 0) {
+ datapos = nullptr;
+ return true;
+ }
+ return false; // no further way
+ }
+
+ TraverseEpsilon(datapos);
+ if (i == 0) { // last byte, and got a match
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+}
diff --git a/library/cpp/containers/comptrie/comptrie_packer.h b/library/cpp/containers/comptrie/comptrie_packer.h
new file mode 100644
index 0000000000..0341eeeae3
--- /dev/null
+++ b/library/cpp/containers/comptrie/comptrie_packer.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <library/cpp/packers/packers.h>
+
+template <class T>
+class TCompactTriePacker {
+public:
+ void UnpackLeaf(const char* p, T& t) const {
+ NPackers::TPacker<T>().UnpackLeaf(p, t);
+ }
+ void PackLeaf(char* buffer, const T& data, size_t computedSize) const {
+ NPackers::TPacker<T>().PackLeaf(buffer, data, computedSize);
+ }
+ size_t MeasureLeaf(const T& data) const {
+ return NPackers::TPacker<T>().MeasureLeaf(data);
+ }
+ size_t SkipLeaf(const char* p) const // this function better be fast because it is very frequently used
+ {
+ return NPackers::TPacker<T>().SkipLeaf(p);
+ }
+};
diff --git a/library/cpp/containers/comptrie/comptrie_trie.h b/library/cpp/containers/comptrie/comptrie_trie.h
new file mode 100644
index 0000000000..40ec1e52b3
--- /dev/null
+++ b/library/cpp/containers/comptrie/comptrie_trie.h
@@ -0,0 +1,663 @@
+#pragma once
+
+#include "comptrie_impl.h"
+#include "comptrie_packer.h"
+#include "opaque_trie_iterator.h"
+#include "leaf_skipper.h"
+#include "key_selector.h"
+
+#include <util/generic/buffer.h>
+#include <util/generic/ptr.h>
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+#include <util/memory/blob.h>
+#include <util/stream/input.h>
+#include <utility>
+
+template <class T, class D, class S>
+class TCompactTrieBuilder;
+
+namespace NCompactTrie {
+ template <class TTrie>
+ class TFirstSymbolIterator;
+}
+
+template <class TTrie>
+class TSearchIterator;
+
+template <class TTrie>
+class TPrefixIterator;
+
+// in case of <char> specialization cannot distinguish between "" and "\0" keys
+template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
+class TCompactTrie {
+public:
+ typedef T TSymbol;
+ typedef D TData;
+ typedef S TPacker;
+
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
+
+ typedef std::pair<TKey, TData> TValueType;
+ typedef std::pair<size_t, TData> TPhraseMatch;
+ typedef TVector<TPhraseMatch> TPhraseMatchVector;
+
+ typedef TCompactTrieBuilder<T, D, S> TBuilder;
+
+protected:
+ TBlob DataHolder;
+ const char* EmptyValue = nullptr;
+ TPacker Packer;
+ NCompactTrie::TPackerLeafSkipper<TPacker> Skipper = &Packer; // This should be true for every constructor.
+
+public:
+ TCompactTrie() = default;
+
+ TCompactTrie(const char* d, size_t len, TPacker packer);
+ TCompactTrie(const char* d, size_t len)
+ : TCompactTrie{d, len, TPacker{}} {
+ }
+
+ TCompactTrie(const TBlob& data, TPacker packer);
+ explicit TCompactTrie(const TBlob& data)
+ : TCompactTrie{data, TPacker{}} {
+ }
+
+ // Skipper should be initialized with &Packer, not with &other.Packer, so you have to redefine these.
+ TCompactTrie(const TCompactTrie& other);
+ TCompactTrie(TCompactTrie&& other) noexcept;
+ TCompactTrie& operator=(const TCompactTrie& other);
+ TCompactTrie& operator=(TCompactTrie&& other) noexcept;
+
+ explicit operator bool() const {
+ return !IsEmpty();
+ }
+
+ void Init(const char* d, size_t len, TPacker packer = TPacker());
+ void Init(const TBlob& data, TPacker packer = TPacker());
+
+ bool IsInitialized() const;
+ bool IsEmpty() const;
+
+ bool Find(const TSymbol* key, size_t keylen, TData* value = nullptr) const;
+ bool Find(const TKeyBuf& key, TData* value = nullptr) const {
+ return Find(key.data(), key.size(), value);
+ }
+
+ TData Get(const TSymbol* key, size_t keylen) const {
+ TData value;
+ if (!Find(key, keylen, &value))
+ ythrow yexception() << "key " << TKey(key, keylen).Quote() << " not found in trie";
+ return value;
+ }
+ TData Get(const TKeyBuf& key) const {
+ return Get(key.data(), key.size());
+ }
+ TData GetDefault(const TKeyBuf& key, const TData& def) const {
+ TData value;
+ if (!Find(key.data(), key.size(), &value))
+ return def;
+ else
+ return value;
+ }
+
+ const TBlob& Data() const {
+ return DataHolder;
+ };
+
+ const NCompactTrie::ILeafSkipper& GetSkipper() const {
+ return Skipper;
+ }
+
+ TPacker GetPacker() const {
+ return Packer;
+ }
+
+ bool HasCorrectSkipper() const {
+ return Skipper.GetPacker() == &Packer;
+ }
+
+ void FindPhrases(const TSymbol* key, size_t keylen, TPhraseMatchVector& matches, TSymbol separator = TSymbol(' ')) const;
+ void FindPhrases(const TKeyBuf& key, TPhraseMatchVector& matches, TSymbol separator = TSymbol(' ')) const {
+ return FindPhrases(key.data(), key.size(), matches, separator);
+ }
+ bool FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixLen, TData* value = nullptr, bool* hasNext = nullptr) const;
+ bool FindLongestPrefix(const TKeyBuf& key, size_t* prefixLen, TData* value = nullptr, bool* hasNext = nullptr) const {
+ return FindLongestPrefix(key.data(), key.size(), prefixLen, value, hasNext);
+ }
+
+ // Return trie, containing all tails for the given key
+ inline TCompactTrie<T, D, S> FindTails(const TSymbol* key, size_t keylen) const;
+ TCompactTrie<T, D, S> FindTails(const TKeyBuf& key) const {
+ return FindTails(key.data(), key.size());
+ }
+ bool FindTails(const TSymbol* key, size_t keylen, TCompactTrie<T, D, S>& res) const;
+ bool FindTails(const TKeyBuf& key, TCompactTrie<T, D, S>& res) const {
+ return FindTails(key.data(), key.size(), res);
+ }
+
+ // same as FindTails(&key, 1), a bit faster
+ // return false, if no arc with @label exists
+ inline bool FindTails(TSymbol label, TCompactTrie<T, D, S>& res) const;
+
+ class TConstIterator {
+ private:
+ typedef NCompactTrie::TOpaqueTrieIterator TOpaqueTrieIterator;
+ typedef NCompactTrie::TOpaqueTrie TOpaqueTrie;
+ friend class TCompactTrie;
+ TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, TPacker packer); // only usable from Begin() and End() methods
+ TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, const TKeyBuf& key, TPacker packer); // only usable from UpperBound() method
+
+ public:
+ TConstIterator() = default;
+ bool IsEmpty() const {
+ return !Impl;
+ } // Almost no other method can be called.
+
+ bool operator==(const TConstIterator& other) const;
+ bool operator!=(const TConstIterator& other) const;
+ TConstIterator& operator++();
+ TConstIterator operator++(int /*unused*/);
+ TConstIterator& operator--();
+ TConstIterator operator--(int /*unused*/);
+ TValueType operator*();
+
+ TKey GetKey() const;
+ size_t GetKeySize() const;
+ TData GetValue() const;
+ void GetValue(TData& data) const;
+ const char* GetValuePtr() const;
+
+ private:
+ TPacker Packer;
+ TCopyPtr<TOpaqueTrieIterator> Impl;
+ };
+
+ TConstIterator Begin() const;
+ TConstIterator begin() const;
+ TConstIterator End() const;
+ TConstIterator end() const;
+
+ // Returns an iterator pointing to the smallest key in the trie >= the argument.
+ // TODO: misleading name. Should be called LowerBound for consistency with stl.
+ // No. It is the STL that has a misleading name.
+ // LowerBound of X cannot be greater than X.
+ TConstIterator UpperBound(const TKeyBuf& key) const;
+
+ void Print(IOutputStream& os);
+
+ size_t Size() const;
+
+ friend class NCompactTrie::TFirstSymbolIterator<TCompactTrie>;
+ friend class TSearchIterator<TCompactTrie>;
+ friend class TPrefixIterator<TCompactTrie>;
+
+protected:
+ explicit TCompactTrie(const char* emptyValue);
+ TCompactTrie(const TBlob& data, const char* emptyValue, TPacker packer = TPacker());
+
+ bool LookupLongestPrefix(const TSymbol* key, size_t keylen, size_t& prefixLen, const char*& valuepos, bool& hasNext) const;
+ bool LookupLongestPrefix(const TSymbol* key, size_t keylen, size_t& prefixLen, const char*& valuepos) const {
+ bool hasNext;
+ return LookupLongestPrefix(key, keylen, prefixLen, valuepos, hasNext);
+ }
+ void LookupPhrases(const char* datapos, size_t len, const TSymbol* key, size_t keylen, TVector<TPhraseMatch>& matches, TSymbol separator) const;
+};
+
+template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
+class TCompactTrieHolder: public TCompactTrie<T, D, S>, NNonCopyable::TNonCopyable {
+private:
+ typedef TCompactTrie<T, D, S> TBase;
+ TArrayHolder<char> Storage;
+
+public:
+ TCompactTrieHolder(IInputStream& is, size_t len);
+};
+
+//------------------------//
+// Implementation section //
+//------------------------//
+
+// TCompactTrie
+
+template <class T, class D, class S>
+TCompactTrie<T, D, S>::TCompactTrie(const TBlob& data, TPacker packer)
+ : DataHolder(data)
+ , Packer(packer)
+{
+ Init(data, packer);
+}
+
+template <class T, class D, class S>
+TCompactTrie<T, D, S>::TCompactTrie(const char* d, size_t len, TPacker packer)
+ : Packer(packer)
+{
+ Init(d, len, packer);
+}
+
+template <class T, class D, class S>
+TCompactTrie<T, D, S>::TCompactTrie(const char* emptyValue)
+ : EmptyValue(emptyValue)
+{
+}
+
+template <class T, class D, class S>
+TCompactTrie<T, D, S>::TCompactTrie(const TBlob& data, const char* emptyValue, TPacker packer)
+ : DataHolder(data)
+ , EmptyValue(emptyValue)
+ , Packer(packer)
+{
+}
+
+template <class T, class D, class S>
+TCompactTrie<T, D, S>::TCompactTrie(const TCompactTrie& other)
+ : DataHolder(other.DataHolder)
+ , EmptyValue(other.EmptyValue)
+ , Packer(other.Packer)
+{
+}
+
+template <class T, class D, class S>
+TCompactTrie<T, D, S>::TCompactTrie(TCompactTrie&& other) noexcept
+ : DataHolder(std::move(other.DataHolder))
+ , EmptyValue(std::move(other.EmptyValue))
+ , Packer(std::move(other.Packer))
+{
+}
+
+template <class T, class D, class S>
+TCompactTrie<T, D, S>& TCompactTrie<T, D, S>::operator=(const TCompactTrie& other) {
+ if (this != &other) {
+ DataHolder = other.DataHolder;
+ EmptyValue = other.EmptyValue;
+ Packer = other.Packer;
+ }
+ return *this;
+}
+
+template <class T, class D, class S>
+TCompactTrie<T, D, S>& TCompactTrie<T, D, S>::operator=(TCompactTrie&& other) noexcept {
+ if (this != &other) {
+ DataHolder = std::move(other.DataHolder);
+ EmptyValue = std::move(other.EmptyValue);
+ Packer = std::move(other.Packer);
+ }
+ return *this;
+}
+
+template <class T, class D, class S>
+void TCompactTrie<T, D, S>::Init(const char* d, size_t len, TPacker packer) {
+ Init(TBlob::NoCopy(d, len), packer);
+}
+
+template <class T, class D, class S>
+void TCompactTrie<T, D, S>::Init(const TBlob& data, TPacker packer) {
+ using namespace NCompactTrie;
+
+ DataHolder = data;
+ Packer = packer;
+
+ const char* datapos = DataHolder.AsCharPtr();
+ size_t len = DataHolder.Length();
+ if (!len)
+ return;
+
+ const char* const dataend = datapos + len;
+
+ const char* emptypos = datapos;
+ char flags = LeapByte(emptypos, dataend, 0);
+ if (emptypos && (flags & MT_FINAL)) {
+ Y_ASSERT(emptypos <= dataend);
+ EmptyValue = emptypos;
+ }
+}
+
+template <class T, class D, class S>
+bool TCompactTrie<T, D, S>::IsInitialized() const {
+ return DataHolder.Data() != nullptr;
+}
+
+template <class T, class D, class S>
+bool TCompactTrie<T, D, S>::IsEmpty() const {
+ return DataHolder.Size() == 0 && EmptyValue == nullptr;
+}
+
+template <class T, class D, class S>
+bool TCompactTrie<T, D, S>::Find(const TSymbol* key, size_t keylen, TData* value) const {
+ size_t prefixLen = 0;
+ const char* valuepos = nullptr;
+ bool hasNext;
+ if (!LookupLongestPrefix(key, keylen, prefixLen, valuepos, hasNext) || prefixLen != keylen)
+ return false;
+ if (value)
+ Packer.UnpackLeaf(valuepos, *value);
+ return true;
+}
+
+template <class T, class D, class S>
+void TCompactTrie<T, D, S>::FindPhrases(const TSymbol* key, size_t keylen, TPhraseMatchVector& matches, TSymbol separator) const {
+ LookupPhrases(DataHolder.AsCharPtr(), DataHolder.Length(), key, keylen, matches, separator);
+}
+
+template <class T, class D, class S>
+inline TCompactTrie<T, D, S> TCompactTrie<T, D, S>::FindTails(const TSymbol* key, size_t keylen) const {
+ TCompactTrie<T, D, S> ret;
+ FindTails(key, keylen, ret);
+ return ret;
+}
+
+template <class T, class D, class S>
+bool TCompactTrie<T, D, S>::FindTails(const TSymbol* key, size_t keylen, TCompactTrie<T, D, S>& res) const {
+ using namespace NCompactTrie;
+
+ size_t len = DataHolder.Length();
+
+ if (!key || !len)
+ return false;
+
+ if (!keylen) {
+ res = *this;
+ return true;
+ }
+
+ const char* datastart = DataHolder.AsCharPtr();
+ const char* datapos = datastart;
+ const char* const dataend = datapos + len;
+
+ const TSymbol* keyend = key + keylen;
+ const char* value = nullptr;
+
+ while (key != keyend) {
+ T label = *(key++);
+ if (!NCompactTrie::Advance(datapos, dataend, value, label, Packer))
+ return false;
+
+ if (key == keyend) {
+ if (datapos) {
+ Y_ASSERT(datapos >= datastart);
+ res = TCompactTrie<T, D, S>(TBlob::NoCopy(datapos, dataend - datapos), value);
+ } else {
+ res = TCompactTrie<T, D, S>(value);
+ }
+ return true;
+ } else if (!datapos) {
+ return false; // No further way
+ }
+ }
+
+ return false;
+}
+
+template <class T, class D, class S>
+inline bool TCompactTrie<T, D, S>::FindTails(TSymbol label, TCompactTrie<T, D, S>& res) const {
+ using namespace NCompactTrie;
+
+ const size_t len = DataHolder.Length();
+ if (!len)
+ return false;
+
+ const char* datastart = DataHolder.AsCharPtr();
+ const char* dataend = datastart + len;
+ const char* datapos = datastart;
+ const char* value = nullptr;
+
+ if (!NCompactTrie::Advance(datapos, dataend, value, label, Packer))
+ return false;
+
+ if (datapos) {
+ Y_ASSERT(datapos >= datastart);
+ res = TCompactTrie<T, D, S>(TBlob::NoCopy(datapos, dataend - datapos), value);
+ } else {
+ res = TCompactTrie<T, D, S>(value);
+ }
+
+ return true;
+}
+
+template <class T, class D, class S>
+typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::Begin() const {
+ NCompactTrie::TOpaqueTrie self(DataHolder.AsCharPtr(), DataHolder.Length(), Skipper);
+ return TConstIterator(self, EmptyValue, false, Packer);
+}
+
+template <class T, class D, class S>
+typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::begin() const {
+ return Begin();
+}
+
+template <class T, class D, class S>
+typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::End() const {
+ NCompactTrie::TOpaqueTrie self(DataHolder.AsCharPtr(), DataHolder.Length(), Skipper);
+ return TConstIterator(self, EmptyValue, true, Packer);
+}
+
+template <class T, class D, class S>
+typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::end() const {
+ return End();
+}
+
+template <class T, class D, class S>
+size_t TCompactTrie<T, D, S>::Size() const {
+ size_t res = 0;
+ for (TConstIterator it = Begin(); it != End(); ++it)
+ ++res;
+ return res;
+}
+
+template <class T, class D, class S>
+typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::UpperBound(const TKeyBuf& key) const {
+ NCompactTrie::TOpaqueTrie self(DataHolder.AsCharPtr(), DataHolder.Length(), Skipper);
+ return TConstIterator(self, EmptyValue, key, Packer);
+}
+
+template <class T, class D, class S>
+void TCompactTrie<T, D, S>::Print(IOutputStream& os) {
+ typedef typename ::TCompactTrieKeySelector<T>::TKeyBuf TSBuffer;
+ for (TConstIterator it = Begin(); it != End(); ++it) {
+ os << TSBuffer((*it).first.data(), (*it).first.size()) << "\t" << (*it).second << Endl;
+ }
+}
+
+template <class T, class D, class S>
+bool TCompactTrie<T, D, S>::FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixLen, TData* value, bool* hasNext) const {
+ const char* valuepos = nullptr;
+ size_t tempPrefixLen = 0;
+ bool tempHasNext;
+ bool found = LookupLongestPrefix(key, keylen, tempPrefixLen, valuepos, tempHasNext);
+ if (prefixLen)
+ *prefixLen = tempPrefixLen;
+ if (found && value)
+ Packer.UnpackLeaf(valuepos, *value);
+ if (hasNext)
+ *hasNext = tempHasNext;
+ return found;
+}
+
+template <class T, class D, class S>
+bool TCompactTrie<T, D, S>::LookupLongestPrefix(const TSymbol* key, size_t keylen, size_t& prefixLen, const char*& valuepos, bool& hasNext) const {
+ using namespace NCompactTrie;
+
+ const char* datapos = DataHolder.AsCharPtr();
+ size_t len = DataHolder.Length();
+
+ prefixLen = 0;
+ hasNext = false;
+ bool found = false;
+
+ if (EmptyValue) {
+ valuepos = EmptyValue;
+ found = true;
+ }
+
+ if (!key || !len)
+ return found;
+
+ const char* const dataend = datapos + len;
+
+ const T* keyend = key + keylen;
+ while (key != keyend) {
+ T label = *(key++);
+ for (i64 i = (i64)ExtraBits<TSymbol>(); i >= 0; i -= 8) {
+ const char flags = LeapByte(datapos, dataend, (char)(label >> i));
+ if (!datapos) {
+ return found; // no such arc
+ }
+
+ Y_ASSERT(datapos <= dataend);
+ if ((flags & MT_FINAL)) {
+ prefixLen = keylen - (keyend - key) - (i ? 1 : 0);
+ valuepos = datapos;
+ hasNext = flags & MT_NEXT;
+ found = true;
+
+ if (!i && key == keyend) { // last byte, and got a match
+ return found;
+ }
+ datapos += Packer.SkipLeaf(datapos); // skip intermediate leaf nodes
+ }
+
+ if (!(flags & MT_NEXT)) {
+ return found; // no further way
+ }
+ }
+ }
+
+ return found;
+}
+
+template <class T, class D, class S>
+void TCompactTrie<T, D, S>::LookupPhrases(
+ const char* datapos, size_t len, const TSymbol* key, size_t keylen,
+ TVector<TPhraseMatch>& matches, TSymbol separator) const {
+ using namespace NCompactTrie;
+
+ matches.clear();
+
+ if (!key || !len)
+ return;
+
+ const T* const keystart = key;
+ const T* const keyend = key + keylen;
+ const char* const dataend = datapos + len;
+ while (datapos && key != keyend) {
+ T label = *(key++);
+ const char* value = nullptr;
+ if (!Advance(datapos, dataend, value, label, Packer)) {
+ return;
+ }
+ if (value && (key == keyend || *key == separator)) {
+ size_t matchlength = (size_t)(key - keystart);
+ D data;
+ Packer.UnpackLeaf(value, data);
+ matches.push_back(TPhraseMatch(matchlength, data));
+ }
+ }
+}
+
+// TCompactTrieHolder
+
+template <class T, class D, class S>
+TCompactTrieHolder<T, D, S>::TCompactTrieHolder(IInputStream& is, size_t len)
+ : Storage(new char[len])
+{
+ if (is.Load(Storage.Get(), len) != len) {
+ ythrow yexception() << "bad data load";
+ }
+ TBase::Init(Storage.Get(), len);
+}
+
+//----------------------------------------------------------------------------------------------------------------
+// TCompactTrie::TConstIterator
+
+template <class T, class D, class S>
+TCompactTrie<T, D, S>::TConstIterator::TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, TPacker packer)
+ : Packer(packer)
+ , Impl(new TOpaqueTrieIterator(trie, emptyValue, atend))
+{
+}
+
+template <class T, class D, class S>
+TCompactTrie<T, D, S>::TConstIterator::TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, const TKeyBuf& key, TPacker packer)
+ : Packer(packer)
+ , Impl(new TOpaqueTrieIterator(trie, emptyValue, true))
+{
+ Impl->UpperBound<TSymbol>(key);
+}
+
+template <class T, class D, class S>
+bool TCompactTrie<T, D, S>::TConstIterator::operator==(const TConstIterator& other) const {
+ if (!Impl)
+ return !other.Impl;
+ if (!other.Impl)
+ return false;
+ return *Impl == *other.Impl;
+}
+
+template <class T, class D, class S>
+bool TCompactTrie<T, D, S>::TConstIterator::operator!=(const TConstIterator& other) const {
+ return !operator==(other);
+}
+
+template <class T, class D, class S>
+typename TCompactTrie<T, D, S>::TConstIterator& TCompactTrie<T, D, S>::TConstIterator::operator++() {
+ Impl->Forward();
+ return *this;
+}
+
+template <class T, class D, class S>
+typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::TConstIterator::operator++(int /*unused*/) {
+ TConstIterator copy(*this);
+ Impl->Forward();
+ return copy;
+}
+
+template <class T, class D, class S>
+typename TCompactTrie<T, D, S>::TConstIterator& TCompactTrie<T, D, S>::TConstIterator::operator--() {
+ Impl->Backward();
+ return *this;
+}
+
+template <class T, class D, class S>
+typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::TConstIterator::operator--(int /*unused*/) {
+ TConstIterator copy(*this);
+ Impl->Backward();
+ return copy;
+}
+
+template <class T, class D, class S>
+typename TCompactTrie<T, D, S>::TValueType TCompactTrie<T, D, S>::TConstIterator::operator*() {
+ return TValueType(GetKey(), GetValue());
+}
+
+template <class T, class D, class S>
+typename TCompactTrie<T, D, S>::TKey TCompactTrie<T, D, S>::TConstIterator::GetKey() const {
+ return Impl->GetKey<TSymbol>();
+}
+
+template <class T, class D, class S>
+size_t TCompactTrie<T, D, S>::TConstIterator::GetKeySize() const {
+ return Impl->MeasureKey<TSymbol>();
+}
+
+template <class T, class D, class S>
+const char* TCompactTrie<T, D, S>::TConstIterator::GetValuePtr() const {
+ return Impl->GetValuePtr();
+}
+
+template <class T, class D, class S>
+typename TCompactTrie<T, D, S>::TData TCompactTrie<T, D, S>::TConstIterator::GetValue() const {
+ D data;
+ GetValue(data);
+ return data;
+}
+
+template <class T, class D, class S>
+void TCompactTrie<T, D, S>::TConstIterator::GetValue(typename TCompactTrie<T, D, S>::TData& data) const {
+ const char* ptr = GetValuePtr();
+ if (ptr) {
+ Packer.UnpackLeaf(ptr, data);
+ } else {
+ data = typename TCompactTrie<T, D, S>::TData();
+ }
+}
diff --git a/library/cpp/containers/comptrie/comptrie_ut.cpp b/library/cpp/containers/comptrie/comptrie_ut.cpp
new file mode 100644
index 0000000000..74bee09b5d
--- /dev/null
+++ b/library/cpp/containers/comptrie/comptrie_ut.cpp
@@ -0,0 +1,1791 @@
+#include <util/random/shuffle.h>
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/stream/output.h>
+#include <utility>
+
+#include <util/charset/wide.h>
+#include <util/generic/algorithm.h>
+#include <util/generic/buffer.h>
+#include <util/generic/map.h>
+#include <util/generic/vector.h>
+#include <util/generic/ptr.h>
+#include <util/generic/ylimits.h>
+
+#include <util/folder/dirut.h>
+
+#include <util/random/random.h>
+#include <util/random/fast.h>
+
+#include <util/string/hex.h>
+#include <util/string/cast.h>
+
+#include "comptrie.h"
+#include "set.h"
+#include "first_symbol_iterator.h"
+#include "search_iterator.h"
+#include "pattern_searcher.h"
+
+#include <array>
+#include <iterator>
+
+
+class TCompactTrieTest: public TTestBase {
+private:
+ UNIT_TEST_SUITE(TCompactTrieTest);
+ UNIT_TEST(TestTrie8);
+ UNIT_TEST(TestTrie16);
+ UNIT_TEST(TestTrie32);
+
+ UNIT_TEST(TestFastTrie8);
+ UNIT_TEST(TestFastTrie16);
+ UNIT_TEST(TestFastTrie32);
+
+ UNIT_TEST(TestMinimizedTrie8);
+ UNIT_TEST(TestMinimizedTrie16);
+ UNIT_TEST(TestMinimizedTrie32);
+
+ UNIT_TEST(TestFastMinimizedTrie8);
+ UNIT_TEST(TestFastMinimizedTrie16);
+ UNIT_TEST(TestFastMinimizedTrie32);
+
+ UNIT_TEST(TestTrieIterator8);
+ UNIT_TEST(TestTrieIterator16);
+ UNIT_TEST(TestTrieIterator32);
+
+ UNIT_TEST(TestMinimizedTrieIterator8);
+ UNIT_TEST(TestMinimizedTrieIterator16);
+ UNIT_TEST(TestMinimizedTrieIterator32);
+
+ UNIT_TEST(TestPhraseSearch);
+ UNIT_TEST(TestAddGet);
+ UNIT_TEST(TestEmpty);
+ UNIT_TEST(TestUninitializedNonEmpty);
+ UNIT_TEST(TestRandom);
+ UNIT_TEST(TestFindTails);
+ UNIT_TEST(TestPrefixGrouped);
+ UNIT_TEST(CrashTestPrefixGrouped);
+ UNIT_TEST(TestMergeFromFile);
+ UNIT_TEST(TestMergeFromBuffer);
+ UNIT_TEST(TestUnique);
+ UNIT_TEST(TestAddRetValue);
+ UNIT_TEST(TestClear);
+
+ UNIT_TEST(TestIterateEmptyKey);
+
+ UNIT_TEST(TestTrieSet);
+
+ UNIT_TEST(TestTrieForVectorInt64);
+ UNIT_TEST(TestTrieForListInt64);
+ UNIT_TEST(TestTrieForSetInt64);
+
+ UNIT_TEST(TestTrieForVectorStroka);
+ UNIT_TEST(TestTrieForListStroka);
+ UNIT_TEST(TestTrieForSetStroka);
+
+ UNIT_TEST(TestTrieForVectorWtroka);
+ UNIT_TEST(TestTrieForVectorFloat);
+ UNIT_TEST(TestTrieForVectorDouble);
+
+ UNIT_TEST(TestTrieForListVectorInt64);
+ UNIT_TEST(TestTrieForPairWtrokaVectorInt64);
+
+ UNIT_TEST(TestEmptyValueOutOfOrder);
+ UNIT_TEST(TestFindLongestPrefixWithEmptyValue);
+
+ UNIT_TEST(TestSearchIterChar);
+ UNIT_TEST(TestSearchIterWchar);
+ UNIT_TEST(TestSearchIterWchar32)
+
+ UNIT_TEST(TestCopyAndAssignment);
+
+ UNIT_TEST(TestFirstSymbolIterator8);
+ UNIT_TEST(TestFirstSymbolIterator16);
+ UNIT_TEST(TestFirstSymbolIterator32);
+ UNIT_TEST(TestFirstSymbolIteratorChar32);
+
+ UNIT_TEST(TestArrayPacker);
+
+ UNIT_TEST(TestBuilderFindLongestPrefix);
+ UNIT_TEST(TestBuilderFindLongestPrefixWithEmptyValue);
+
+ UNIT_TEST(TestPatternSearcherEmpty);
+ UNIT_TEST(TestPatternSearcherSimple);
+ UNIT_TEST(TestPatternSearcherRandom);
+
+ UNIT_TEST_SUITE_END();
+
+ static const char* SampleData[];
+
+ template <class T>
+ void CreateTrie(IOutputStream& out, bool minimize, bool useFastLayout);
+
+ template <class T>
+ void CheckData(const char* src, size_t len);
+
+ template <class T>
+ void CheckUpperBound(const char* src, size_t len);
+
+ template <class T>
+ void CheckIterator(const char* src, size_t len);
+
+ template <class T>
+ void TestTrie(bool minimize, bool useFastLayout);
+
+ template <class T>
+ void TestTrieIterator(bool minimize);
+
+ template <class T, bool minimize>
+ void TestRandom(const size_t n, const size_t maxKeySize);
+
+ void TestFindTailsImpl(const TString& prefix);
+
+ void TestUniqueImpl(bool isPrefixGrouped);
+
+ TVector<TUtf16String> GetSampleKeys(size_t nKeys) const;
+ template <class TContainer>
+ TVector<TContainer> GetSampleVectorData(size_t nValues);
+ template <class TContainer>
+ TVector<TContainer> GetSampleTextVectorData(size_t nValues);
+ template <class T>
+ void CheckEquality(const T& value1, const T& value2) const;
+ template <class TContainer>
+ void TestTrieWithContainers(const TVector<TUtf16String>& keys, const TVector<TContainer>& sampleData, TString methodName);
+
+ template <typename TChar>
+ void TestSearchIterImpl();
+
+ template <class TTrie>
+ void TestFirstSymbolIteratorForTrie(const TTrie& trie, const TStringBuf& narrowAnswers);
+
+ template <typename TSymbol>
+ void TestFirstSymbolIterator();
+
+ template <class T>
+ class TIntPacker;
+ template <class T>
+ class TDummyPacker;
+ class TStrokaPacker;
+
+public:
+ void TestPackers();
+
+ void TestTrie8();
+ void TestTrie16();
+ void TestTrie32();
+
+ void TestFastTrie8();
+ void TestFastTrie16();
+ void TestFastTrie32();
+
+ void TestMinimizedTrie8();
+ void TestMinimizedTrie16();
+ void TestMinimizedTrie32();
+
+ void TestFastMinimizedTrie8();
+ void TestFastMinimizedTrie16();
+ void TestFastMinimizedTrie32();
+
+ void TestTrieIterator8();
+ void TestTrieIterator16();
+ void TestTrieIterator32();
+
+ void TestMinimizedTrieIterator8();
+ void TestMinimizedTrieIterator16();
+ void TestMinimizedTrieIterator32();
+
+ void TestPhraseSearch();
+ void TestAddGet();
+ void TestEmpty();
+ void TestUninitializedNonEmpty();
+ void TestRandom();
+ void TestFindTails();
+ void TestPrefixGrouped();
+ void CrashTestPrefixGrouped();
+ void TestMergeFromFile();
+ void TestMergeFromBuffer();
+ void TestUnique();
+ void TestAddRetValue();
+ void TestClear();
+
+ void TestIterateEmptyKey();
+
+ void TestTrieSet();
+
+ void TestTrieForVectorInt64();
+ void TestTrieForListInt64();
+ void TestTrieForSetInt64();
+
+ void TestTrieForVectorStroka();
+ void TestTrieForListStroka();
+ void TestTrieForSetStroka();
+
+ void TestTrieForVectorWtroka();
+ void TestTrieForVectorFloat();
+ void TestTrieForVectorDouble();
+
+ void TestTrieForListVectorInt64();
+ void TestTrieForPairWtrokaVectorInt64();
+
+ void TestEmptyValueOutOfOrder();
+ void TestFindLongestPrefixWithEmptyValue();
+
+ void TestSearchIterChar();
+ void TestSearchIterWchar();
+ void TestSearchIterWchar32();
+
+ void TestCopyAndAssignment();
+
+ void TestFirstSymbolIterator8();
+ void TestFirstSymbolIterator16();
+ void TestFirstSymbolIterator32();
+ void TestFirstSymbolIteratorChar32();
+
+ void TestArrayPacker();
+
+ void TestBuilderFindLongestPrefix();
+ void TestBuilderFindLongestPrefix(size_t keysCount, double branchProbability, bool isPrefixGrouped, bool hasEmptyKey);
+ void TestBuilderFindLongestPrefixWithEmptyValue();
+
+ void TestPatternSearcherOnDataset(
+ const TVector<TString>& patterns,
+ const TVector<TString>& samples
+ );
+ void TestPatternSearcherEmpty();
+ void TestPatternSearcherSimple();
+ void TestPatternSearcherRandom(
+ size_t patternsNum,
+ size_t patternMaxLength,
+ size_t strMaxLength,
+ int maxChar,
+ TFastRng<ui64>& rng
+ );
+ void TestPatternSearcherRandom();
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TCompactTrieTest);
+
+const char* TCompactTrieTest::SampleData[] = {
+ "",
+ "a", "b", "c", "d",
+ "aa", "ab", "ac", "ad",
+ "aaa", "aab", "aac", "aad",
+ "aba", "abb", "abc", "abd",
+ "fba", "fbb", "fbc", "fbd",
+ "fbbaa",
+ "c\x85\xA4\xBF" // Just something outside ASCII.
+};
+
+template <class T>
+typename TCompactTrie<T>::TKey MakeWideKey(const char* str, size_t len) {
+ typename TCompactTrie<T>::TKey buffer;
+ for (size_t i = 0; i < len; i++) {
+ unsigned int ch = (str[i] & 0xFF);
+ buffer.push_back((T)(ch | ch << 8 | ch << 16 | ch << 24));
+ }
+ return buffer;
+}
+
+template <class T>
+typename TCompactTrie<T>::TKey MakeWideKey(const TString& str) {
+ return MakeWideKey<T>(str.c_str(), str.length());
+}
+
+template <class T>
+typename TCompactTrie<T>::TKey MakeWideKey(const TStringBuf& buf) {
+ return MakeWideKey<T>(buf.data(), buf.length());
+}
+
+template <class T>
+void TCompactTrieTest::CreateTrie(IOutputStream& out, bool minimize, bool useFastLayout) {
+ TCompactTrieBuilder<T> builder;
+
+ for (auto& i : SampleData) {
+ size_t len = strlen(i);
+
+ builder.Add(MakeWideKey<T>(i, len), len * 2);
+ }
+
+ TBufferOutput tmp2;
+ IOutputStream& currentOutput = useFastLayout ? tmp2 : out;
+ if (minimize) {
+ TBufferOutput buftmp;
+ builder.Save(buftmp);
+ CompactTrieMinimize<TCompactTriePacker<ui64>>(currentOutput, buftmp.Buffer().Data(), buftmp.Buffer().Size(), false);
+ } else {
+ builder.Save(currentOutput);
+ }
+ if (useFastLayout) {
+ CompactTrieMakeFastLayout<TCompactTriePacker<T>>(out, tmp2.Buffer().Data(), tmp2.Buffer().Size(), false);
+ }
+}
+
+// Iterates over all strings of length <= 4 made of letters a-g.
+static bool LexicographicStep(TString& s) {
+ if (s.length() < 4) {
+ s += "a";
+ return true;
+ }
+ while (!s.empty() && s.back() == 'g')
+ s.pop_back();
+ if (s.empty())
+ return false;
+ char last = s.back();
+ last++;
+ s.pop_back();
+ s.push_back(last);
+ return true;
+}
+
+template <class T>
+void TCompactTrieTest::CheckUpperBound(const char* data, size_t datalen) {
+ TCompactTrie<T> trie(data, datalen);
+ typedef typename TCompactTrie<T>::TKey TKey;
+ typedef typename TCompactTrie<T>::TData TData;
+
+ TString key;
+ do {
+ const TKey wideKey = MakeWideKey<T>(key);
+ typename TCompactTrie<T>::TConstIterator it = trie.UpperBound(wideKey);
+ UNIT_ASSERT_C(it == trie.End() || it.GetKey() >= wideKey, "key=" + key);
+ TData data;
+ const bool found = trie.Find(wideKey, &data);
+ if (found)
+ UNIT_ASSERT_C(it.GetKey() == wideKey && it.GetValue() == data, "key=" + key);
+ if (it != trie.Begin())
+ UNIT_ASSERT_C((--it).GetKey() < wideKey, "key=" + key);
+ } while (LexicographicStep(key));
+}
+
+template <class T>
+void TCompactTrieTest::CheckData(const char* data, size_t datalen) {
+ TCompactTrie<T> trie(data, datalen);
+
+ UNIT_ASSERT_VALUES_EQUAL(Y_ARRAY_SIZE(SampleData), trie.Size());
+
+ for (auto& i : SampleData) {
+ size_t len = strlen(i);
+ ui64 value = 0;
+ size_t prefixLen = 0;
+
+ typename TCompactTrie<T>::TKey key = MakeWideKey<T>(i, len);
+ UNIT_ASSERT(trie.Find(key, &value));
+ UNIT_ASSERT_EQUAL(len * 2, value);
+ UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value));
+ UNIT_ASSERT_EQUAL(len, prefixLen);
+ UNIT_ASSERT_EQUAL(len * 2, value);
+
+ TString badkey("bb");
+ badkey += i;
+ key = MakeWideKey<T>(badkey);
+ UNIT_ASSERT(!trie.Find(key));
+ value = 123;
+ UNIT_ASSERT(!trie.Find(key, &value));
+ UNIT_ASSERT_EQUAL(123, value);
+ UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value));
+ UNIT_ASSERT_EQUAL(1, prefixLen);
+ UNIT_ASSERT_EQUAL(2, value);
+
+ badkey = i;
+ badkey += "x";
+ key = MakeWideKey<T>(badkey);
+ UNIT_ASSERT(!trie.Find(key));
+ value = 1234;
+ UNIT_ASSERT(!trie.Find(key, &value));
+ UNIT_ASSERT_EQUAL(1234, value);
+ UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value));
+ UNIT_ASSERT_EQUAL(len, prefixLen);
+ UNIT_ASSERT_EQUAL(len * 2, value);
+ UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, nullptr));
+ UNIT_ASSERT_EQUAL(len, prefixLen);
+ }
+
+ TString testkey("fbbaa");
+ typename TCompactTrie<T>::TKey key = MakeWideKey<T>(testkey);
+ ui64 value = 0;
+ size_t prefixLen = 0;
+ UNIT_ASSERT(trie.FindLongestPrefix(key.data(), testkey.length() - 1, &prefixLen, &value));
+ UNIT_ASSERT_EQUAL(prefixLen, 3);
+ UNIT_ASSERT_EQUAL(6, value);
+
+ testkey = "fbbax";
+ key = MakeWideKey<T>(testkey);
+ UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value));
+ UNIT_ASSERT_EQUAL(prefixLen, 3);
+ UNIT_ASSERT_EQUAL(6, value);
+
+ value = 12345678;
+ UNIT_ASSERT(!trie.Find(key, &value));
+ UNIT_ASSERT_EQUAL(12345678, value); //Failed Find() should not change value
+}
+
+template <class T>
+void TCompactTrieTest::CheckIterator(const char* data, size_t datalen) {
+ typedef typename TCompactTrie<T>::TKey TKey;
+ typedef typename TCompactTrie<T>::TValueType TValue;
+ TMap<TKey, ui64> stored;
+
+ for (auto& i : SampleData) {
+ size_t len = strlen(i);
+
+ stored[MakeWideKey<T>(i, len)] = len * 2;
+ }
+
+ TCompactTrie<T> trie(data, datalen);
+ TVector<TValue> items;
+ typename TCompactTrie<T>::TConstIterator it = trie.Begin();
+ size_t entry_count = 0;
+ TMap<TKey, ui64> received;
+ while (it != trie.End()) {
+ UNIT_ASSERT_VALUES_EQUAL(it.GetKeySize(), it.GetKey().size());
+ received.insert(*it);
+ items.push_back(*it);
+ entry_count++;
+ it++;
+ }
+ TMap<TKey, ui64> received2;
+ for (std::pair<TKey, ui64> x : trie) {
+ received2.insert(x);
+ }
+ UNIT_ASSERT(entry_count == stored.size());
+ UNIT_ASSERT(received == stored);
+ UNIT_ASSERT(received2 == stored);
+
+ std::reverse(items.begin(), items.end());
+ typename TCompactTrie<T>::TConstIterator revIt = trie.End();
+ typename TCompactTrie<T>::TConstIterator const begin = trie.Begin();
+ typename TCompactTrie<T>::TConstIterator emptyIt;
+ size_t pos = 0;
+ while (revIt != begin) {
+ revIt--;
+ UNIT_ASSERT(*revIt == items[pos]);
+ pos++;
+ }
+ // Checking the assignment operator.
+ revIt = begin;
+ UNIT_ASSERT(revIt == trie.Begin());
+ UNIT_ASSERT(!revIt.IsEmpty());
+ UNIT_ASSERT(revIt != emptyIt);
+ UNIT_ASSERT(revIt != trie.End());
+ ++revIt; // Call a method that uses Skipper.
+ revIt = emptyIt;
+ UNIT_ASSERT(revIt == emptyIt);
+ UNIT_ASSERT(revIt.IsEmpty());
+ UNIT_ASSERT(revIt != trie.End());
+ // Checking the move assignment operator.
+ revIt = trie.Begin();
+ UNIT_ASSERT(revIt == trie.Begin());
+ UNIT_ASSERT(!revIt.IsEmpty());
+ UNIT_ASSERT(revIt != emptyIt);
+ UNIT_ASSERT(revIt != trie.End());
+ ++revIt; // Call a method that uses Skipper.
+ revIt = typename TCompactTrie<T>::TConstIterator();
+ UNIT_ASSERT(revIt == emptyIt);
+ UNIT_ASSERT(revIt.IsEmpty());
+ UNIT_ASSERT(revIt != trie.End());
+}
+
+template <class T>
+void TCompactTrieTest::TestTrie(bool minimize, bool useFastLayout) {
+ TBufferOutput bufout;
+ CreateTrie<T>(bufout, minimize, useFastLayout);
+ CheckData<T>(bufout.Buffer().Data(), bufout.Buffer().Size());
+ CheckUpperBound<T>(bufout.Buffer().Data(), bufout.Buffer().Size());
+}
+
+template <class T>
+void TCompactTrieTest::TestTrieIterator(bool minimize) {
+ TBufferOutput bufout;
+ CreateTrie<T>(bufout, minimize, false);
+ CheckIterator<T>(bufout.Buffer().Data(), bufout.Buffer().Size());
+}
+
+void TCompactTrieTest::TestTrie8() {
+ TestTrie<char>(false, false);
+}
+void TCompactTrieTest::TestTrie16() {
+ TestTrie<wchar16>(false, false);
+}
+void TCompactTrieTest::TestTrie32() {
+ TestTrie<wchar32>(false, false);
+}
+
+void TCompactTrieTest::TestFastTrie8() {
+ TestTrie<char>(false, true);
+}
+void TCompactTrieTest::TestFastTrie16() {
+ TestTrie<wchar16>(false, true);
+}
+void TCompactTrieTest::TestFastTrie32() {
+ TestTrie<wchar32>(false, true);
+}
+
+void TCompactTrieTest::TestMinimizedTrie8() {
+ TestTrie<char>(true, false);
+}
+void TCompactTrieTest::TestMinimizedTrie16() {
+ TestTrie<wchar16>(true, false);
+}
+void TCompactTrieTest::TestMinimizedTrie32() {
+ TestTrie<wchar32>(true, false);
+}
+
+void TCompactTrieTest::TestFastMinimizedTrie8() {
+ TestTrie<char>(true, true);
+}
+void TCompactTrieTest::TestFastMinimizedTrie16() {
+ TestTrie<wchar16>(true, true);
+}
+void TCompactTrieTest::TestFastMinimizedTrie32() {
+ TestTrie<wchar32>(true, true);
+}
+
+void TCompactTrieTest::TestTrieIterator8() {
+ TestTrieIterator<char>(false);
+}
+void TCompactTrieTest::TestTrieIterator16() {
+ TestTrieIterator<wchar16>(false);
+}
+void TCompactTrieTest::TestTrieIterator32() {
+ TestTrieIterator<wchar32>(false);
+}
+
+void TCompactTrieTest::TestMinimizedTrieIterator8() {
+ TestTrieIterator<char>(true);
+}
+void TCompactTrieTest::TestMinimizedTrieIterator16() {
+ TestTrieIterator<wchar16>(true);
+}
+void TCompactTrieTest::TestMinimizedTrieIterator32() {
+ TestTrieIterator<wchar32>(true);
+}
+
+void TCompactTrieTest::TestPhraseSearch() {
+ static const char* phrases[] = {"ab", "ab cd", "ab cd ef"};
+ static const char* const goodphrase = "ab cd ef gh";
+ static const char* const badphrase = "cd ef gh ab";
+ TBufferOutput bufout;
+
+ TCompactTrieBuilder<char> builder;
+ for (size_t i = 0; i < Y_ARRAY_SIZE(phrases); i++) {
+ builder.Add(phrases[i], strlen(phrases[i]), i);
+ }
+ builder.Save(bufout);
+
+ TCompactTrie<char> trie(bufout.Buffer().Data(), bufout.Buffer().Size());
+ TVector<TCompactTrie<char>::TPhraseMatch> matches;
+ trie.FindPhrases(goodphrase, strlen(goodphrase), matches);
+
+ UNIT_ASSERT(matches.size() == Y_ARRAY_SIZE(phrases));
+ for (size_t i = 0; i < Y_ARRAY_SIZE(phrases); i++) {
+ UNIT_ASSERT(matches[i].first == strlen(phrases[i]));
+ UNIT_ASSERT(matches[i].second == i);
+ }
+
+ trie.FindPhrases(badphrase, strlen(badphrase), matches);
+ UNIT_ASSERT(matches.size() == 0);
+}
+
+void TCompactTrieTest::TestAddGet() {
+ TCompactTrieBuilder<char> builder;
+ builder.Add("abcd", 4, 1);
+ builder.Add("acde", 4, 2);
+ ui64 dummy;
+ UNIT_ASSERT(builder.Find("abcd", 4, &dummy));
+ UNIT_ASSERT(1 == dummy);
+ UNIT_ASSERT(builder.Find("acde", 4, &dummy));
+ UNIT_ASSERT(2 == dummy);
+ UNIT_ASSERT(!builder.Find("fgdgfacde", 9, &dummy));
+ UNIT_ASSERT(!builder.Find("ab", 2, &dummy));
+}
+
+void TCompactTrieTest::TestEmpty() {
+ TCompactTrieBuilder<char> builder;
+ ui64 dummy = 12345;
+ size_t prefixLen;
+ UNIT_ASSERT(!builder.Find("abc", 3, &dummy));
+ TBufferOutput bufout;
+ builder.Save(bufout);
+
+ TCompactTrie<char> trie(bufout.Buffer().Data(), bufout.Buffer().Size());
+ UNIT_ASSERT(!trie.Find("abc", 3, &dummy));
+ UNIT_ASSERT(!trie.Find("", 0, &dummy));
+ UNIT_ASSERT(!trie.FindLongestPrefix("abc", 3, &prefixLen, &dummy));
+ UNIT_ASSERT(!trie.FindLongestPrefix("", 0, &prefixLen, &dummy));
+ UNIT_ASSERT_EQUAL(12345, dummy);
+
+ UNIT_ASSERT(trie.Begin() == trie.End());
+
+ TCompactTrie<> trieNull;
+
+ UNIT_ASSERT(!trieNull.Find(" ", 1));
+
+ TCompactTrie<>::TPhraseMatchVector matches;
+ trieNull.FindPhrases(" ", 1, matches); // just to be sure it doesn't crash
+
+ UNIT_ASSERT(trieNull.Begin() == trieNull.End());
+}
+
+void TCompactTrieTest::TestUninitializedNonEmpty() {
+ TBufferOutput bufout;
+ CreateTrie<char>(bufout, false, false);
+ TCompactTrie<char> trie(bufout.Buffer().Data(), bufout.Buffer().Size());
+ typedef TCompactTrie<char>::TKey TKey;
+ typedef TCompactTrie<char>::TConstIterator TIter;
+
+ TCompactTrie<char> tails = trie.FindTails("abd", 3); // A trie that has empty value and no data.
+ UNIT_ASSERT(!tails.IsEmpty());
+ UNIT_ASSERT(!tails.IsInitialized());
+ const TKey wideKey = MakeWideKey<char>("c", 1);
+ TIter it = tails.UpperBound(wideKey);
+ UNIT_ASSERT(it == tails.End());
+ UNIT_ASSERT(it != tails.Begin());
+ --it;
+ UNIT_ASSERT(it == tails.Begin());
+ ++it;
+ UNIT_ASSERT(it == tails.End());
+}
+
+static char RandChar() {
+ return char(RandomNumber<size_t>() % 256);
+}
+
+static TString RandStr(const size_t max) {
+ size_t len = RandomNumber<size_t>() % max;
+ TString key;
+ for (size_t j = 0; j < len; ++j)
+ key += RandChar();
+ return key;
+}
+
+template <class T, bool minimize>
+void TCompactTrieTest::TestRandom(const size_t n, const size_t maxKeySize) {
+ const TStringBuf EMPTY_KEY = TStringBuf("", 1);
+ TCompactTrieBuilder<char, typename T::TData, T> builder;
+ typedef TMap<TString, typename T::TData> TKeys;
+ TKeys keys;
+
+ typename T::TData dummy;
+ for (size_t i = 0; i < n; ++i) {
+ const TString key = RandStr(maxKeySize);
+ if (key != EMPTY_KEY && keys.find(key) == keys.end()) {
+ const typename T::TData val = T::Data(key);
+ keys[key] = val;
+ UNIT_ASSERT_C(!builder.Find(key.data(), key.size(), &dummy), "key = " << HexEncode(TString(key)));
+ builder.Add(key.data(), key.size(), val);
+ UNIT_ASSERT_C(builder.Find(key.data(), key.size(), &dummy), "key = " << HexEncode(TString(key)));
+ UNIT_ASSERT(dummy == val);
+ }
+ }
+
+ TBufferStream stream;
+ size_t len = builder.Save(stream);
+ TCompactTrie<char, typename T::TData, T> trie(stream.Buffer().Data(), len);
+
+ TBufferStream buftmp;
+ if (minimize) {
+ CompactTrieMinimize<T>(buftmp, stream.Buffer().Data(), len, false);
+ }
+ TCompactTrie<char, typename T::TData, T> trieMin(buftmp.Buffer().Data(), buftmp.Buffer().Size());
+
+ TCompactTrieBuilder<char, typename T::TData, T> prefixGroupedBuilder(CTBF_PREFIX_GROUPED);
+
+ for (typename TKeys::const_iterator i = keys.begin(), mi = keys.end(); i != mi; ++i) {
+ UNIT_ASSERT(!prefixGroupedBuilder.Find(i->first.c_str(), i->first.size(), &dummy));
+ UNIT_ASSERT(trie.Find(i->first.c_str(), i->first.size(), &dummy));
+ UNIT_ASSERT(dummy == i->second);
+ if (minimize) {
+ UNIT_ASSERT(trieMin.Find(i->first.c_str(), i->first.size(), &dummy));
+ UNIT_ASSERT(dummy == i->second);
+ }
+
+ prefixGroupedBuilder.Add(i->first.c_str(), i->first.size(), dummy);
+ UNIT_ASSERT(prefixGroupedBuilder.Find(i->first.c_str(), i->first.size(), &dummy));
+
+ for (typename TKeys::const_iterator j = keys.begin(), end = keys.end(); j != end; ++j) {
+ typename T::TData valFound;
+ if (j->first <= i->first) {
+ UNIT_ASSERT(prefixGroupedBuilder.Find(j->first.c_str(), j->first.size(), &valFound));
+ UNIT_ASSERT_VALUES_EQUAL(j->second, valFound);
+ } else {
+ UNIT_ASSERT(!prefixGroupedBuilder.Find(j->first.c_str(), j->first.size(), &valFound));
+ }
+ }
+ }
+
+ TBufferStream prefixGroupedBuffer;
+ prefixGroupedBuilder.Save(prefixGroupedBuffer);
+
+ UNIT_ASSERT_VALUES_EQUAL(stream.Buffer().Size(), prefixGroupedBuffer.Buffer().Size());
+ UNIT_ASSERT(0 == memcmp(stream.Buffer().Data(), prefixGroupedBuffer.Buffer().Data(), stream.Buffer().Size()));
+}
+
+void TCompactTrieTest::TestRandom() {
+ TestRandom<TIntPacker<ui64>, true>(1000, 1000);
+ TestRandom<TIntPacker<int>, true>(100, 100);
+ TestRandom<TDummyPacker<ui64>, true>(0, 0);
+ TestRandom<TDummyPacker<ui64>, true>(100, 3);
+ TestRandom<TDummyPacker<ui64>, true>(100, 100);
+ TestRandom<TStrokaPacker, true>(100, 100);
+}
+
+void TCompactTrieTest::TestFindTailsImpl(const TString& prefix) {
+ TCompactTrieBuilder<> builder;
+
+ TMap<TString, ui64> input;
+
+ for (auto& i : SampleData) {
+ TString temp = i;
+ ui64 val = temp.size() * 2;
+ builder.Add(temp.data(), temp.size(), val);
+ if (temp.StartsWith(prefix)) {
+ input[temp.substr(prefix.size())] = val;
+ }
+ }
+
+ typedef TCompactTrie<> TTrie;
+
+ TBufferStream stream;
+ size_t len = builder.Save(stream);
+ TTrie trie(stream.Buffer().Data(), len);
+
+ TTrie subtrie = trie.FindTails(prefix.data(), prefix.size());
+
+ TMap<TString, ui64> output;
+
+ for (TTrie::TConstIterator i = subtrie.Begin(), mi = subtrie.End(); i != mi; ++i) {
+ TTrie::TValueType val = *i;
+ output[TString(val.first.data(), val.first.size())] = val.second;
+ }
+ UNIT_ASSERT(input.size() == output.size());
+ UNIT_ASSERT(input == output);
+
+ TBufferStream buftmp;
+ CompactTrieMinimize<TTrie::TPacker>(buftmp, stream.Buffer().Data(), len, false);
+ TTrie trieMin(buftmp.Buffer().Data(), buftmp.Buffer().Size());
+
+ subtrie = trieMin.FindTails(prefix.data(), prefix.size());
+ output.clear();
+
+ for (TTrie::TConstIterator i = subtrie.Begin(), mi = subtrie.End(); i != mi; ++i) {
+ TTrie::TValueType val = *i;
+ output[TString(val.first.data(), val.first.size())] = val.second;
+ }
+ UNIT_ASSERT(input.size() == output.size());
+ UNIT_ASSERT(input == output);
+}
+
+void TCompactTrieTest::TestPrefixGrouped() {
+ TBuffer b1b;
+ TCompactTrieBuilder<char, ui32> b1(CTBF_PREFIX_GROUPED);
+ const char* data[] = {
+ "Kazan",
+ "Moscow",
+ "Monino",
+ "Murmansk",
+ "Fryanovo",
+ "Fryazino",
+ "Fryazevo",
+ "Tumen",
+ };
+
+ for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) {
+ ui32 val = strlen(data[i]) + 1;
+ b1.Add(data[i], strlen(data[i]), val);
+ for (size_t j = 0; j < Y_ARRAY_SIZE(data); ++j) {
+ ui32 mustHave = strlen(data[j]) + 1;
+ ui32 found = 0;
+ if (j <= i) {
+ UNIT_ASSERT(b1.Find(data[j], strlen(data[j]), &found));
+ UNIT_ASSERT_VALUES_EQUAL(mustHave, found);
+ } else {
+ UNIT_ASSERT(!b1.Find(data[j], strlen(data[j]), &found));
+ }
+ }
+ }
+
+ {
+ TBufferOutput b1bo(b1b);
+ b1.Save(b1bo);
+ }
+
+ TCompactTrie<char, ui32> t1(TBlob::FromBuffer(b1b));
+
+ //t1.Print(Cerr);
+
+ for (auto& i : data) {
+ ui32 v;
+ UNIT_ASSERT(t1.Find(i, strlen(i), &v));
+ UNIT_ASSERT_VALUES_EQUAL(strlen(i) + 1, v);
+ }
+}
+
+void TCompactTrieTest::CrashTestPrefixGrouped() {
+ TCompactTrieBuilder<char, ui32> builder(CTBF_PREFIX_GROUPED);
+ const char* data[] = {
+ "Fryazino",
+ "Fryanovo",
+ "Monino",
+ "",
+ "Fryazevo",
+ };
+ bool wasException = false;
+ try {
+ for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) {
+ builder.Add(data[i], strlen(data[i]), i + 1);
+ }
+ } catch (const yexception& e) {
+ wasException = true;
+ UNIT_ASSERT(strstr(e.what(), "Bad input order - expected input strings to be prefix-grouped."));
+ }
+ UNIT_ASSERT_C(wasException, "CrashTestPrefixGrouped");
+}
+
+void TCompactTrieTest::TestMergeFromFile() {
+ {
+ TCompactTrieBuilder<> b;
+ b.Add("yandex", 12);
+ b.Add("google", 13);
+ b.Add("mail", 14);
+ TUnbufferedFileOutput out(GetSystemTempDir() + "/TCompactTrieTest-TestMerge-ru");
+ b.Save(out);
+ }
+
+ {
+ TCompactTrieBuilder<> b;
+ b.Add("yandex", 112);
+ b.Add("google", 113);
+ b.Add("yahoo", 114);
+ TUnbufferedFileOutput out(GetSystemTempDir() + "/TCompactTrieTest-TestMerge-com");
+ b.Save(out);
+ }
+
+ {
+ TCompactTrieBuilder<> b;
+ UNIT_ASSERT(b.AddSubtreeInFile("com.", GetSystemTempDir() + "/TCompactTrieTest-TestMerge-com"));
+ UNIT_ASSERT(b.Add("org.kernel", 22));
+ UNIT_ASSERT(b.AddSubtreeInFile("ru.", GetSystemTempDir() + "/TCompactTrieTest-TestMerge-ru"));
+ TUnbufferedFileOutput out(GetSystemTempDir() + "/TCompactTrieTest-TestMerge-res");
+ b.Save(out);
+ }
+
+ TCompactTrie<> trie(TBlob::FromFileSingleThreaded(GetSystemTempDir() + "/TCompactTrieTest-TestMerge-res"));
+ UNIT_ASSERT_VALUES_EQUAL(12u, trie.Get("ru.yandex"));
+ UNIT_ASSERT_VALUES_EQUAL(13u, trie.Get("ru.google"));
+ UNIT_ASSERT_VALUES_EQUAL(14u, trie.Get("ru.mail"));
+ UNIT_ASSERT_VALUES_EQUAL(22u, trie.Get("org.kernel"));
+ UNIT_ASSERT_VALUES_EQUAL(112u, trie.Get("com.yandex"));
+ UNIT_ASSERT_VALUES_EQUAL(113u, trie.Get("com.google"));
+ UNIT_ASSERT_VALUES_EQUAL(114u, trie.Get("com.yahoo"));
+
+ unlink((GetSystemTempDir() + "/TCompactTrieTest-TestMerge-res").data());
+ unlink((GetSystemTempDir() + "/TCompactTrieTest-TestMerge-com").data());
+ unlink((GetSystemTempDir() + "/TCompactTrieTest-TestMerge-ru").data());
+}
+
+void TCompactTrieTest::TestMergeFromBuffer() {
+ TArrayWithSizeHolder<char> buffer1;
+ {
+ TCompactTrieBuilder<> b;
+ b.Add("aaaaa", 1);
+ b.Add("bbbbb", 2);
+ b.Add("ccccc", 3);
+ buffer1.Resize(b.MeasureByteSize());
+ TMemoryOutput out(buffer1.Get(), buffer1.Size());
+ b.Save(out);
+ }
+
+ TArrayWithSizeHolder<char> buffer2;
+ {
+ TCompactTrieBuilder<> b;
+ b.Add("aaaaa", 10);
+ b.Add("bbbbb", 20);
+ b.Add("ccccc", 30);
+ b.Add("xxxxx", 40);
+ b.Add("yyyyy", 50);
+ buffer2.Resize(b.MeasureByteSize());
+ TMemoryOutput out(buffer2.Get(), buffer2.Size());
+ b.Save(out);
+ }
+
+ {
+ TCompactTrieBuilder<> b;
+ UNIT_ASSERT(b.AddSubtreeInBuffer("com.", std::move(buffer1)));
+ UNIT_ASSERT(b.Add("org.upyachka", 42));
+ UNIT_ASSERT(b.AddSubtreeInBuffer("ru.", std::move(buffer2)));
+ TUnbufferedFileOutput out(GetSystemTempDir() + "/TCompactTrieTest-TestMergeFromBuffer-res");
+ b.Save(out);
+ }
+
+ TCompactTrie<> trie(TBlob::FromFileSingleThreaded(GetSystemTempDir() + "/TCompactTrieTest-TestMergeFromBuffer-res"));
+ UNIT_ASSERT_VALUES_EQUAL(10u, trie.Get("ru.aaaaa"));
+ UNIT_ASSERT_VALUES_EQUAL(20u, trie.Get("ru.bbbbb"));
+ UNIT_ASSERT_VALUES_EQUAL(40u, trie.Get("ru.xxxxx"));
+ UNIT_ASSERT_VALUES_EQUAL(42u, trie.Get("org.upyachka"));
+ UNIT_ASSERT_VALUES_EQUAL(1u, trie.Get("com.aaaaa"));
+ UNIT_ASSERT_VALUES_EQUAL(2u, trie.Get("com.bbbbb"));
+ UNIT_ASSERT_VALUES_EQUAL(3u, trie.Get("com.ccccc"));
+
+ unlink((GetSystemTempDir() + "/TCompactTrieTest-TestMergeFromBuffer-res").data());
+}
+
+void TCompactTrieTest::TestUnique() {
+ TestUniqueImpl(false);
+ TestUniqueImpl(true);
+}
+
+void TCompactTrieTest::TestUniqueImpl(bool isPrefixGrouped) {
+ TCompactTrieBuilder<char, ui32> builder(CTBF_UNIQUE | (isPrefixGrouped ? CTBF_PREFIX_GROUPED : CTBF_NONE));
+ const char* data[] = {
+ "Kazan",
+ "Moscow",
+ "Monino",
+ "Murmansk",
+ "Fryanovo",
+ "Fryazino",
+ "Fryazevo",
+ "Fry",
+ "Tumen",
+ };
+ for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) {
+ UNIT_ASSERT_C(builder.Add(data[i], strlen(data[i]), i + 1), i);
+ }
+ bool wasException = false;
+ try {
+ builder.Add(data[4], strlen(data[4]), 20);
+ } catch (const yexception& e) {
+ wasException = true;
+ UNIT_ASSERT(strstr(e.what(), "Duplicate key"));
+ }
+ UNIT_ASSERT_C(wasException, "TestUnique");
+}
+
+void TCompactTrieTest::TestAddRetValue() {
+ TCompactTrieBuilder<char, ui32> builder;
+ const char* data[] = {
+ "Kazan",
+ "Moscow",
+ "Monino",
+ "Murmansk",
+ "Fryanovo",
+ "Fryazino",
+ "Fryazevo",
+ "Fry",
+ "Tumen",
+ };
+ for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) {
+ UNIT_ASSERT(builder.Add(data[i], strlen(data[i]), i + 1));
+ UNIT_ASSERT(!builder.Add(data[i], strlen(data[i]), i + 2));
+ ui32 value;
+ UNIT_ASSERT(builder.Find(data[i], strlen(data[i]), &value));
+ UNIT_ASSERT(value == i + 2);
+ }
+}
+
+void TCompactTrieTest::TestClear() {
+ TCompactTrieBuilder<char, ui32> builder;
+ const char* data[] = {
+ "Kazan",
+ "Moscow",
+ "Monino",
+ "Murmansk",
+ "Fryanovo",
+ "Fryazino",
+ "Fryazevo",
+ "Fry",
+ "Tumen",
+ };
+ for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) {
+ builder.Add(data[i], strlen(data[i]), i + 1);
+ }
+ UNIT_ASSERT(builder.GetEntryCount() == Y_ARRAY_SIZE(data));
+ builder.Clear();
+ UNIT_ASSERT(builder.GetEntryCount() == 0);
+ UNIT_ASSERT(builder.GetNodeCount() == 1);
+}
+
+void TCompactTrieTest::TestFindTails() {
+ TestFindTailsImpl("aa");
+ TestFindTailsImpl("bb");
+ TestFindTailsImpl("fb");
+ TestFindTailsImpl("fbc");
+ TestFindTailsImpl("fbbaa");
+}
+
+template <class T>
+class TCompactTrieTest::TDummyPacker: public TNullPacker<T> {
+public:
+ static T Data(const TString&) {
+ T data;
+ TNullPacker<T>().UnpackLeaf(nullptr, data);
+ return data;
+ }
+
+ typedef T TData;
+};
+
+class TCompactTrieTest::TStrokaPacker: public TCompactTriePacker<TString> {
+public:
+ typedef TString TData;
+
+ static TString Data(const TString& str) {
+ return str;
+ }
+};
+
+template <class T>
+class TCompactTrieTest::TIntPacker: public TCompactTriePacker<T> {
+public:
+ typedef T TData;
+
+ static TData Data(const TString&) {
+ return RandomNumber<std::make_unsigned_t<T>>();
+ }
+};
+
+void TCompactTrieTest::TestIterateEmptyKey() {
+ TBuffer trieBuffer;
+ {
+ TCompactTrieBuilder<char, ui32> builder;
+ UNIT_ASSERT(builder.Add("", 1));
+ TBufferStream trieBufferO(trieBuffer);
+ builder.Save(trieBufferO);
+ }
+ TCompactTrie<char, ui32> trie(TBlob::FromBuffer(trieBuffer));
+ ui32 val;
+ UNIT_ASSERT(trie.Find("", &val));
+ UNIT_ASSERT(val == 1);
+ TCompactTrie<char, ui32>::TConstIterator it = trie.Begin();
+ UNIT_ASSERT(it.GetKey().empty());
+ UNIT_ASSERT(it.GetValue() == 1);
+}
+
+void TCompactTrieTest::TestTrieSet() {
+ TBuffer buffer;
+ {
+ TCompactTrieSet<char>::TBuilder builder;
+ UNIT_ASSERT(builder.Add("a", 0));
+ UNIT_ASSERT(builder.Add("ab", 1));
+ UNIT_ASSERT(builder.Add("abc", 1));
+ UNIT_ASSERT(builder.Add("abcd", 0));
+ UNIT_ASSERT(!builder.Add("abcd", 1));
+
+ TBufferStream stream(buffer);
+ builder.Save(stream);
+ }
+
+ TCompactTrieSet<char> set(TBlob::FromBuffer(buffer));
+ UNIT_ASSERT(set.Has("a"));
+ UNIT_ASSERT(set.Has("ab"));
+ UNIT_ASSERT(set.Has("abc"));
+ UNIT_ASSERT(set.Has("abcd"));
+ UNIT_ASSERT(!set.Has("abcde"));
+ UNIT_ASSERT(!set.Has("aa"));
+ UNIT_ASSERT(!set.Has("b"));
+ UNIT_ASSERT(!set.Has(""));
+
+ TCompactTrieSet<char> tails;
+ UNIT_ASSERT(set.FindTails("a", tails));
+ UNIT_ASSERT(tails.Has("b"));
+ UNIT_ASSERT(tails.Has("bcd"));
+ UNIT_ASSERT(!tails.Has("ab"));
+ UNIT_ASSERT(!set.Has(""));
+
+ TCompactTrieSet<char> empty;
+ UNIT_ASSERT(set.FindTails("abcd", empty));
+ UNIT_ASSERT(!empty.Has("a"));
+ UNIT_ASSERT(!empty.Has("b"));
+ UNIT_ASSERT(!empty.Has("c"));
+ UNIT_ASSERT(!empty.Has("d"));
+ UNIT_ASSERT(!empty.Has("d"));
+
+ UNIT_ASSERT(empty.Has("")); // contains only empty string
+}
+
+// Tests for trie with vector (list, set) values
+
+TVector<TUtf16String> TCompactTrieTest::GetSampleKeys(size_t nKeys) const {
+ Y_ASSERT(nKeys <= 10);
+ TString sampleKeys[] = {"a", "b", "ac", "bd", "abe", "bcf", "deg", "ah", "xy", "abc"};
+ TVector<TUtf16String> result;
+ for (size_t i = 0; i < nKeys; i++)
+ result.push_back(ASCIIToWide(sampleKeys[i]));
+ return result;
+}
+
+template <class TContainer>
+TVector<TContainer> TCompactTrieTest::GetSampleVectorData(size_t nValues) {
+ TVector<TContainer> data;
+ for (size_t i = 0; i < nValues; i++) {
+ data.push_back(TContainer());
+ for (size_t j = 0; j < i; j++)
+ data[i].insert(data[i].end(), (typename TContainer::value_type)((j == 3) ? 0 : (1 << (j * 5))));
+ }
+ return data;
+}
+
+template <class TContainer>
+TVector<TContainer> TCompactTrieTest::GetSampleTextVectorData(size_t nValues) {
+ TVector<TContainer> data;
+ for (size_t i = 0; i < nValues; i++) {
+ data.push_back(TContainer());
+ for (size_t j = 0; j < i; j++)
+ data[i].insert(data[i].end(), TString("abc") + ToString<size_t>(j));
+ }
+ return data;
+}
+
+template <class T>
+void TCompactTrieTest::CheckEquality(const T& value1, const T& value2) const {
+ UNIT_ASSERT_VALUES_EQUAL(value1, value2);
+}
+
+template <>
+void TCompactTrieTest::CheckEquality<TVector<i64>>(const TVector<i64>& value1, const TVector<i64>& value2) const {
+ UNIT_ASSERT_VALUES_EQUAL(value1.size(), value2.size());
+ for (size_t i = 0; i < value1.size(); i++)
+ UNIT_ASSERT_VALUES_EQUAL(value1[i], value2[i]);
+}
+
+template <class TContainer>
+void TCompactTrieTest::TestTrieWithContainers(const TVector<TUtf16String>& keys, const TVector<TContainer>& sampleData, TString methodName) {
+ TString fileName = GetSystemTempDir() + "/TCompactTrieTest-TestTrieWithContainers-" + methodName;
+
+ TCompactTrieBuilder<wchar16, TContainer> b;
+ for (size_t i = 0; i < keys.size(); i++) {
+ b.Add(keys[i], sampleData[i]);
+ }
+ TUnbufferedFileOutput out(fileName);
+ b.Save(out);
+
+ TCompactTrie<wchar16, TContainer> trie(TBlob::FromFileSingleThreaded(fileName));
+ for (size_t i = 0; i < keys.size(); i++) {
+ TContainer value = trie.Get(keys[i]);
+ UNIT_ASSERT_VALUES_EQUAL(value.size(), sampleData[i].size());
+ typename TContainer::const_iterator p = value.begin();
+ typename TContainer::const_iterator p1 = sampleData[i].begin();
+ for (; p != value.end(); p++, p1++)
+ CheckEquality<typename TContainer::value_type>(*p, *p1);
+ }
+
+ unlink(fileName.data());
+}
+
+template <>
+void TCompactTrieTest::TestTrieWithContainers<std::pair<TUtf16String, TVector<i64>>>(const TVector<TUtf16String>& keys, const TVector<std::pair<TUtf16String, TVector<i64>>>& sampleData, TString methodName) {
+ typedef std::pair<TUtf16String, TVector<i64>> TContainer;
+ TString fileName = GetSystemTempDir() + "/TCompactTrieTest-TestTrieWithContainers-" + methodName;
+
+ TCompactTrieBuilder<wchar16, TContainer> b;
+ for (size_t i = 0; i < keys.size(); i++) {
+ b.Add(keys[i], sampleData[i]);
+ }
+ TUnbufferedFileOutput out(fileName);
+ b.Save(out);
+
+ TCompactTrie<wchar16, TContainer> trie(TBlob::FromFileSingleThreaded(fileName));
+ for (size_t i = 0; i < keys.size(); i++) {
+ TContainer value = trie.Get(keys[i]);
+ CheckEquality<TContainer::first_type>(value.first, sampleData[i].first);
+ CheckEquality<TContainer::second_type>(value.second, sampleData[i].second);
+ }
+
+ unlink(fileName.data());
+}
+
+void TCompactTrieTest::TestTrieForVectorInt64() {
+ TestTrieWithContainers<TVector<i64>>(GetSampleKeys(10), GetSampleVectorData<TVector<i64>>(10), "v-i64");
+}
+
+void TCompactTrieTest::TestTrieForListInt64() {
+ TestTrieWithContainers<TList<i64>>(GetSampleKeys(10), GetSampleVectorData<TList<i64>>(10), "l-i64");
+}
+
+void TCompactTrieTest::TestTrieForSetInt64() {
+ TestTrieWithContainers<TSet<i64>>(GetSampleKeys(10), GetSampleVectorData<TSet<i64>>(10), "s-i64");
+}
+
+void TCompactTrieTest::TestTrieForVectorStroka() {
+ TestTrieWithContainers<TVector<TString>>(GetSampleKeys(10), GetSampleTextVectorData<TVector<TString>>(10), "v-str");
+}
+
+void TCompactTrieTest::TestTrieForListStroka() {
+ TestTrieWithContainers<TList<TString>>(GetSampleKeys(10), GetSampleTextVectorData<TList<TString>>(10), "l-str");
+}
+
+void TCompactTrieTest::TestTrieForSetStroka() {
+ TestTrieWithContainers<TSet<TString>>(GetSampleKeys(10), GetSampleTextVectorData<TSet<TString>>(10), "s-str");
+}
+
+void TCompactTrieTest::TestTrieForVectorWtroka() {
+ TVector<TVector<TString>> data = GetSampleTextVectorData<TVector<TString>>(10);
+ TVector<TVector<TUtf16String>> wData;
+ for (size_t i = 0; i < data.size(); i++) {
+ wData.push_back(TVector<TUtf16String>());
+ for (size_t j = 0; j < data[i].size(); j++)
+ wData[i].push_back(UTF8ToWide(data[i][j]));
+ }
+ TestTrieWithContainers<TVector<TUtf16String>>(GetSampleKeys(10), wData, "v-wtr");
+}
+
+void TCompactTrieTest::TestTrieForVectorFloat() {
+ TestTrieWithContainers<TVector<float>>(GetSampleKeys(10), GetSampleVectorData<TVector<float>>(10), "v-float");
+}
+
+void TCompactTrieTest::TestTrieForVectorDouble() {
+ TestTrieWithContainers<TVector<double>>(GetSampleKeys(10), GetSampleVectorData<TVector<double>>(10), "v-double");
+}
+
+void TCompactTrieTest::TestTrieForListVectorInt64() {
+ TVector<i64> tmp;
+ tmp.push_back(0);
+ TList<TVector<i64>> dataElement(5, tmp);
+ TVector<TList<TVector<i64>>> data(10, dataElement);
+ TestTrieWithContainers<TList<TVector<i64>>>(GetSampleKeys(10), data, "l-v-i64");
+}
+
+void TCompactTrieTest::TestTrieForPairWtrokaVectorInt64() {
+ TVector<TUtf16String> keys = GetSampleKeys(10);
+ TVector<TVector<i64>> values = GetSampleVectorData<TVector<i64>>(10);
+ TVector<std::pair<TUtf16String, TVector<i64>>> data;
+ for (size_t i = 0; i < 10; i++)
+ data.push_back(std::pair<TUtf16String, TVector<i64>>(keys[i] + u"_v", values[i]));
+ TestTrieWithContainers<std::pair<TUtf16String, TVector<i64>>>(keys, data, "pair-str-v-i64");
+}
+
+void TCompactTrieTest::TestEmptyValueOutOfOrder() {
+ TBufferOutput buffer;
+ using TSymbol = ui32;
+ {
+ TCompactTrieBuilder<TSymbol, ui32> builder;
+ TSymbol key = 1;
+ builder.Add(&key, 1, 10);
+ builder.Add(nullptr, 0, 14);
+ builder.Save(buffer);
+ }
+ {
+ TCompactTrie<TSymbol, ui32> trie(buffer.Buffer().Data(), buffer.Buffer().Size());
+ UNIT_ASSERT(trie.Find(nullptr, 0));
+ }
+}
+
+void TCompactTrieTest::TestFindLongestPrefixWithEmptyValue() {
+ TBufferOutput buffer;
+ {
+ TCompactTrieBuilder<wchar16, ui32> builder;
+ builder.Add(u"", 42);
+ builder.Add(u"yandex", 271828);
+ builder.Add(u"ya", 31415);
+ builder.Save(buffer);
+ }
+ {
+ TCompactTrie<wchar16, ui32> trie(buffer.Buffer().Data(), buffer.Buffer().Size());
+ size_t prefixLen = 123;
+ ui32 value = 0;
+
+ UNIT_ASSERT(trie.FindLongestPrefix(u"google", &prefixLen, &value));
+ UNIT_ASSERT(prefixLen == 0);
+ UNIT_ASSERT(value == 42);
+
+ UNIT_ASSERT(trie.FindLongestPrefix(u"yahoo", &prefixLen, &value));
+ UNIT_ASSERT(prefixLen == 2);
+ UNIT_ASSERT(value == 31415);
+ }
+}
+
+template <typename TChar>
+struct TConvertKey {
+ static inline TString Convert(const TStringBuf& key) {
+ return ToString(key);
+ }
+};
+
+template <>
+struct TConvertKey<wchar16> {
+ static inline TUtf16String Convert(const TStringBuf& key) {
+ return UTF8ToWide(key);
+ }
+};
+
+template <>
+struct TConvertKey<wchar32> {
+ static inline TUtf32String Convert(const TStringBuf& key) {
+ return TUtf32String::FromUtf8(key);
+ }
+};
+
+template <class TSearchIter, class TKeyBuf>
+static void MoveIter(TSearchIter& iter, const TKeyBuf& key) {
+ for (size_t i = 0; i < key.length(); ++i) {
+ UNIT_ASSERT(iter.Advance(key[i]));
+ }
+}
+
+template <typename TChar>
+void TCompactTrieTest::TestSearchIterImpl() {
+ TBufferOutput buffer;
+ {
+ TCompactTrieBuilder<TChar, ui32> builder;
+ TStringBuf data[] = {
+ TStringBuf("abaab"),
+ TStringBuf("abcdef"),
+ TStringBuf("abbbc"),
+ TStringBuf("bdfaa"),
+ };
+ for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) {
+ builder.Add(TConvertKey<TChar>::Convert(data[i]), i + 1);
+ }
+ builder.Save(buffer);
+ }
+
+ TCompactTrie<TChar, ui32> trie(buffer.Buffer().Data(), buffer.Buffer().Size());
+ ui32 value = 0;
+ auto iter(MakeSearchIterator(trie));
+ MoveIter(iter, TConvertKey<TChar>::Convert(TStringBuf("abc")));
+ UNIT_ASSERT(!iter.GetValue(&value));
+
+ iter = MakeSearchIterator(trie);
+ MoveIter(iter, TConvertKey<TChar>::Convert(TStringBuf("abbbc")));
+ UNIT_ASSERT(iter.GetValue(&value));
+ UNIT_ASSERT_EQUAL(value, 3);
+
+ iter = MakeSearchIterator(trie);
+ UNIT_ASSERT(iter.Advance(TConvertKey<TChar>::Convert(TStringBuf("bdfa"))));
+ UNIT_ASSERT(!iter.GetValue(&value));
+
+ iter = MakeSearchIterator(trie);
+ UNIT_ASSERT(iter.Advance(TConvertKey<TChar>::Convert(TStringBuf("bdfaa"))));
+ UNIT_ASSERT(iter.GetValue(&value));
+ UNIT_ASSERT_EQUAL(value, 4);
+
+ UNIT_ASSERT(!MakeSearchIterator(trie).Advance(TChar('z')));
+ UNIT_ASSERT(!MakeSearchIterator(trie).Advance(TConvertKey<TChar>::Convert(TStringBuf("cdf"))));
+ UNIT_ASSERT(!MakeSearchIterator(trie).Advance(TConvertKey<TChar>::Convert(TStringBuf("abca"))));
+}
+
+void TCompactTrieTest::TestSearchIterChar() {
+ TestSearchIterImpl<char>();
+}
+
+void TCompactTrieTest::TestSearchIterWchar() {
+ TestSearchIterImpl<wchar16>();
+}
+
+void TCompactTrieTest::TestSearchIterWchar32() {
+ TestSearchIterImpl<wchar32>();
+}
+
+void TCompactTrieTest::TestCopyAndAssignment() {
+ TBufferOutput bufout;
+ typedef TCompactTrie<> TTrie;
+ CreateTrie<char>(bufout, false, false);
+ TTrie trie(bufout.Buffer().Data(), bufout.Buffer().Size());
+ TTrie copy(trie);
+ UNIT_ASSERT(copy.HasCorrectSkipper());
+ TTrie assign;
+ assign = trie;
+ UNIT_ASSERT(assign.HasCorrectSkipper());
+ TTrie move(std::move(trie));
+ UNIT_ASSERT(move.HasCorrectSkipper());
+ TTrie moveAssign;
+ moveAssign = TTrie(bufout.Buffer().Data(), bufout.Buffer().Size());
+ UNIT_ASSERT(moveAssign.HasCorrectSkipper());
+}
+
+template <class TTrie>
+void TCompactTrieTest::TestFirstSymbolIteratorForTrie(const TTrie& trie, const TStringBuf& narrowAnswers) {
+ NCompactTrie::TFirstSymbolIterator<TTrie> it;
+ it.SetTrie(trie, trie.GetSkipper());
+ typename TTrie::TKey answers = MakeWideKey<typename TTrie::TSymbol>(narrowAnswers);
+ auto answer = answers.begin();
+ for (; !it.AtEnd(); it.MakeStep(), ++answer) {
+ UNIT_ASSERT(answer != answers.end());
+ UNIT_ASSERT(it.GetKey() == *answer);
+ }
+ UNIT_ASSERT(answer == answers.end());
+}
+
+template <class TSymbol>
+void TCompactTrieTest::TestFirstSymbolIterator() {
+ TBufferOutput bufout;
+ typedef TCompactTrie<TSymbol> TTrie;
+ CreateTrie<TSymbol>(bufout, false, false);
+ TTrie trie(bufout.Buffer().Data(), bufout.Buffer().Size());
+ TStringBuf rootAnswers = "abcdf";
+ TestFirstSymbolIteratorForTrie(trie, rootAnswers);
+ TStringBuf aAnswers = "abcd";
+ TestFirstSymbolIteratorForTrie(trie.FindTails(MakeWideKey<TSymbol>("a", 1)), aAnswers);
+}
+
+void TCompactTrieTest::TestFirstSymbolIterator8() {
+ TestFirstSymbolIterator<char>();
+}
+
+void TCompactTrieTest::TestFirstSymbolIterator16() {
+ TestFirstSymbolIterator<wchar16>();
+}
+
+void TCompactTrieTest::TestFirstSymbolIterator32() {
+ TestFirstSymbolIterator<ui32>();
+}
+
+void TCompactTrieTest::TestFirstSymbolIteratorChar32() {
+ TestFirstSymbolIterator<wchar32>();
+}
+
+
+void TCompactTrieTest::TestArrayPacker() {
+ using TDataInt = std::array<int, 2>;
+ const std::pair<TString, TDataInt> dataXxx{"xxx", {{15, 16}}};
+ const std::pair<TString, TDataInt> dataYyy{"yyy", {{20, 30}}};
+
+ TCompactTrieBuilder<char, TDataInt> trieBuilderOne;
+ trieBuilderOne.Add(dataXxx.first, dataXxx.second);
+ trieBuilderOne.Add(dataYyy.first, dataYyy.second);
+
+ TBufferOutput bufferOne;
+ trieBuilderOne.Save(bufferOne);
+
+ const TCompactTrie<char, TDataInt> trieOne(bufferOne.Buffer().Data(), bufferOne.Buffer().Size());
+ UNIT_ASSERT_VALUES_EQUAL(dataXxx.second, trieOne.Get(dataXxx.first));
+ UNIT_ASSERT_VALUES_EQUAL(dataYyy.second, trieOne.Get(dataYyy.first));
+
+ using TDataStroka = std::array<TString, 2>;
+ const std::pair<TString, TDataStroka> dataZzz{"zzz", {{"hello", "there"}}};
+ const std::pair<TString, TDataStroka> dataWww{"www", {{"half", "life"}}};
+
+ TCompactTrieBuilder<char, TDataStroka> trieBuilderTwo;
+ trieBuilderTwo.Add(dataZzz.first, dataZzz.second);
+ trieBuilderTwo.Add(dataWww.first, dataWww.second);
+
+ TBufferOutput bufferTwo;
+ trieBuilderTwo.Save(bufferTwo);
+
+ const TCompactTrie<char, TDataStroka> trieTwo(bufferTwo.Buffer().Data(), bufferTwo.Buffer().Size());
+ UNIT_ASSERT_VALUES_EQUAL(dataZzz.second, trieTwo.Get(dataZzz.first));
+ UNIT_ASSERT_VALUES_EQUAL(dataWww.second, trieTwo.Get(dataWww.first));
+}
+
+void TCompactTrieTest::TestBuilderFindLongestPrefix() {
+ const size_t sizes[] = {10, 100};
+ const double branchProbabilities[] = {0.01, 0.1, 0.5, 0.9, 0.99};
+ for (size_t size : sizes) {
+ for (double branchProbability : branchProbabilities) {
+ TestBuilderFindLongestPrefix(size, branchProbability, false, false);
+ TestBuilderFindLongestPrefix(size, branchProbability, false, true);
+ TestBuilderFindLongestPrefix(size, branchProbability, true, false);
+ TestBuilderFindLongestPrefix(size, branchProbability, true, true);
+ }
+ }
+}
+
+void TCompactTrieTest::TestBuilderFindLongestPrefix(size_t keysCount, double branchProbability, bool isPrefixGrouped, bool hasEmptyKey) {
+ TVector<TString> keys;
+ TString keyToAdd;
+ for (size_t i = 0; i < keysCount; ++i) {
+ const size_t prevKeyLen = keyToAdd.Size();
+ // add two random chars to prev key
+ keyToAdd += RandChar();
+ keyToAdd += RandChar();
+ const bool changeBranch = prevKeyLen && RandomNumber<double>() < branchProbability;
+ if (changeBranch) {
+ const size_t branchPlace = RandomNumber<size_t>(prevKeyLen + 1); // random place in [0, prevKeyLen]
+ *(keyToAdd.begin() + branchPlace) = RandChar();
+ }
+ keys.push_back(keyToAdd);
+ }
+
+ if (isPrefixGrouped)
+ Sort(keys.begin(), keys.end());
+ else
+ Shuffle(keys.begin(), keys.end());
+
+ TCompactTrieBuilder<char, TString> builder(isPrefixGrouped ? CTBF_PREFIX_GROUPED : CTBF_NONE);
+ const TString EMPTY_VALUE = "empty";
+ if (hasEmptyKey)
+ builder.Add(nullptr, 0, EMPTY_VALUE);
+
+ for (size_t i = 0; i < keysCount; ++i) {
+ const TString& key = keys[i];
+
+ for (size_t j = 0; j < keysCount; ++j) {
+ const TString& otherKey = keys[j];
+ const bool exists = j < i;
+ size_t expectedSize = 0;
+ if (exists) {
+ expectedSize = otherKey.size();
+ } else {
+ size_t max = 0;
+ for (size_t k = 0; k < i; ++k)
+ if (keys[k].Size() < otherKey.Size() && keys[k].Size() > max && otherKey.StartsWith(keys[k]))
+ max = keys[k].Size();
+ expectedSize = max;
+ }
+
+ size_t prefixSize = 0xfcfcfc;
+ TString value = "abcd";
+ const bool expectedResult = hasEmptyKey || expectedSize != 0;
+ UNIT_ASSERT_VALUES_EQUAL_C(expectedResult, builder.FindLongestPrefix(otherKey.data(), otherKey.size(), &prefixSize, &value), "otherKey = " << HexEncode(otherKey));
+ if (expectedResult) {
+ UNIT_ASSERT_VALUES_EQUAL(expectedSize, prefixSize);
+ if (expectedSize) {
+ UNIT_ASSERT_VALUES_EQUAL(TStringBuf(otherKey).SubStr(0, prefixSize), value);
+ } else {
+ UNIT_ASSERT_VALUES_EQUAL(EMPTY_VALUE, value);
+ }
+ } else {
+ UNIT_ASSERT_VALUES_EQUAL("abcd", value);
+ UNIT_ASSERT_VALUES_EQUAL(0xfcfcfc, prefixSize);
+ }
+
+ for (int c = 0; c < 10; ++c) {
+ TString extendedKey = otherKey;
+ extendedKey += RandChar();
+ size_t extendedPrefixSize = 0xdddddd;
+ TString extendedValue = "dcba";
+ UNIT_ASSERT_VALUES_EQUAL(expectedResult, builder.FindLongestPrefix(extendedKey.data(), extendedKey.size(), &extendedPrefixSize, &extendedValue));
+ if (expectedResult) {
+ UNIT_ASSERT_VALUES_EQUAL(value, extendedValue);
+ UNIT_ASSERT_VALUES_EQUAL(prefixSize, extendedPrefixSize);
+ } else {
+ UNIT_ASSERT_VALUES_EQUAL("dcba", extendedValue);
+ UNIT_ASSERT_VALUES_EQUAL(0xdddddd, extendedPrefixSize);
+ }
+ }
+ }
+ builder.Add(key.data(), key.size(), key);
+ }
+
+ TBufferOutput buffer;
+ builder.Save(buffer);
+}
+
+void TCompactTrieTest::TestBuilderFindLongestPrefixWithEmptyValue() {
+ TCompactTrieBuilder<wchar16, ui32> builder;
+ builder.Add(u"", 42);
+ builder.Add(u"yandex", 271828);
+ builder.Add(u"ya", 31415);
+
+ size_t prefixLen = 123;
+ ui32 value = 0;
+
+ UNIT_ASSERT(builder.FindLongestPrefix(u"google", &prefixLen, &value));
+ UNIT_ASSERT_VALUES_EQUAL(prefixLen, 0);
+ UNIT_ASSERT_VALUES_EQUAL(value, 42);
+
+ UNIT_ASSERT(builder.FindLongestPrefix(u"yahoo", &prefixLen, &value));
+ UNIT_ASSERT_VALUES_EQUAL(prefixLen, 2);
+ UNIT_ASSERT_VALUES_EQUAL(value, 31415);
+
+ TBufferOutput buffer;
+ builder.Save(buffer);
+}
+
+void TCompactTrieTest::TestPatternSearcherEmpty() {
+ TCompactPatternSearcherBuilder<char, ui32> builder;
+
+ TBufferOutput searcherData;
+ builder.Save(searcherData);
+
+ TCompactPatternSearcher<char, ui32> searcher(
+ searcherData.Buffer().Data(),
+ searcherData.Buffer().Size()
+ );
+
+ UNIT_ASSERT(searcher.SearchMatches("a").empty());
+ UNIT_ASSERT(searcher.SearchMatches("").empty());
+ UNIT_ASSERT(searcher.SearchMatches("abc").empty());
+}
+
+void TCompactTrieTest::TestPatternSearcherOnDataset(
+ const TVector<TString>& patterns,
+ const TVector<TString>& samples
+) {
+ TCompactPatternSearcherBuilder<char, ui32> builder;
+
+ for (size_t patternIdx = 0; patternIdx < patterns.size(); ++patternIdx) {
+ builder.Add(patterns[patternIdx], patternIdx);
+ }
+
+ TBufferOutput searcherData;
+ builder.Save(searcherData);
+
+ TCompactPatternSearcher<char, ui32> searcher(
+ searcherData.Buffer().Data(),
+ searcherData.Buffer().Size()
+ );
+
+ for (const auto& sample : samples) {
+ const auto matches = searcher.SearchMatches(sample);
+
+ size_t matchesNum = 0;
+ THashSet<TString> processedPatterns;
+ for (const auto& pattern : patterns) {
+ if (pattern.Empty() || processedPatterns.contains(pattern)) {
+ continue;
+ }
+ for (size_t start = 0; start + pattern.Size() <= sample.Size(); ++start) {
+ matchesNum += (pattern == sample.substr(start, pattern.Size()));
+ }
+ processedPatterns.insert(pattern);
+ }
+ UNIT_ASSERT_VALUES_EQUAL(matchesNum, matches.size());
+
+
+ TSet<std::pair<size_t, ui32>> foundMatches;
+ for (const auto& match : matches) {
+ std::pair<size_t, ui32> matchParams(match.End, match.Data);
+ UNIT_ASSERT(!foundMatches.contains(matchParams));
+ foundMatches.insert(matchParams);
+
+ const auto& pattern = patterns[match.Data];
+ UNIT_ASSERT_VALUES_EQUAL(
+ sample.substr(match.End - pattern.size() + 1, pattern.size()),
+ pattern
+ );
+ }
+ }
+}
+
+void TCompactTrieTest::TestPatternSearcherSimple() {
+ TestPatternSearcherOnDataset(
+ { // patterns
+ "abcd",
+ "abc",
+ "ab",
+ "a",
+ ""
+ },
+ { // samples
+ "abcde",
+ "abcd",
+ "abc",
+ "ab",
+ "a",
+ ""
+ }
+ );
+ TestPatternSearcherOnDataset(
+ { // patterns
+ "a"
+ "ab",
+ "abcd",
+ },
+ { // samples
+ "abcde",
+ "abcd",
+ "abc",
+ "ab",
+ "a",
+ ""
+ }
+ );
+ TestPatternSearcherOnDataset(
+ { // patterns
+ "aaaa",
+ "aaa",
+ "aa",
+ "a",
+ },
+ { // samples
+ "aaaaaaaaaaaa"
+ }
+ );
+ TestPatternSearcherOnDataset(
+ { // patterns
+ "aa", "ab", "ac", "ad", "ae", "af",
+ "ba", "bb", "bc", "bd", "be", "bf",
+ "ca", "cb", "cc", "cd", "ce", "cf",
+ "da", "db", "dc", "dd", "de", "df",
+ "ea", "eb", "ec", "ed", "ee", "ef",
+ "fa", "fb", "fc", "fd", "fe", "ff"
+ },
+ { // samples
+ "dcabafeebfdcbacddacadbaabecdbaeffecdbfabcdcabcfaefecdfebacfedacefbdcacfeb",
+ "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefancdefancdef",
+ "fedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcba",
+ "",
+ "a", "b", "c", "d", "e", "f",
+ "aa", "ab", "ac", "ad", "ae", "af",
+ "ba", "bb", "bc", "bd", "be", "bf",
+ "ca", "cb", "cc", "cd", "ce", "cf",
+ "da", "db", "dc", "dd", "de", "df",
+ "ea", "eb", "ec", "ed", "ee", "ef",
+ "fa", "fb", "fc", "fd", "fe", "ff"
+ }
+ );
+}
+
+static char RandChar(
+ TFastRng<ui64>& rng,
+ int maxChar
+) {
+ return static_cast<char>(rng.GenRand() % (maxChar + 1));
+}
+
+static TString RandStr(
+ TFastRng<ui64>& rng,
+ size_t maxLength,
+ int maxChar,
+ bool nonEmpty = false
+) {
+ Y_ASSERT(maxLength > 0);
+
+ size_t length;
+ if (nonEmpty) {
+ length = rng.GenRand() % maxLength + 1;
+ } else {
+ length = rng.GenRand() % (maxLength + 1);
+ }
+
+ TString result;
+ while (result.size() < length) {
+ result += RandChar(rng, maxChar);
+ }
+
+ return result;
+}
+
+void TCompactTrieTest::TestPatternSearcherRandom(
+ size_t patternsNum,
+ size_t patternMaxLength,
+ size_t strMaxLength,
+ int maxChar,
+ TFastRng<ui64>& rng
+) {
+ auto patternToSearch = RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true);
+
+ TVector<TString> patterns = {patternToSearch};
+ while (patterns.size() < patternsNum) {
+ patterns.push_back(RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true));
+ }
+
+ auto filler = RandStr(rng, strMaxLength - patternToSearch.Size() + 1, maxChar);
+ size_t leftFillerSize = rng.GenRand() % (filler.size() + 1);
+ auto leftFiller = filler.substr(0, leftFillerSize);
+ auto rightFiller = filler.substr(leftFillerSize, filler.size() - leftFillerSize);
+ auto sample = leftFiller + patternToSearch + rightFiller;
+
+ TestPatternSearcherOnDataset(patterns, {sample});
+}
+
+void TCompactTrieTest::TestPatternSearcherRandom() {
+ TFastRng<ui64> rng(0);
+ for (size_t patternMaxLen : {1, 2, 10}) {
+ for (size_t strMaxLen : TVector<size_t>{patternMaxLen, 2 * patternMaxLen, 10}) {
+ for (int maxChar : {0, 1, 5, 255}) {
+ for (size_t patternsNum : {1, 10}) {
+ for (size_t testIdx = 0; testIdx < 3; ++testIdx) {
+ TestPatternSearcherRandom(
+ patternsNum,
+ patternMaxLen,
+ strMaxLen,
+ maxChar,
+ rng
+ );
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/library/cpp/containers/comptrie/first_symbol_iterator.h b/library/cpp/containers/comptrie/first_symbol_iterator.h
new file mode 100644
index 0000000000..d06135f06f
--- /dev/null
+++ b/library/cpp/containers/comptrie/first_symbol_iterator.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include "opaque_trie_iterator.h"
+#include <util/generic/ptr.h>
+
+namespace NCompactTrie {
+ // Iterates over possible first symbols in a trie.
+ // Allows one to get the symbol and the subtrie starting from it.
+ template <class TTrie>
+ class TFirstSymbolIterator {
+ public:
+ using TSymbol = typename TTrie::TSymbol;
+ using TData = typename TTrie::TData;
+
+ void SetTrie(const TTrie& trie, const ILeafSkipper& skipper) {
+ Trie = trie;
+ Impl.Reset(new TOpaqueTrieIterator(
+ TOpaqueTrie(Trie.Data().AsCharPtr(), Trie.Data().Size(), skipper),
+ nullptr,
+ false,
+ sizeof(TSymbol)));
+ if (Impl->MeasureKey<TSymbol>() == 0) {
+ MakeStep();
+ }
+ }
+
+ const TTrie& GetTrie() const {
+ return Trie;
+ }
+
+ bool AtEnd() const {
+ return *Impl == TOpaqueTrieIterator(Impl->GetTrie(), nullptr, true, sizeof(TSymbol));
+ }
+
+ TSymbol GetKey() const {
+ return Impl->GetKey<TSymbol>()[0];
+ }
+
+ TTrie GetTails() const {
+ const TNode& node = Impl->GetNode();
+ const size_t forwardOffset = node.GetForwardOffset();
+ const char* emptyValue = node.IsFinal() ? Trie.Data().AsCharPtr() + node.GetLeafOffset() : nullptr;
+ if (forwardOffset) {
+ const char* start = Trie.Data().AsCharPtr() + forwardOffset;
+ TBlob body = TBlob::NoCopy(start, Trie.Data().Size() - forwardOffset);
+ return TTrie(body, emptyValue, Trie.GetPacker());
+ } else {
+ return TTrie(emptyValue);
+ }
+ }
+
+ void MakeStep() {
+ Impl->Forward();
+ }
+
+ private:
+ TTrie Trie;
+ TCopyPtr<TOpaqueTrieIterator> Impl;
+ };
+
+}
diff --git a/library/cpp/containers/comptrie/key_selector.h b/library/cpp/containers/comptrie/key_selector.h
new file mode 100644
index 0000000000..60466cef71
--- /dev/null
+++ b/library/cpp/containers/comptrie/key_selector.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <util/generic/vector.h>
+#include <util/generic/string.h>
+#include <util/generic/strbuf.h>
+
+template <class T>
+struct TCompactTrieKeySelector {
+ typedef TVector<T> TKey;
+ typedef TVector<T> TKeyBuf;
+};
+
+template <class TChar>
+struct TCompactTrieCharKeySelector {
+ typedef TBasicString<TChar> TKey;
+ typedef TBasicStringBuf<TChar> TKeyBuf;
+};
+
+template <>
+struct TCompactTrieKeySelector<char>: public TCompactTrieCharKeySelector<char> {
+};
+
+template <>
+struct TCompactTrieKeySelector<wchar16>: public TCompactTrieCharKeySelector<wchar16> {
+};
+
+template <>
+struct TCompactTrieKeySelector<wchar32>: public TCompactTrieCharKeySelector<wchar32> {
+};
diff --git a/library/cpp/containers/comptrie/leaf_skipper.h b/library/cpp/containers/comptrie/leaf_skipper.h
new file mode 100644
index 0000000000..3959258948
--- /dev/null
+++ b/library/cpp/containers/comptrie/leaf_skipper.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <cstddef>
+
+namespace NCompactTrie {
+ class ILeafSkipper {
+ public:
+ virtual size_t SkipLeaf(const char* p) const = 0;
+ virtual ~ILeafSkipper() = default;
+ };
+
+ template <class TPacker>
+ class TPackerLeafSkipper: public ILeafSkipper {
+ private:
+ const TPacker* Packer;
+
+ public:
+ TPackerLeafSkipper(const TPacker* packer)
+ : Packer(packer)
+ {
+ }
+
+ size_t SkipLeaf(const char* p) const override {
+ return Packer->SkipLeaf(p);
+ }
+
+ // For test purposes.
+ const TPacker* GetPacker() const {
+ return Packer;
+ }
+ };
+
+ // The data you need to traverse the trie without unpacking the values.
+ struct TOpaqueTrie {
+ const char* Data;
+ size_t Length;
+ const ILeafSkipper& SkipFunction;
+
+ TOpaqueTrie(const char* data, size_t dataLength, const ILeafSkipper& skipFunction)
+ : Data(data)
+ , Length(dataLength)
+ , SkipFunction(skipFunction)
+ {
+ }
+
+ bool operator==(const TOpaqueTrie& other) const {
+ return Data == other.Data &&
+ Length == other.Length &&
+ &SkipFunction == &other.SkipFunction;
+ }
+
+ bool operator!=(const TOpaqueTrie& other) const {
+ return !(*this == other);
+ }
+ };
+}
diff --git a/library/cpp/containers/comptrie/loader/loader.cpp b/library/cpp/containers/comptrie/loader/loader.cpp
new file mode 100644
index 0000000000..4811e9d521
--- /dev/null
+++ b/library/cpp/containers/comptrie/loader/loader.cpp
@@ -0,0 +1 @@
+#include "loader.h"
diff --git a/library/cpp/containers/comptrie/loader/loader.h b/library/cpp/containers/comptrie/loader/loader.h
new file mode 100644
index 0000000000..ee10e9b451
--- /dev/null
+++ b/library/cpp/containers/comptrie/loader/loader.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <library/cpp/archive/yarchive.h>
+#include <util/generic/string.h>
+#include <util/generic/ptr.h>
+#include <util/generic/yexception.h>
+#include <util/memory/blob.h>
+
+template <class TrieType, size_t N>
+TrieType LoadTrieFromArchive(const TString& key,
+ const unsigned char (&data)[N],
+ bool ignoreErrors = false) {
+ TArchiveReader archive(TBlob::NoCopy(data, sizeof(data)));
+ if (archive.Has(key)) {
+ TAutoPtr<IInputStream> trie = archive.ObjectByKey(key);
+ return TrieType(TBlob::FromStream(*trie));
+ }
+ if (!ignoreErrors) {
+ ythrow yexception() << "Resource " << key << " not found";
+ }
+ return TrieType();
+}
diff --git a/library/cpp/containers/comptrie/loader/loader_ut.cpp b/library/cpp/containers/comptrie/loader/loader_ut.cpp
new file mode 100644
index 0000000000..345063a31e
--- /dev/null
+++ b/library/cpp/containers/comptrie/loader/loader_ut.cpp
@@ -0,0 +1,30 @@
+#include <library/cpp/testing/unittest/registar.h>
+#include <library/cpp/containers/comptrie/comptrie.h>
+#include <library/cpp/containers/comptrie/loader/loader.h>
+
+using TDummyTrie = TCompactTrie<char, i32>;
+
+namespace {
+ const unsigned char DATA[] = {
+#include "data.inc"
+ };
+}
+
+Y_UNIT_TEST_SUITE(ArchiveLoaderTests) {
+ Y_UNIT_TEST(BaseTest) {
+ TDummyTrie trie = LoadTrieFromArchive<TDummyTrie>("/dummy.trie", DATA, true);
+ UNIT_ASSERT_EQUAL(trie.Size(), 3);
+
+ const TString TrieKyes[3] = {
+ "zero", "one", "two"};
+ i32 val = -1;
+ for (i32 i = 0; i < 3; ++i) {
+ UNIT_ASSERT(trie.Find(TrieKyes[i].data(), TrieKyes[i].size(), &val));
+ UNIT_ASSERT_EQUAL(i, val);
+ }
+
+ UNIT_CHECK_GENERATED_EXCEPTION(
+ LoadTrieFromArchive<TDummyTrie>("/noname.trie", DATA),
+ yexception);
+ }
+}
diff --git a/library/cpp/containers/comptrie/loader/ut/dummy.trie b/library/cpp/containers/comptrie/loader/ut/dummy.trie
new file mode 100644
index 0000000000..4af18add2f
--- /dev/null
+++ b/library/cpp/containers/comptrie/loader/ut/dummy.trie
Binary files differ
diff --git a/library/cpp/containers/comptrie/loader/ut/ya.make b/library/cpp/containers/comptrie/loader/ut/ya.make
new file mode 100644
index 0000000000..6c0334d3ea
--- /dev/null
+++ b/library/cpp/containers/comptrie/loader/ut/ya.make
@@ -0,0 +1,18 @@
+UNITTEST_FOR(library/cpp/containers/comptrie/loader)
+
+OWNER(my34)
+
+ARCHIVE(
+ NAME data.inc
+ dummy.trie
+)
+
+SRCS(
+ loader_ut.cpp
+)
+
+PEERDIR(
+ library/cpp/containers/comptrie/loader
+)
+
+END()
diff --git a/library/cpp/containers/comptrie/loader/ya.make b/library/cpp/containers/comptrie/loader/ya.make
new file mode 100644
index 0000000000..1e23e442a0
--- /dev/null
+++ b/library/cpp/containers/comptrie/loader/ya.make
@@ -0,0 +1,15 @@
+LIBRARY()
+
+OWNER(my34)
+
+SRCS(
+ loader.h
+ loader.cpp
+)
+
+PEERDIR(
+ library/cpp/archive
+ library/cpp/containers/comptrie
+)
+
+END()
diff --git a/library/cpp/containers/comptrie/make_fast_layout.cpp b/library/cpp/containers/comptrie/make_fast_layout.cpp
new file mode 100644
index 0000000000..ade78d7899
--- /dev/null
+++ b/library/cpp/containers/comptrie/make_fast_layout.cpp
@@ -0,0 +1,467 @@
+#include "make_fast_layout.h"
+#include "node.h"
+#include "writeable_node.h"
+#include "write_trie_backwards.h"
+#include "comptrie_impl.h"
+
+#include <util/generic/hash.h>
+#include <util/generic/utility.h>
+
+// Lay the trie in memory in such a way that there are less cache misses when jumping from root to leaf.
+// The trie becomes about 2% larger, but the access became about 25% faster in our experiments.
+// Can be called on minimized and non-minimized tries, in the first case in requires half a trie more memory.
+// Calling it the second time on the same trie does nothing.
+//
+// The algorithm is based on van Emde Boas layout as described in the yandex data school lectures on external memory algoritms
+// by Maxim Babenko and Ivan Puzyrevsky. The difference is that when we cut the tree into levels
+// two nodes connected by a forward link are put into the same level (because they usually lie near each other in the original tree).
+// The original paper (describing the layout in Section 2.1) is:
+// Michael A. Bender, Erik D. Demaine, Martin Farach-Colton. Cache-Oblivious B-Trees // SIAM Journal on Computing, volume 35, number 2, 2005, pages 341–358.
+// Available on the web: http://erikdemaine.org/papers/CacheObliviousBTrees_SICOMP/
+// Or: Michael A. Bender, Erik D. Demaine, and Martin Farach-Colton. Cache-Oblivious B-Trees // Proceedings of the 41st Annual Symposium
+// on Foundations of Computer Science (FOCS 2000), Redondo Beach, California, November 12–14, 2000, pages 399–409.
+// Available on the web: http://erikdemaine.org/papers/FOCS2000b/
+// (there is not much difference between these papers, actually).
+//
+namespace NCompactTrie {
+ static size_t FindSupportingPowerOf2(size_t n) {
+ size_t result = 1ull << (8 * sizeof(size_t) - 1);
+ while (result > n) {
+ result >>= 1;
+ }
+ return result;
+ }
+
+ namespace {
+ class TTrieNodeSet {
+ public:
+ TTrieNodeSet() = default;
+
+ explicit TTrieNodeSet(const TOpaqueTrie& trie)
+ : Body(trie.Length / (8 * MinNodeSize) + 1, 0)
+ {
+ }
+
+ bool Has(size_t offset) const {
+ const size_t reducedOffset = ReducedOffset(offset);
+ return OffsetCell(reducedOffset) & OffsetMask(reducedOffset);
+ }
+
+ void Add(size_t offset) {
+ const size_t reducedOffset = ReducedOffset(offset);
+ OffsetCell(reducedOffset) |= OffsetMask(reducedOffset);
+ }
+
+ void Remove(size_t offset) {
+ const size_t reducedOffset = ReducedOffset(offset);
+ OffsetCell(reducedOffset) &= ~OffsetMask(reducedOffset);
+ }
+
+ void Swap(TTrieNodeSet& other) {
+ Body.swap(other.Body);
+ }
+
+ private:
+ static const size_t MinNodeSize = 2;
+ TVector<ui8> Body;
+
+ static size_t ReducedOffset(size_t offset) {
+ return offset / MinNodeSize;
+ }
+ static ui8 OffsetMask(size_t reducedOffset) {
+ return 1 << (reducedOffset % 8);
+ }
+ ui8& OffsetCell(size_t reducedOffset) {
+ return Body.at(reducedOffset / 8);
+ }
+ const ui8& OffsetCell(size_t reducedOffset) const {
+ return Body.at(reducedOffset / 8);
+ }
+ };
+
+ //---------------------------------------------------------------------
+
+ class TTrieNodeCounts {
+ public:
+ TTrieNodeCounts() = default;
+
+ explicit TTrieNodeCounts(const TOpaqueTrie& trie)
+ : Body(trie.Length / MinNodeSize, 0)
+ , IsTree(false)
+ {
+ }
+
+ size_t Get(size_t offset) const {
+ return IsTree ? 1 : Body.at(offset / MinNodeSize);
+ }
+
+ void Inc(size_t offset) {
+ if (IsTree) {
+ return;
+ }
+ ui8& count = Body.at(offset / MinNodeSize);
+ if (count != MaxCount) {
+ ++count;
+ }
+ }
+
+ size_t Dec(size_t offset) {
+ if (IsTree) {
+ return 0;
+ }
+ ui8& count = Body.at(offset / MinNodeSize);
+ Y_ASSERT(count > 0);
+ if (count != MaxCount) {
+ --count;
+ }
+ return count;
+ }
+
+ void Swap(TTrieNodeCounts& other) {
+ Body.swap(other.Body);
+ ::DoSwap(IsTree, other.IsTree);
+ }
+
+ void SetTreeMode() {
+ IsTree = true;
+ Body = TVector<ui8>();
+ }
+
+ private:
+ static const size_t MinNodeSize = 2;
+ static const ui8 MaxCount = 255;
+
+ TVector<ui8> Body;
+ bool IsTree = false;
+ };
+
+ //----------------------------------------------------------
+
+ class TOffsetIndex {
+ public:
+ // In all methods:
+ // Key --- offset from the beginning of the old trie.
+ // Value --- offset from the end of the new trie.
+
+ explicit TOffsetIndex(TTrieNodeCounts& counts) {
+ ParentCounts.Swap(counts);
+ }
+
+ void Add(size_t key, size_t value) {
+ Data[key] = value;
+ }
+
+ size_t Size() const {
+ return Data.size();
+ }
+
+ size_t Get(size_t key) {
+ auto pos = Data.find(key);
+ if (pos == Data.end()) {
+ ythrow yexception() << "Bad node walking order: trying to get node offset too early or too many times!";
+ }
+ size_t result = pos->second;
+ if (ParentCounts.Dec(key) == 0) {
+ // We don't need this offset any more.
+ Data.erase(pos);
+ }
+ return result;
+ }
+
+ private:
+ TTrieNodeCounts ParentCounts;
+ THashMap<size_t, size_t> Data;
+ };
+
+ //---------------------------------------------------------------------------------------
+
+ class TTrieMeasurer {
+ public:
+ TTrieMeasurer(const TOpaqueTrie& trie, bool verbose);
+ void Measure();
+
+ size_t GetDepth() const {
+ return Depth;
+ }
+
+ size_t GetNodeCount() const {
+ return NodeCount;
+ }
+
+ size_t GetUnminimizedNodeCount() const {
+ return UnminimizedNodeCount;
+ }
+
+ bool IsTree() const {
+ return NodeCount == UnminimizedNodeCount;
+ }
+
+ TTrieNodeCounts& GetParentCounts() {
+ return ParentCounts;
+ }
+
+ const TOpaqueTrie& GetTrie() const {
+ return Trie;
+ }
+
+ private:
+ const TOpaqueTrie& Trie;
+ size_t Depth;
+ TTrieNodeCounts ParentCounts;
+ size_t NodeCount;
+ size_t UnminimizedNodeCount;
+ const bool Verbose;
+
+ // returns depth, increments NodeCount.
+ size_t MeasureSubtrie(size_t rootOffset, bool isNewPath);
+ };
+
+ TTrieMeasurer::TTrieMeasurer(const TOpaqueTrie& trie, bool verbose)
+ : Trie(trie)
+ , Depth(0)
+ , ParentCounts(trie)
+ , NodeCount(0)
+ , UnminimizedNodeCount(0)
+ , Verbose(verbose)
+ {
+ Y_ASSERT(Trie.Data);
+ }
+
+ void TTrieMeasurer::Measure() {
+ if (Verbose) {
+ Cerr << "Measuring the trie..." << Endl;
+ }
+ NodeCount = 0;
+ UnminimizedNodeCount = 0;
+ Depth = MeasureSubtrie(0, true);
+ if (IsTree()) {
+ ParentCounts.SetTreeMode();
+ }
+ if (Verbose) {
+ Cerr << "Unminimized node count: " << UnminimizedNodeCount << Endl;
+ Cerr << "Trie depth: " << Depth << Endl;
+ Cerr << "Node count: " << NodeCount << Endl;
+ }
+ }
+
+ // A chain of nodes linked by forward links
+ // is considered one node with many left and right children
+ // for depth measuring here and in
+ // TVanEmdeBoasReverseNodeEnumerator::FindDescendants.
+ size_t TTrieMeasurer::MeasureSubtrie(size_t rootOffset, bool isNewPath) {
+ Y_ASSERT(rootOffset < Trie.Length);
+ TNode node(Trie.Data, rootOffset, Trie.SkipFunction);
+ size_t depth = 0;
+ for (;;) {
+ ++UnminimizedNodeCount;
+ if (Verbose) {
+ ShowProgress(UnminimizedNodeCount);
+ }
+ if (isNewPath) {
+ if (ParentCounts.Get(node.GetOffset()) > 0) {
+ isNewPath = false;
+ } else {
+ ++NodeCount;
+ }
+ ParentCounts.Inc(node.GetOffset());
+ }
+ if (node.GetLeftOffset()) {
+ depth = Max(depth, 1 + MeasureSubtrie(node.GetLeftOffset(), isNewPath));
+ }
+ if (node.GetRightOffset()) {
+ depth = Max(depth, 1 + MeasureSubtrie(node.GetRightOffset(), isNewPath));
+ }
+ if (node.GetForwardOffset()) {
+ node = TNode(Trie.Data, node.GetForwardOffset(), Trie.SkipFunction);
+ } else {
+ break;
+ }
+ }
+ return depth;
+ }
+
+ //--------------------------------------------------------------------------------------
+
+ using TLevelNodes = TVector<size_t>;
+
+ struct TLevel {
+ size_t Depth;
+ TLevelNodes Nodes;
+
+ explicit TLevel(size_t depth)
+ : Depth(depth)
+ {
+ }
+ };
+
+ //----------------------------------------------------------------------------------------
+
+ class TVanEmdeBoasReverseNodeEnumerator: public TReverseNodeEnumerator {
+ public:
+ TVanEmdeBoasReverseNodeEnumerator(TTrieMeasurer& measurer, bool verbose)
+ : Fresh(true)
+ , Trie(measurer.GetTrie())
+ , Depth(measurer.GetDepth())
+ , MaxIndexSize(0)
+ , BackIndex(measurer.GetParentCounts())
+ , ProcessedNodes(measurer.GetTrie())
+ , Verbose(verbose)
+ {
+ }
+
+ bool Move() override {
+ if (!Fresh) {
+ CentralWord.pop_back();
+ }
+ if (CentralWord.empty()) {
+ return MoveCentralWordStart();
+ }
+ return true;
+ }
+
+ const TNode& Get() const {
+ return CentralWord.back();
+ }
+
+ size_t GetLeafLength() const override {
+ return Get().GetLeafLength();
+ }
+
+ // Returns recalculated offset from the end of the current node.
+ size_t PrepareOffset(size_t absoffset, size_t resultLength) {
+ if (!absoffset)
+ return NPOS;
+ return resultLength - BackIndex.Get(absoffset);
+ }
+
+ size_t RecreateNode(char* buffer, size_t resultLength) override {
+ TWriteableNode newNode(Get(), Trie.Data);
+ newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength);
+ newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength);
+ newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength);
+
+ const size_t len = newNode.Pack(buffer);
+ ProcessedNodes.Add(Get().GetOffset());
+ BackIndex.Add(Get().GetOffset(), resultLength + len);
+ MaxIndexSize = Max(MaxIndexSize, BackIndex.Size());
+ return len;
+ }
+
+ private:
+ bool Fresh;
+ TOpaqueTrie Trie;
+ size_t Depth;
+ size_t MaxIndexSize;
+
+ TVector<TLevel> Trace;
+ TOffsetIndex BackIndex;
+ TVector<TNode> CentralWord;
+ TTrieNodeSet ProcessedNodes;
+
+ const bool Verbose;
+
+ private:
+ bool IsVisited(size_t offset) const {
+ return ProcessedNodes.Has(offset);
+ }
+
+ void AddCascade(size_t step) {
+ Y_ASSERT(!(step & (step - 1))); // Should be a power of 2.
+ while (step > 0) {
+ size_t root = Trace.back().Nodes.back();
+ TLevel level(Trace.back().Depth + step);
+ Trace.push_back(level);
+ size_t actualStep = FindSupportingPowerOf2(FindDescendants(root, step, Trace.back().Nodes));
+ if (actualStep != step) {
+ // Retry with a smaller step.
+ Y_ASSERT(actualStep < step);
+ step = actualStep;
+ Trace.pop_back();
+ } else {
+ step /= 2;
+ }
+ }
+ }
+
+ void FillCentralWord() {
+ CentralWord.clear();
+ CentralWord.push_back(TNode(Trie.Data, Trace.back().Nodes.back(), Trie.SkipFunction));
+ // Do not check for epsilon links, as the traversal order now is different.
+ while (CentralWord.back().GetForwardOffset() && !IsVisited(CentralWord.back().GetForwardOffset())) {
+ CentralWord.push_back(TNode(Trie.Data, CentralWord.back().GetForwardOffset(), Trie.SkipFunction));
+ }
+ }
+
+ bool MoveCentralWordStart() {
+ do {
+ if (Fresh) {
+ TLevel root(0);
+ Trace.push_back(root);
+ Trace.back().Nodes.push_back(0);
+ const size_t sectionDepth = FindSupportingPowerOf2(Depth);
+ AddCascade(sectionDepth);
+ Fresh = false;
+ } else {
+ Trace.back().Nodes.pop_back();
+ if (Trace.back().Nodes.empty() && Trace.size() == 1) {
+ if (Verbose) {
+ Cerr << "Max index size: " << MaxIndexSize << Endl;
+ Cerr << "Current index size: " << BackIndex.Size() << Endl;
+ }
+ // We just popped the root.
+ return false;
+ }
+ size_t lastStep = Trace.back().Depth - Trace[Trace.size() - 2].Depth;
+ if (Trace.back().Nodes.empty()) {
+ Trace.pop_back();
+ }
+ AddCascade(lastStep / 2);
+ }
+ } while (IsVisited(Trace.back().Nodes.back()));
+ FillCentralWord();
+ return true;
+ }
+
+ // Returns the maximal depth it has reached while searching.
+ // This is a method because it needs OffsetIndex to skip processed nodes.
+ size_t FindDescendants(size_t rootOffset, size_t depth, TLevelNodes& result) const {
+ if (depth == 0) {
+ result.push_back(rootOffset);
+ return 0;
+ }
+ size_t actualDepth = 0;
+ TNode node(Trie.Data, rootOffset, Trie.SkipFunction);
+ for (;;) {
+ if (node.GetLeftOffset() && !IsVisited(node.GetLeftOffset())) {
+ actualDepth = Max(actualDepth,
+ 1 + FindDescendants(node.GetLeftOffset(), depth - 1, result));
+ }
+ if (node.GetRightOffset() && !IsVisited(node.GetRightOffset())) {
+ actualDepth = Max(actualDepth,
+ 1 + FindDescendants(node.GetRightOffset(), depth - 1, result));
+ }
+ if (node.GetForwardOffset() && !IsVisited(node.GetForwardOffset())) {
+ node = TNode(Trie.Data, node.GetForwardOffset(), Trie.SkipFunction);
+ } else {
+ break;
+ }
+ }
+ return actualDepth;
+ }
+ };
+
+ } // Anonymous namespace.
+
+ //-----------------------------------------------------------------------------------
+
+ size_t RawCompactTrieFastLayoutImpl(IOutputStream& os, const TOpaqueTrie& trie, bool verbose) {
+ if (!trie.Data || !trie.Length) {
+ return 0;
+ }
+ TTrieMeasurer mes(trie, verbose);
+ mes.Measure();
+ TVanEmdeBoasReverseNodeEnumerator enumerator(mes, verbose);
+ return WriteTrieBackwards(os, enumerator, verbose);
+ }
+
+}
diff --git a/library/cpp/containers/comptrie/make_fast_layout.h b/library/cpp/containers/comptrie/make_fast_layout.h
new file mode 100644
index 0000000000..b8fab5d65b
--- /dev/null
+++ b/library/cpp/containers/comptrie/make_fast_layout.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "leaf_skipper.h"
+#include <cstddef>
+
+class IOutputStream;
+
+namespace NCompactTrie {
+ // Return value: size of the resulting trie.
+ size_t RawCompactTrieFastLayoutImpl(IOutputStream& os, const NCompactTrie::TOpaqueTrie& trie, bool verbose);
+
+ // Return value: size of the resulting trie.
+ template <class TPacker>
+ size_t CompactTrieMakeFastLayoutImpl(IOutputStream& os, const char* data, size_t datalength, bool verbose, const TPacker* packer) {
+ TPackerLeafSkipper<TPacker> skipper(packer);
+ TOpaqueTrie trie(data, datalength, skipper);
+ return RawCompactTrieFastLayoutImpl(os, trie, verbose);
+ }
+
+}
diff --git a/library/cpp/containers/comptrie/minimize.cpp b/library/cpp/containers/comptrie/minimize.cpp
new file mode 100644
index 0000000000..80d0b25217
--- /dev/null
+++ b/library/cpp/containers/comptrie/minimize.cpp
@@ -0,0 +1,359 @@
+#include "minimize.h"
+#include "node.h"
+#include "writeable_node.h"
+#include "write_trie_backwards.h"
+#include "comptrie_impl.h"
+
+#include <util/generic/hash.h>
+#include <util/generic/algorithm.h>
+
+namespace NCompactTrie {
+ // Minimize the trie. The result is equivalent to the original
+ // trie, except that it takes less space (and has marginally lower
+ // performance, because of eventual epsilon links).
+ // The algorithm is as follows: starting from the largest pieces, we find
+ // nodes that have identical continuations (Daciuk's right language),
+ // and repack the trie. Repacking is done in-place, so memory is less
+ // of an issue; however, it may take considerable time.
+
+ // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
+ // Because of non-local structure and epsilon links, it won't work
+ // as you expect it to, and can destroy the trie in the making.
+
+ namespace {
+ using TOffsetList = TVector<size_t>;
+ using TPieceIndex = THashMap<size_t, TOffsetList>;
+
+ using TSizePair = std::pair<size_t, size_t>;
+ using TSizePairVector = TVector<TSizePair>;
+ using TSizePairVectorVector = TVector<TSizePairVector>;
+
+ class TOffsetMap {
+ protected:
+ TSizePairVectorVector Data;
+
+ public:
+ TOffsetMap() {
+ Data.reserve(0x10000);
+ }
+
+ void Add(size_t key, size_t value) {
+ size_t hikey = key & 0xFFFF;
+
+ if (Data.size() <= hikey)
+ Data.resize(hikey + 1);
+
+ TSizePairVector& sublist = Data[hikey];
+
+ for (auto& it : sublist) {
+ if (it.first == key) {
+ it.second = value;
+
+ return;
+ }
+ }
+
+ sublist.push_back(TSizePair(key, value));
+ }
+
+ bool Contains(size_t key) const {
+ return (Get(key) != 0);
+ }
+
+ size_t Get(size_t key) const {
+ size_t hikey = key & 0xFFFF;
+
+ if (Data.size() <= hikey)
+ return 0;
+
+ const TSizePairVector& sublist = Data[hikey];
+
+ for (const auto& it : sublist) {
+ if (it.first == key)
+ return it.second;
+ }
+
+ return 0;
+ }
+ };
+
+ class TOffsetDeltas {
+ protected:
+ TSizePairVector Data;
+
+ public:
+ void Add(size_t key, size_t value) {
+ if (Data.empty()) {
+ if (key == value)
+ return; // no offset
+ } else {
+ TSizePair last = Data.back();
+
+ if (key <= last.first) {
+ Cerr << "Trouble: elements to offset delta list added in wrong order" << Endl;
+
+ return;
+ }
+
+ if (last.first + value == last.second + key)
+ return; // same offset
+ }
+
+ Data.push_back(TSizePair(key, value));
+ }
+
+ size_t Get(size_t key) const {
+ if (Data.empty())
+ return key; // difference is zero;
+
+ if (key < Data.front().first)
+ return key;
+
+ // Binary search for the highest entry in the list that does not exceed the key
+ size_t from = 0;
+ size_t to = Data.size() - 1;
+
+ while (from < to) {
+ size_t midpoint = (from + to + 1) / 2;
+
+ if (key < Data[midpoint].first)
+ to = midpoint - 1;
+ else
+ from = midpoint;
+ }
+
+ TSizePair entry = Data[from];
+
+ return key - entry.first + entry.second;
+ }
+ };
+
+ class TPieceComparer {
+ private:
+ const char* Data;
+ const size_t Length;
+
+ public:
+ TPieceComparer(const char* buf, size_t len)
+ : Data(buf)
+ , Length(len)
+ {
+ }
+
+ bool operator()(size_t p1, const size_t p2) {
+ int res = memcmp(Data + p1, Data + p2, Length);
+
+ if (res)
+ return (res > 0);
+
+ return (p1 > p2); // the pieces are sorted in the reverse order of appearance
+ }
+ };
+
+ struct TBranchPoint {
+ TNode Node;
+ int Selector;
+
+ public:
+ TBranchPoint()
+ : Selector(0)
+ {
+ }
+
+ TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction)
+ : Node(data, offset, skipFunction)
+ , Selector(0)
+ {
+ }
+
+ bool IsFinal() const {
+ return Node.IsFinal();
+ }
+
+ // NextNode returns child nodes, starting from the last node: Right, then Left, then Forward
+ size_t NextNode(const TOffsetMap& mergedNodes) {
+ while (Selector < 3) {
+ size_t nextOffset = 0;
+
+ switch (++Selector) {
+ case 1:
+ if (Node.GetRightOffset())
+ nextOffset = Node.GetRightOffset();
+ break;
+
+ case 2:
+ if (Node.GetLeftOffset())
+ nextOffset = Node.GetLeftOffset();
+ break;
+
+ case 3:
+ if (Node.GetForwardOffset())
+ nextOffset = Node.GetForwardOffset();
+ break;
+
+ default:
+ break;
+ }
+
+ if (nextOffset && !mergedNodes.Contains(nextOffset))
+ return nextOffset;
+ }
+ return 0;
+ }
+ };
+
+ class TMergingReverseNodeEnumerator: public TReverseNodeEnumerator {
+ private:
+ bool Fresh;
+ TOpaqueTrie Trie;
+ const TOffsetMap& MergeMap;
+ TVector<TBranchPoint> Trace;
+ TOffsetDeltas OffsetIndex;
+
+ public:
+ TMergingReverseNodeEnumerator(const TOpaqueTrie& trie, const TOffsetMap& mergers)
+ : Fresh(true)
+ , Trie(trie)
+ , MergeMap(mergers)
+ {
+ }
+
+ bool Move() override {
+ if (Fresh) {
+ Trace.push_back(TBranchPoint(Trie.Data, 0, Trie.SkipFunction));
+ Fresh = false;
+ } else {
+ if (Trace.empty())
+ return false;
+
+ Trace.pop_back();
+
+ if (Trace.empty())
+ return false;
+ }
+
+ size_t nextnode = Trace.back().NextNode(MergeMap);
+
+ while (nextnode) {
+ Trace.push_back(TBranchPoint(Trie.Data, nextnode, Trie.SkipFunction));
+ nextnode = Trace.back().NextNode(MergeMap);
+ }
+
+ return (!Trace.empty());
+ }
+
+ const TNode& Get() const {
+ return Trace.back().Node;
+ }
+ size_t GetLeafLength() const override {
+ return Get().GetLeafLength();
+ }
+
+ // Returns recalculated offset from the end of the current node
+ size_t PrepareOffset(size_t absoffset, size_t minilength) {
+ if (!absoffset)
+ return NPOS;
+
+ if (MergeMap.Contains(absoffset))
+ absoffset = MergeMap.Get(absoffset);
+ return minilength - OffsetIndex.Get(Trie.Length - absoffset);
+ }
+
+ size_t RecreateNode(char* buffer, size_t resultLength) override {
+ TWriteableNode newNode(Get(), Trie.Data);
+ newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength);
+ newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength);
+ newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength);
+
+ if (!buffer)
+ return newNode.Measure();
+
+ const size_t len = newNode.Pack(buffer);
+ OffsetIndex.Add(Trie.Length - Get().GetOffset(), resultLength + len);
+
+ return len;
+ }
+ };
+
+ }
+
+ static void AddPiece(TPieceIndex& index, size_t offset, size_t len) {
+ index[len].push_back(offset);
+ }
+
+ static TOffsetMap FindEquivalentSubtries(const TOpaqueTrie& trie, bool verbose, size_t minMergeSize) {
+ // Tree nodes, arranged by span length.
+ // When all nodes of a given size are considered, they pop off the queue.
+ TPieceIndex subtries;
+ TOffsetMap merger;
+ // Start walking the trie from head.
+ AddPiece(subtries, 0, trie.Length);
+
+ size_t counter = 0;
+ // Now consider all nodes with sizeable continuations
+ for (size_t curlen = trie.Length; curlen >= minMergeSize && !subtries.empty(); curlen--) {
+ TPieceIndex::iterator iit = subtries.find(curlen);
+
+ if (iit == subtries.end())
+ continue; // fast forward to the next available length value
+
+ TOffsetList& batch = iit->second;
+ TPieceComparer comparer(trie.Data, curlen);
+ Sort(batch.begin(), batch.end(), comparer);
+
+ TOffsetList::iterator it = batch.begin();
+ while (it != batch.end()) {
+ if (verbose)
+ ShowProgress(++counter);
+
+ size_t offset = *it;
+
+ // Fill the array with the subnodes of the element
+ TNode node(trie.Data, offset, trie.SkipFunction);
+ size_t end = offset + curlen;
+ if (size_t rightOffset = node.GetRightOffset()) {
+ AddPiece(subtries, rightOffset, end - rightOffset);
+ end = rightOffset;
+ }
+ if (size_t leftOffset = node.GetLeftOffset()) {
+ AddPiece(subtries, leftOffset, end - leftOffset);
+ end = leftOffset;
+ }
+ if (size_t forwardOffset = node.GetForwardOffset()) {
+ AddPiece(subtries, forwardOffset, end - forwardOffset);
+ }
+
+ while (++it != batch.end()) {
+ // Find next different; until then, just add the offsets to the list of merged nodes.
+ size_t nextoffset = *it;
+
+ if (memcmp(trie.Data + offset, trie.Data + nextoffset, curlen))
+ break;
+
+ merger.Add(nextoffset, offset);
+ }
+ }
+
+ subtries.erase(curlen);
+ }
+ if (verbose) {
+ Cerr << counter << Endl;
+ }
+ return merger;
+ }
+
+ size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode) {
+ if (!trie.Data || !trie.Length) {
+ return 0;
+ }
+
+ TOffsetMap merger = FindEquivalentSubtries(trie, verbose, minMergeSize);
+ TMergingReverseNodeEnumerator enumerator(trie, merger);
+
+ if (mode == MM_DEFAULT)
+ return WriteTrieBackwards(os, enumerator, verbose);
+ else
+ return WriteTrieBackwardsNoAlloc(os, enumerator, trie, mode);
+ }
+
+}
diff --git a/library/cpp/containers/comptrie/minimize.h b/library/cpp/containers/comptrie/minimize.h
new file mode 100644
index 0000000000..baaa431d04
--- /dev/null
+++ b/library/cpp/containers/comptrie/minimize.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "leaf_skipper.h"
+#include <cstddef>
+
+class IOutputStream;
+
+namespace NCompactTrie {
+ size_t MeasureOffset(size_t offset);
+
+ enum EMinimizeMode {
+ MM_DEFAULT, // alollocate new memory for minimized tree
+ MM_NOALLOC, // minimize tree in the same buffer
+ MM_INPLACE // do not write tree to the stream, but move to the buffer beginning
+ };
+
+ // Return value: size of the minimized trie.
+ size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode);
+
+ // Return value: size of the minimized trie.
+ template <class TPacker>
+ size_t CompactTrieMinimizeImpl(IOutputStream& os, const char* data, size_t datalength, bool verbose, const TPacker* packer, EMinimizeMode mode) {
+ TPackerLeafSkipper<TPacker> skipper(packer);
+ size_t minmerge = MeasureOffset(datalength);
+ TOpaqueTrie trie(data, datalength, skipper);
+ return RawCompactTrieMinimizeImpl(os, trie, verbose, minmerge, mode);
+ }
+
+}
diff --git a/library/cpp/containers/comptrie/node.cpp b/library/cpp/containers/comptrie/node.cpp
new file mode 100644
index 0000000000..5fd22f15ec
--- /dev/null
+++ b/library/cpp/containers/comptrie/node.cpp
@@ -0,0 +1,79 @@
+#include "node.h"
+#include "leaf_skipper.h"
+#include "comptrie_impl.h"
+
+#include <util/system/yassert.h>
+#include <util/generic/yexception.h>
+
+namespace NCompactTrie {
+ TNode::TNode()
+ : Offset(0)
+ , LeafLength(0)
+ , CoreLength(0)
+ , Label(0)
+ {
+ for (auto& offset : Offsets) {
+ offset = 0;
+ }
+ }
+
+ // We believe that epsilon links are found only on the forward-position and that afer jumping an epsilon link you come to an ordinary node.
+
+ TNode::TNode(const char* data, size_t offset, const ILeafSkipper& skipFunction)
+ : Offset(offset)
+ , LeafLength(0)
+ , CoreLength(0)
+ , Label(0)
+ {
+ for (auto& anOffset : Offsets) {
+ anOffset = 0;
+ }
+ if (!data)
+ return; // empty constructor
+
+ const char* datapos = data + offset;
+ char flags = *(datapos++);
+ Y_ASSERT(!IsEpsilonLink(flags));
+ Label = *(datapos++);
+
+ size_t leftsize = LeftOffsetLen(flags);
+ size_t& leftOffset = Offsets[D_LEFT];
+ leftOffset = UnpackOffset(datapos, leftsize);
+ if (leftOffset) {
+ leftOffset += Offset;
+ }
+ datapos += leftsize;
+
+ size_t rightsize = RightOffsetLen(flags);
+ size_t& rightOffset = Offsets[D_RIGHT];
+ rightOffset = UnpackOffset(datapos, rightsize);
+ if (rightOffset) {
+ rightOffset += Offset;
+ }
+ datapos += rightsize;
+
+ if (flags & MT_FINAL) {
+ Offsets[D_FINAL] = datapos - data;
+ LeafLength = skipFunction.SkipLeaf(datapos);
+ }
+
+ CoreLength = 2 + leftsize + rightsize + LeafLength;
+ if (flags & MT_NEXT) {
+ size_t& forwardOffset = Offsets[D_NEXT];
+ forwardOffset = Offset + CoreLength;
+ // There might be an epsilon link at the forward position.
+ // If so, set the ForwardOffset to the value that points to the link's end.
+ const char* forwardPos = data + forwardOffset;
+ const char forwardFlags = *forwardPos;
+ if (IsEpsilonLink(forwardFlags)) {
+ // Jump through the epsilon link.
+ size_t epsilonOffset = UnpackOffset(forwardPos + 1, forwardFlags & MT_SIZEMASK);
+ if (!epsilonOffset) {
+ ythrow yexception() << "Corrupted epsilon link";
+ }
+ forwardOffset += epsilonOffset;
+ }
+ }
+ }
+
+}
diff --git a/library/cpp/containers/comptrie/node.h b/library/cpp/containers/comptrie/node.h
new file mode 100644
index 0000000000..d6f4317db0
--- /dev/null
+++ b/library/cpp/containers/comptrie/node.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include <cstddef>
+
+namespace NCompactTrie {
+ class ILeafSkipper;
+
+ enum TDirection {
+ D_LEFT,
+ D_FINAL,
+ D_NEXT,
+ D_RIGHT,
+ D_MAX
+ };
+
+ inline TDirection& operator++(TDirection& direction) {
+ direction = static_cast<TDirection>(direction + 1);
+ return direction;
+ }
+
+ inline TDirection& operator--(TDirection& direction) {
+ direction = static_cast<TDirection>(direction - 1);
+ return direction;
+ }
+
+ class TNode {
+ public:
+ TNode();
+ // Processes epsilon links and sets ForwardOffset to correct value. Assumes an epsilon link doesn't point to an epsilon link.
+ TNode(const char* data, size_t offset, const ILeafSkipper& skipFunction);
+
+ size_t GetOffset() const {
+ return Offset;
+ }
+
+ size_t GetLeafOffset() const {
+ return Offsets[D_FINAL];
+ }
+ size_t GetLeafLength() const {
+ return LeafLength;
+ }
+ size_t GetCoreLength() const {
+ return CoreLength;
+ }
+
+ size_t GetOffsetByDirection(TDirection direction) const {
+ return Offsets[direction];
+ }
+
+ size_t GetForwardOffset() const {
+ return Offsets[D_NEXT];
+ }
+ size_t GetLeftOffset() const {
+ return Offsets[D_LEFT];
+ }
+ size_t GetRightOffset() const {
+ return Offsets[D_RIGHT];
+ }
+ char GetLabel() const {
+ return Label;
+ }
+
+ bool IsFinal() const {
+ return GetLeafOffset() != 0;
+ }
+
+ bool HasEpsilonLinkForward() const {
+ return GetForwardOffset() > Offset + CoreLength;
+ }
+
+ private:
+ size_t Offsets[D_MAX];
+ size_t Offset;
+ size_t LeafLength;
+ size_t CoreLength;
+
+ char Label;
+ };
+
+}
diff --git a/library/cpp/containers/comptrie/opaque_trie_iterator.cpp b/library/cpp/containers/comptrie/opaque_trie_iterator.cpp
new file mode 100644
index 0000000000..5fd3914be6
--- /dev/null
+++ b/library/cpp/containers/comptrie/opaque_trie_iterator.cpp
@@ -0,0 +1,231 @@
+#include "opaque_trie_iterator.h"
+#include "comptrie_impl.h"
+#include "node.h"
+
+namespace NCompactTrie {
+ TOpaqueTrieIterator::TOpaqueTrieIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend,
+ size_t maxKeyLength)
+ : Trie(trie)
+ , EmptyValue(emptyValue)
+ , AtEmptyValue(emptyValue && !atend)
+ , MaxKeyLength(maxKeyLength)
+ {
+ if (!AtEmptyValue && !atend)
+ Forward();
+ }
+
+ bool TOpaqueTrieIterator::operator==(const TOpaqueTrieIterator& rhs) const {
+ return (Trie == rhs.Trie &&
+ Forks == rhs.Forks &&
+ EmptyValue == rhs.EmptyValue &&
+ AtEmptyValue == rhs.AtEmptyValue &&
+ MaxKeyLength == rhs.MaxKeyLength);
+ }
+
+ bool TOpaqueTrieIterator::HasMaxKeyLength() const {
+ return MaxKeyLength != size_t(-1) && MeasureNarrowKey() == MaxKeyLength;
+ }
+
+ bool TOpaqueTrieIterator::Forward() {
+ if (AtEmptyValue) {
+ AtEmptyValue = false;
+ bool res = Forward(); // TODO delete this after format change
+ if (res && MeasureNarrowKey() != 0) {
+ return res; // there was not "\0" key
+ }
+ // otherwise we are skipping "\0" key
+ }
+
+ if (!Trie.Length)
+ return false;
+
+ if (Forks.Empty()) {
+ TFork fork(Trie.Data, 0, Trie.Length, Trie.SkipFunction);
+ Forks.Push(fork);
+ } else {
+ TFork* topFork = &Forks.Top();
+ while (!topFork->NextDirection()) {
+ if (topFork->Node.GetOffset() >= Trie.Length)
+ return false;
+ Forks.Pop();
+ if (Forks.Empty())
+ return false;
+ topFork = &Forks.Top();
+ }
+ }
+
+ Y_ASSERT(!Forks.Empty());
+ while (Forks.Top().CurrentDirection != D_FINAL && !HasMaxKeyLength()) {
+ TFork nextFork = Forks.Top().NextFork(Trie.SkipFunction);
+ Forks.Push(nextFork);
+ }
+ TFork& top = Forks.Top();
+ static_assert(D_FINAL < D_NEXT, "relative order of NEXT and FINAL directions has changed");
+ if (HasMaxKeyLength() && top.CurrentDirection == D_FINAL && top.HasDirection(D_NEXT)) {
+ top.NextDirection();
+ }
+ return true;
+ }
+
+ bool TOpaqueTrieIterator::Backward() {
+ if (AtEmptyValue)
+ return false;
+
+ if (!Trie.Length) {
+ if (EmptyValue) {
+ // A trie that has only the empty value;
+ // we are not at the empty value, so move to it.
+ AtEmptyValue = true;
+ return true;
+ } else {
+ // Empty trie.
+ return false;
+ }
+ }
+
+ if (Forks.Empty()) {
+ TFork fork(Trie.Data, 0, Trie.Length, Trie.SkipFunction);
+ fork.LastDirection();
+ Forks.Push(fork);
+ } else {
+ TFork* topFork = &Forks.Top();
+ while (!topFork->PrevDirection()) {
+ if (topFork->Node.GetOffset() >= Trie.Length)
+ return false;
+ Forks.Pop();
+ if (!Forks.Empty()) {
+ topFork = &Forks.Top();
+ } else {
+ // When there are no more forks,
+ // we have to iterate over the empty value.
+ if (!EmptyValue)
+ return false;
+ AtEmptyValue = true;
+ return true;
+ }
+ }
+ }
+
+ Y_ASSERT(!Forks.Empty());
+ while (Forks.Top().CurrentDirection != D_FINAL && !HasMaxKeyLength()) {
+ TFork nextFork = Forks.Top().NextFork(Trie.SkipFunction);
+ nextFork.LastDirection();
+ Forks.Push(nextFork);
+ }
+ TFork& top = Forks.Top();
+ static_assert(D_FINAL < D_NEXT, "relative order of NEXT and FINAL directions has changed");
+ if (HasMaxKeyLength() && top.CurrentDirection == D_NEXT && top.HasDirection(D_FINAL)) {
+ top.PrevDirection();
+ }
+ if (MeasureNarrowKey() == 0) {
+ // This is the '\0' key, skip it and get to the EmptyValue.
+ AtEmptyValue = true;
+ Forks.Clear();
+ }
+ return true;
+ }
+
+ const char* TOpaqueTrieIterator::GetValuePtr() const {
+ if (!Forks.Empty()) {
+ const TFork& lastFork = Forks.Top();
+ Y_ASSERT(lastFork.Node.IsFinal() && lastFork.CurrentDirection == D_FINAL);
+ return Trie.Data + lastFork.GetValueOffset();
+ }
+ Y_ASSERT(AtEmptyValue);
+ return EmptyValue;
+ }
+
+ //-------------------------------------------------------------------------
+
+ TString TForkStack::GetKey() const {
+ if (HasEmptyKey()) {
+ return TString();
+ }
+
+ TString result(Key);
+ if (TopHasLabelInKey()) {
+ result.append(Top().GetLabel());
+ }
+ return result;
+ }
+
+ bool TForkStack::HasEmptyKey() const {
+ // Special case: if we get a single zero label, treat it as an empty key
+ // TODO delete this after format change
+ if (TopHasLabelInKey()) {
+ return Key.size() == 0 && Top().GetLabel() == '\0';
+ } else {
+ return Key.size() == 1 && Key[0] == '\0';
+ }
+ }
+
+ size_t TForkStack::MeasureKey() const {
+ size_t result = Key.size() + (TopHasLabelInKey() ? 1 : 0);
+ if (result == 1 && HasEmptyKey()) {
+ return 0;
+ }
+ return result;
+ }
+
+ //-------------------------------------------------------------------------
+
+ TFork::TFork(const char* data, size_t offset, size_t limit, const ILeafSkipper& skipper)
+ : Node(data, offset, skipper)
+ , Data(data)
+ , Limit(limit)
+ , CurrentDirection(TDirection(0))
+ {
+#if COMPTRIE_DATA_CHECK
+ if (Node.GetOffset() >= Limit - 1)
+ ythrow yexception() << "gone beyond the limit, data is corrupted";
+#endif
+ while (CurrentDirection < D_MAX && !HasDirection(CurrentDirection)) {
+ ++CurrentDirection;
+ }
+ }
+
+ bool TFork::operator==(const TFork& rhs) const {
+ return (Data == rhs.Data &&
+ Node.GetOffset() == rhs.Node.GetOffset() &&
+ CurrentDirection == rhs.CurrentDirection);
+ }
+
+ inline bool TFork::NextDirection() {
+ do {
+ ++CurrentDirection;
+ } while (CurrentDirection < D_MAX && !HasDirection(CurrentDirection));
+ return CurrentDirection < D_MAX;
+ }
+
+ inline bool TFork::PrevDirection() {
+ if (CurrentDirection == TDirection(0)) {
+ return false;
+ }
+ do {
+ --CurrentDirection;
+ } while (CurrentDirection > 0 && !HasDirection(CurrentDirection));
+ return HasDirection(CurrentDirection);
+ }
+
+ void TFork::LastDirection() {
+ CurrentDirection = D_MAX;
+ PrevDirection();
+ }
+
+ bool TFork::SetDirection(TDirection direction) {
+ if (!HasDirection(direction)) {
+ return false;
+ }
+ CurrentDirection = direction;
+ return true;
+ }
+
+ char TFork::GetLabel() const {
+ return Node.GetLabel();
+ }
+
+ size_t TFork::GetValueOffset() const {
+ return Node.GetLeafOffset();
+ }
+
+}
diff --git a/library/cpp/containers/comptrie/opaque_trie_iterator.h b/library/cpp/containers/comptrie/opaque_trie_iterator.h
new file mode 100644
index 0000000000..195da3c191
--- /dev/null
+++ b/library/cpp/containers/comptrie/opaque_trie_iterator.h
@@ -0,0 +1,266 @@
+#pragma once
+
+#include "comptrie_impl.h"
+#include "node.h"
+#include "key_selector.h"
+#include "leaf_skipper.h"
+
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+
+namespace NCompactTrie {
+ class ILeafSkipper;
+
+ class TFork { // Auxiliary class for a branching point in the iterator
+ public:
+ TNode Node;
+ const char* Data;
+ size_t Limit; // valid data is in range [Data + Node.GetOffset(), Data + Limit)
+ TDirection CurrentDirection;
+
+ public:
+ TFork(const char* data, size_t offset, size_t limit, const ILeafSkipper& skipper);
+
+ bool operator==(const TFork& rhs) const;
+
+ bool HasLabelInKey() const {
+ return CurrentDirection == D_NEXT || CurrentDirection == D_FINAL;
+ }
+
+ bool NextDirection();
+ bool PrevDirection();
+ void LastDirection();
+
+ bool HasDirection(TDirection direction) const {
+ return Node.GetOffsetByDirection(direction);
+ }
+ // If the fork doesn't have the specified direction,
+ // leaves the direction intact and returns false.
+ // Otherwise returns true.
+ bool SetDirection(TDirection direction);
+ TFork NextFork(const ILeafSkipper& skipper) const;
+
+ char GetLabel() const;
+ size_t GetValueOffset() const;
+ };
+
+ inline TFork TFork::NextFork(const ILeafSkipper& skipper) const {
+ Y_ASSERT(CurrentDirection != D_FINAL);
+ size_t offset = Node.GetOffsetByDirection(CurrentDirection);
+ return TFork(Data, offset, Limit, skipper);
+ }
+
+ //------------------------------------------------------------------------------------------------
+ class TForkStack {
+ public:
+ void Push(const TFork& fork) {
+ if (TopHasLabelInKey()) {
+ Key.push_back(Top().GetLabel());
+ }
+ Forks.push_back(fork);
+ }
+
+ void Pop() {
+ Forks.pop_back();
+ if (TopHasLabelInKey()) {
+ Key.pop_back();
+ }
+ }
+
+ TFork& Top() {
+ return Forks.back();
+ }
+ const TFork& Top() const {
+ return Forks.back();
+ }
+
+ bool Empty() const {
+ return Forks.empty();
+ }
+
+ void Clear() {
+ Forks.clear();
+ Key.clear();
+ }
+
+ bool operator==(const TForkStack& other) const {
+ return Forks == other.Forks;
+ }
+ bool operator!=(const TForkStack& other) const {
+ return !(*this == other);
+ }
+
+ TString GetKey() const;
+ size_t MeasureKey() const;
+
+ private:
+ TVector<TFork> Forks;
+ TString Key;
+
+ private:
+ bool TopHasLabelInKey() const {
+ return !Empty() && Top().HasLabelInKey();
+ }
+ bool HasEmptyKey() const;
+ };
+
+ //------------------------------------------------------------------------------------------------
+
+ template <class TSymbol>
+ struct TConvertRawKey {
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
+ static TKey Get(const TString& rawkey) {
+ TKey result;
+ const size_t sz = rawkey.size();
+ result.reserve(sz / sizeof(TSymbol));
+ for (size_t i = 0; i < sz; i += sizeof(TSymbol)) {
+ TSymbol sym = 0;
+ for (size_t j = 0; j < sizeof(TSymbol); j++) {
+ if (sizeof(TSymbol) <= 1)
+ sym = 0;
+ else
+ sym <<= 8;
+ if (i + j < sz)
+ sym |= TSymbol(0x00FF & rawkey[i + j]);
+ }
+ result.push_back(sym);
+ }
+ return result;
+ }
+
+ static size_t Size(size_t rawsize) {
+ return rawsize / sizeof(TSymbol);
+ }
+ };
+
+ template <>
+ struct TConvertRawKey<char> {
+ static TString Get(const TString& rawkey) {
+ return rawkey;
+ }
+
+ static size_t Size(size_t rawsize) {
+ return rawsize;
+ }
+ };
+
+ //------------------------------------------------------------------------------------------------
+ class TOpaqueTrieIterator { // Iterator stuff. Stores a stack of visited forks.
+ public:
+ TOpaqueTrieIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend,
+ size_t maxKeyLength = size_t(-1));
+
+ bool operator==(const TOpaqueTrieIterator& rhs) const;
+ bool operator!=(const TOpaqueTrieIterator& rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool Forward();
+ bool Backward();
+
+ template <class TSymbol>
+ bool UpperBound(const typename TCompactTrieKeySelector<TSymbol>::TKeyBuf& key); // True if matched exactly.
+
+ template <class TSymbol>
+ typename TCompactTrieKeySelector<TSymbol>::TKey GetKey() const {
+ return TConvertRawKey<TSymbol>::Get(GetNarrowKey());
+ }
+
+ template <class TSymbol>
+ size_t MeasureKey() const {
+ return TConvertRawKey<TSymbol>::Size(MeasureNarrowKey());
+ }
+
+ TString GetNarrowKey() const {
+ return Forks.GetKey();
+ }
+ size_t MeasureNarrowKey() const {
+ return Forks.MeasureKey();
+ }
+
+ const char* GetValuePtr() const; // 0 if none
+ const TNode& GetNode() const { // Could be called for non-empty key and not AtEnd.
+ return Forks.Top().Node;
+ }
+ const TOpaqueTrie& GetTrie() const {
+ return Trie;
+ }
+
+ private:
+ TOpaqueTrie Trie;
+ TForkStack Forks;
+ const char* const EmptyValue;
+ bool AtEmptyValue;
+ const size_t MaxKeyLength;
+
+ private:
+ bool HasMaxKeyLength() const;
+
+ template <class TSymbol>
+ int LongestPrefix(const typename TCompactTrieKeySelector<TSymbol>::TKeyBuf& key); // Used in UpperBound.
+ };
+
+ template <class TSymbol>
+ int TOpaqueTrieIterator::LongestPrefix(const typename TCompactTrieKeySelector<TSymbol>::TKeyBuf& key) {
+ Forks.Clear();
+ TFork next(Trie.Data, 0, Trie.Length, Trie.SkipFunction);
+ for (size_t i = 0; i < key.size(); i++) {
+ TSymbol symbol = key[i];
+ const bool isLastSymbol = (i + 1 == key.size());
+ for (i64 shift = (i64)NCompactTrie::ExtraBits<TSymbol>(); shift >= 0; shift -= 8) {
+ const unsigned char label = (unsigned char)(symbol >> shift);
+ const bool isLastByte = (isLastSymbol && shift == 0);
+ do {
+ Forks.Push(next);
+ TFork& top = Forks.Top();
+ if (label < (unsigned char)top.GetLabel()) {
+ if (!top.SetDirection(D_LEFT))
+ return 1;
+ } else if (label > (unsigned char)top.GetLabel()) {
+ if (!top.SetDirection(D_RIGHT)) {
+ Forks.Pop(); // We don't pass this fork on the way to the upper bound.
+ return -1;
+ }
+ } else if (isLastByte) { // Here and below label == top.GetLabel().
+ if (top.SetDirection(D_FINAL)) {
+ return 0; // Skip the NextFork() call at the end of the cycle.
+ } else {
+ top.SetDirection(D_NEXT);
+ return 1;
+ }
+ } else if (!top.SetDirection(D_NEXT)) {
+ top.SetDirection(D_FINAL);
+ return -1;
+ }
+ next = top.NextFork(Trie.SkipFunction);
+ } while (Forks.Top().CurrentDirection != D_NEXT); // Proceed to the next byte.
+ }
+ }
+ // We get here only if the key was empty.
+ Forks.Push(next);
+ return 1;
+ }
+
+ template <class TSymbol>
+ bool TOpaqueTrieIterator::UpperBound(const typename TCompactTrieKeySelector<TSymbol>::TKeyBuf& key) {
+ Forks.Clear();
+ if (key.empty() && EmptyValue) {
+ AtEmptyValue = true;
+ return true;
+ } else {
+ AtEmptyValue = false;
+ }
+ const int defect = LongestPrefix<TSymbol>(key);
+ if (defect > 0) {
+ // Continue the constructed forks with the smallest key possible.
+ while (Forks.Top().CurrentDirection != D_FINAL) {
+ TFork next = Forks.Top().NextFork(Trie.SkipFunction);
+ Forks.Push(next);
+ }
+ } else if (defect < 0) {
+ Forward();
+ }
+ return defect == 0;
+ }
+
+}
diff --git a/library/cpp/containers/comptrie/pattern_searcher.h b/library/cpp/containers/comptrie/pattern_searcher.h
new file mode 100644
index 0000000000..caab51dc1c
--- /dev/null
+++ b/library/cpp/containers/comptrie/pattern_searcher.h
@@ -0,0 +1,606 @@
+#pragma once
+
+#include "comptrie_builder.h"
+#include "comptrie_trie.h"
+#include "comptrie_impl.h"
+#include <library/cpp/packers/packers.h>
+
+#include <util/system/yassert.h>
+#include <util/generic/vector.h>
+#include <util/generic/deque.h>
+#include <util/stream/str.h>
+
+// Aho-Corasick algorithm implementation using CompactTrie implementation of Sedgewick's T-trie
+
+namespace NCompactTrie {
+ struct TSuffixLink {
+ ui64 NextSuffixOffset;
+ ui64 NextSuffixWithDataOffset;
+
+ TSuffixLink(ui64 nextSuffixOffset = 0, ui64 nextSuffixWithDataOffset = 0)
+ : NextSuffixOffset(nextSuffixOffset)
+ , NextSuffixWithDataOffset(nextSuffixWithDataOffset)
+ {
+ }
+ };
+
+ const size_t FLAGS_SIZE = sizeof(char);
+ const size_t SYMBOL_SIZE = sizeof(char);
+};
+
+template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
+class TCompactPatternSearcherBuilder : protected TCompactTrieBuilder<T, D, S> {
+public:
+ typedef T TSymbol;
+ typedef D TData;
+ typedef S TPacker;
+
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
+
+ typedef TCompactTrieBuilder<T, D, S> TBase;
+
+public:
+ TCompactPatternSearcherBuilder() {
+ TBase::Impl = MakeHolder<TCompactPatternSearcherBuilderImpl>();
+ }
+
+ bool Add(const TSymbol* key, size_t keyLength, const TData& value) {
+ return TBase::Impl->AddEntry(key, keyLength, value);
+ }
+ bool Add(const TKeyBuf& key, const TData& value) {
+ return Add(key.data(), key.size(), value);
+ }
+
+ bool Find(const TSymbol* key, size_t keyLength, TData* value) const {
+ return TBase::Impl->FindEntry(key, keyLength, value);
+ }
+ bool Find(const TKeyBuf& key, TData* value = nullptr) const {
+ return Find(key.data(), key.size(), value);
+ }
+
+ size_t Save(IOutputStream& os) const {
+ size_t trieSize = TBase::Impl->MeasureByteSize();
+ TBufferOutput serializedTrie(trieSize);
+ TBase::Impl->Save(serializedTrie);
+
+ auto serializedTrieBuffer = serializedTrie.Buffer();
+ CalculateSuffixLinks(
+ serializedTrieBuffer.Data(),
+ serializedTrieBuffer.Data() + serializedTrieBuffer.Size()
+ );
+
+ os.Write(serializedTrieBuffer.Data(), serializedTrieBuffer.Size());
+ return trieSize;
+ }
+
+ TBlob Save() const {
+ TBufferStream buffer;
+ Save(buffer);
+ return TBlob::FromStream(buffer);
+ }
+
+ size_t SaveToFile(const TString& fileName) const {
+ TFileOutput out(fileName);
+ return Save(out);
+ }
+
+ size_t MeasureByteSize() const {
+ return TBase::Impl->MeasureByteSize();
+ }
+
+private:
+ void CalculateSuffixLinks(char* trieStart, const char* trieEnd) const;
+
+protected:
+ class TCompactPatternSearcherBuilderImpl : public TBase::TCompactTrieBuilderImpl {
+ public:
+ typedef typename TBase::TCompactTrieBuilderImpl TImplBase;
+
+ TCompactPatternSearcherBuilderImpl(
+ TCompactTrieBuilderFlags flags = CTBF_NONE,
+ TPacker packer = TPacker(),
+ IAllocator* alloc = TDefaultAllocator::Instance()
+ ) : TImplBase(flags, packer, alloc) {
+ }
+
+ ui64 ArcMeasure(
+ const typename TImplBase::TArc* arc,
+ size_t leftSize,
+ size_t rightSize
+ ) const override {
+ using namespace NCompactTrie;
+
+ size_t coreSize = SYMBOL_SIZE + FLAGS_SIZE +
+ sizeof(TSuffixLink) +
+ this->NodeMeasureLeafValue(arc->Node);
+ size_t treeSize = this->NodeMeasureSubtree(arc->Node);
+
+ if (arc->Label.Length() > 0)
+ treeSize += (SYMBOL_SIZE + FLAGS_SIZE + sizeof(TSuffixLink)) *
+ (arc->Label.Length() - 1);
+
+ // Triple measurements are needed because the space needed to store the offset
+ // shall be added to the offset itself. Hence three iterations.
+ size_t leftOffsetSize = 0;
+ size_t rightOffsetSize = 0;
+ for (size_t iteration = 0; iteration < 3; ++iteration) {
+ leftOffsetSize = leftSize ? MeasureOffset(
+ coreSize + treeSize + leftOffsetSize + rightOffsetSize) : 0;
+ rightOffsetSize = rightSize ? MeasureOffset(
+ coreSize + treeSize + leftSize + leftOffsetSize + rightOffsetSize) : 0;
+ }
+
+ coreSize += leftOffsetSize + rightOffsetSize;
+ arc->LeftOffset = leftSize ? coreSize + treeSize : 0;
+ arc->RightOffset = rightSize ? coreSize + treeSize + leftSize : 0;
+
+ return coreSize + treeSize + leftSize + rightSize;
+ }
+
+ ui64 ArcSaveSelf(const typename TImplBase::TArc* arc, IOutputStream& os) const override {
+ using namespace NCompactTrie;
+
+ ui64 written = 0;
+
+ size_t leftOffsetSize = MeasureOffset(arc->LeftOffset);
+ size_t rightOffsetSize = MeasureOffset(arc->RightOffset);
+
+ size_t labelLen = arc->Label.Length();
+
+ for (size_t labelPos = 0; labelPos < labelLen; ++labelPos) {
+ char flags = 0;
+
+ if (labelPos == 0) {
+ flags |= (leftOffsetSize << MT_LEFTSHIFT);
+ flags |= (rightOffsetSize << MT_RIGHTSHIFT);
+ }
+
+ if (labelPos == labelLen - 1) {
+ if (arc->Node->IsFinal())
+ flags |= MT_FINAL;
+ if (!arc->Node->IsLast())
+ flags |= MT_NEXT;
+ } else {
+ flags |= MT_NEXT;
+ }
+
+ os.Write(&flags, 1);
+ os.Write(&arc->Label.AsCharPtr()[labelPos], 1);
+ written += 2;
+
+ TSuffixLink suffixlink;
+ os.Write(&suffixlink, sizeof(TSuffixLink));
+ written += sizeof(TSuffixLink);
+
+ if (labelPos == 0) {
+ written += ArcSaveOffset(arc->LeftOffset, os);
+ written += ArcSaveOffset(arc->RightOffset, os);
+ }
+ }
+
+ written += this->NodeSaveLeafValue(arc->Node, os);
+ return written;
+ }
+ };
+};
+
+
+template <class T>
+struct TPatternMatch {
+ ui64 End;
+ T Data;
+
+ TPatternMatch(ui64 end, const T& data)
+ : End(end)
+ , Data(data)
+ {
+ }
+};
+
+
+template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
+class TCompactPatternSearcher {
+public:
+ typedef T TSymbol;
+ typedef D TData;
+ typedef S TPacker;
+
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
+ typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
+
+ typedef TCompactTrie<TSymbol, TData, TPacker> TTrie;
+public:
+ TCompactPatternSearcher()
+ {
+ }
+
+ explicit TCompactPatternSearcher(const TBlob& data)
+ : Trie(data)
+ {
+ }
+
+ TCompactPatternSearcher(const char* data, size_t size)
+ : Trie(data, size)
+ {
+ }
+
+ TVector<TPatternMatch<TData>> SearchMatches(const TSymbol* text, size_t textSize) const;
+ TVector<TPatternMatch<TData>> SearchMatches(const TKeyBuf& text) const {
+ return SearchMatches(text.data(), text.size());
+ }
+private:
+ TTrie Trie;
+};
+
+////////////////////
+// Implementation //
+////////////////////
+
+namespace {
+
+template <class TData, class TPacker>
+char ReadNode(
+ char* nodeStart,
+ char*& leftSibling,
+ char*& rightSibling,
+ char*& directChild,
+ NCompactTrie::TSuffixLink*& suffixLink,
+ TPacker packer = TPacker()
+) {
+ char* dataPos = nodeStart;
+ char flags = *(dataPos++);
+
+ Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed
+
+ char label = *(dataPos++);
+
+ suffixLink = (NCompactTrie::TSuffixLink*)dataPos;
+ dataPos += sizeof(NCompactTrie::TSuffixLink);
+
+ { // Left branch
+ size_t offsetLength = NCompactTrie::LeftOffsetLen(flags);
+ size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
+ leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr;
+
+ dataPos += offsetLength;
+ }
+
+
+ { // Right branch
+ size_t offsetLength = NCompactTrie::RightOffsetLen(flags);
+ size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
+ rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr;
+
+ dataPos += offsetLength;
+ }
+
+ directChild = nullptr;
+ if (flags & NCompactTrie::MT_NEXT) {
+ directChild = dataPos;
+ if (flags & NCompactTrie::MT_FINAL) {
+ directChild += packer.SkipLeaf(directChild);
+ }
+ }
+
+ return label;
+}
+
+template <class TData, class TPacker>
+char ReadNodeConst(
+ const char* nodeStart,
+ const char*& leftSibling,
+ const char*& rightSibling,
+ const char*& directChild,
+ const char*& data,
+ NCompactTrie::TSuffixLink& suffixLink,
+ TPacker packer = TPacker()
+) {
+ const char* dataPos = nodeStart;
+ char flags = *(dataPos++);
+
+ Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed
+
+ char label = *(dataPos++);
+
+ suffixLink = *((NCompactTrie::TSuffixLink*)dataPos);
+ dataPos += sizeof(NCompactTrie::TSuffixLink);
+
+ { // Left branch
+ size_t offsetLength = NCompactTrie::LeftOffsetLen(flags);
+ size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
+ leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr;
+
+ dataPos += offsetLength;
+ }
+
+
+ { // Right branch
+ size_t offsetLength = NCompactTrie::RightOffsetLen(flags);
+ size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
+ rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr;
+
+ dataPos += offsetLength;
+ }
+
+ data = nullptr;
+ if (flags & NCompactTrie::MT_FINAL) {
+ data = dataPos;
+ }
+ directChild = nullptr;
+ if (flags & NCompactTrie::MT_NEXT) {
+ directChild = dataPos;
+ if (flags & NCompactTrie::MT_FINAL) {
+ directChild += packer.SkipLeaf(directChild);
+ }
+ }
+
+ return label;
+}
+
+Y_FORCE_INLINE bool Advance(
+ const char*& dataPos,
+ const char* const dataEnd,
+ char label
+) {
+ if (dataPos == nullptr) {
+ return false;
+ }
+
+ while (dataPos < dataEnd) {
+ size_t offsetLength, offset;
+ const char* startPos = dataPos;
+ char flags = *(dataPos++);
+ char symbol = *(dataPos++);
+ dataPos += sizeof(NCompactTrie::TSuffixLink);
+
+ // Left branch
+ offsetLength = NCompactTrie::LeftOffsetLen(flags);
+ if ((unsigned char)label < (unsigned char)symbol) {
+ offset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
+ if (!offset)
+ break;
+
+ dataPos = startPos + offset;
+ continue;
+ }
+
+ dataPos += offsetLength;
+
+ // Right branch
+ offsetLength = NCompactTrie::RightOffsetLen(flags);
+ if ((unsigned char)label > (unsigned char)symbol) {
+ offset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
+ if (!offset)
+ break;
+
+ dataPos = startPos + offset;
+ continue;
+ }
+
+ dataPos = startPos;
+ return true;
+ }
+
+ // if we got here, we're past the dataend - bail out ASAP
+ dataPos = nullptr;
+ return false;
+}
+
+} // anonymous
+
+template <class T, class D, class S>
+void TCompactPatternSearcherBuilder<T, D, S>::CalculateSuffixLinks(
+ char* trieStart,
+ const char* trieEnd
+) const {
+ struct TBfsElement {
+ char* Node;
+ const char* Parent;
+
+ TBfsElement(char* node, const char* parent)
+ : Node(node)
+ , Parent(parent)
+ {
+ }
+ };
+
+ TDeque<TBfsElement> bfsQueue;
+ if (trieStart && trieStart != trieEnd) {
+ bfsQueue.emplace_back(trieStart, nullptr);
+ }
+
+ while (!bfsQueue.empty()) {
+ auto front = bfsQueue.front();
+ char* node = front.Node;
+ const char* parent = front.Parent;
+ bfsQueue.pop_front();
+
+ char* leftSibling;
+ char* rightSibling;
+ char* directChild;
+ NCompactTrie::TSuffixLink* suffixLink;
+
+ char label = ReadNode<TData, TPacker>(
+ node,
+ leftSibling,
+ rightSibling,
+ directChild,
+ suffixLink
+ );
+
+ const char* suffix;
+
+ if (parent == nullptr) {
+ suffix = node;
+ } else {
+ const char* parentOfSuffix = parent;
+ const char* temp;
+ do {
+ NCompactTrie::TSuffixLink parentOfSuffixSuffixLink;
+
+ ReadNodeConst<TData, TPacker>(
+ parentOfSuffix,
+ /*left*/temp,
+ /*right*/temp,
+ /*direct*/temp,
+ /*data*/temp,
+ parentOfSuffixSuffixLink
+ );
+ if (parentOfSuffixSuffixLink.NextSuffixOffset == 0) {
+ suffix = trieStart;
+ if (!Advance(suffix, trieEnd, label)) {
+ suffix = node;
+ }
+ break;
+ }
+ parentOfSuffix += parentOfSuffixSuffixLink.NextSuffixOffset;
+
+ NCompactTrie::TSuffixLink tempSuffixLink;
+ ReadNodeConst<TData, TPacker>(
+ parentOfSuffix,
+ /*left*/temp,
+ /*right*/temp,
+ /*direct*/suffix,
+ /*data*/temp,
+ tempSuffixLink
+ );
+
+ if (suffix == nullptr) {
+ continue;
+ }
+ } while (!Advance(suffix, trieEnd, label));
+ }
+
+ suffixLink->NextSuffixOffset = suffix - node;
+
+ NCompactTrie::TSuffixLink suffixSuffixLink;
+ const char* suffixData;
+ const char* temp;
+ ReadNodeConst<TData, TPacker>(
+ suffix,
+ /*left*/temp,
+ /*right*/temp,
+ /*direct*/temp,
+ suffixData,
+ suffixSuffixLink
+ );
+ suffixLink->NextSuffixWithDataOffset = suffix - node;
+ if (suffixData == nullptr) {
+ suffixLink->NextSuffixWithDataOffset += suffixSuffixLink.NextSuffixWithDataOffset;
+ }
+
+ if (directChild) {
+ bfsQueue.emplace_back(directChild, node);
+ }
+
+ if (leftSibling) {
+ bfsQueue.emplace_front(leftSibling, parent);
+ }
+
+ if (rightSibling) {
+ bfsQueue.emplace_front(rightSibling, parent);
+ }
+ }
+}
+
+
+template<class T, class D, class S>
+TVector<TPatternMatch<D>> TCompactPatternSearcher<T, D, S>::SearchMatches(
+ const TSymbol* text,
+ size_t textSize
+) const {
+ const char* temp;
+ NCompactTrie::TSuffixLink tempSuffixLink;
+
+ const auto& trieData = Trie.Data();
+ const char* trieStart = trieData.AsCharPtr();
+ size_t dataSize = trieData.Length();
+ const char* trieEnd = trieStart + dataSize;
+
+ const char* lastNode = nullptr;
+ const char* currentSubtree = trieStart;
+
+ TVector<TPatternMatch<TData>> matches;
+
+ for (const TSymbol* position = text; position < text + textSize; ++position) {
+ TSymbol symbol = *position;
+ for (i64 i = (i64)NCompactTrie::ExtraBits<TSymbol>(); i >= 0; i -= 8) {
+ char label = (char)(symbol >> i);
+
+ // Find first suffix extendable by label
+ while (true) {
+ const char* nextLastNode = currentSubtree;
+ if (Advance(nextLastNode, trieEnd, label)) {
+ lastNode = nextLastNode;
+ ReadNodeConst<TData, TPacker>(
+ lastNode,
+ /*left*/temp,
+ /*right*/temp,
+ currentSubtree,
+ /*data*/temp,
+ tempSuffixLink
+ );
+ break;
+ } else {
+ if (lastNode == nullptr) {
+ break;
+ }
+ }
+
+ NCompactTrie::TSuffixLink suffixLink;
+ ReadNodeConst<TData, TPacker>(
+ lastNode,
+ /*left*/temp,
+ /*right*/temp,
+ /*direct*/temp,
+ /*data*/temp,
+ suffixLink
+ );
+ if (suffixLink.NextSuffixOffset == 0) {
+ lastNode = nullptr;
+ currentSubtree = trieStart;
+ continue;
+ }
+ lastNode += suffixLink.NextSuffixOffset;
+ ReadNodeConst<TData, TPacker>(
+ lastNode,
+ /*left*/temp,
+ /*right*/temp,
+ currentSubtree,
+ /*data*/temp,
+ tempSuffixLink
+ );
+ }
+
+ // Iterate through all suffixes
+ const char* suffix = lastNode;
+ while (suffix != nullptr) {
+ const char* nodeData;
+ NCompactTrie::TSuffixLink suffixLink;
+ ReadNodeConst<TData, TPacker>(
+ suffix,
+ /*left*/temp,
+ /*right*/temp,
+ /*direct*/temp,
+ nodeData,
+ suffixLink
+ );
+ if (nodeData != nullptr) {
+ TData data;
+ Trie.GetPacker().UnpackLeaf(nodeData, data);
+ matches.emplace_back(
+ position - text,
+ data
+ );
+ }
+ if (suffixLink.NextSuffixOffset == 0) {
+ break;
+ }
+ suffix += suffixLink.NextSuffixWithDataOffset;
+ }
+ }
+ }
+
+ return matches;
+}
diff --git a/library/cpp/containers/comptrie/prefix_iterator.cpp b/library/cpp/containers/comptrie/prefix_iterator.cpp
new file mode 100644
index 0000000000..5d4dfa3500
--- /dev/null
+++ b/library/cpp/containers/comptrie/prefix_iterator.cpp
@@ -0,0 +1 @@
+#include "prefix_iterator.h"
diff --git a/library/cpp/containers/comptrie/prefix_iterator.h b/library/cpp/containers/comptrie/prefix_iterator.h
new file mode 100644
index 0000000000..b369bb4f42
--- /dev/null
+++ b/library/cpp/containers/comptrie/prefix_iterator.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#include "comptrie_trie.h"
+
+// Iterates over all prefixes of the given key in the trie.
+template <class TTrie>
+class TPrefixIterator {
+public:
+ using TSymbol = typename TTrie::TSymbol;
+ using TPacker = typename TTrie::TPacker;
+ using TData = typename TTrie::TData;
+
+private:
+ const TTrie& Trie;
+ const TSymbol* key;
+ size_t keylen;
+ const TSymbol* keyend;
+ size_t prefixLen;
+ const char* valuepos;
+ const char* datapos;
+ const char* dataend;
+ TPacker Packer;
+ const char* EmptyValue;
+ bool result;
+
+ bool Next();
+
+public:
+ TPrefixIterator(const TTrie& trie, const TSymbol* aKey, size_t aKeylen)
+ : Trie(trie)
+ , key(aKey)
+ , keylen(aKeylen)
+ , keyend(aKey + aKeylen)
+ , prefixLen(0)
+ , valuepos(nullptr)
+ , datapos(trie.DataHolder.AsCharPtr())
+ , dataend(datapos + trie.DataHolder.Length())
+ {
+ result = Next();
+ }
+
+ operator bool() const {
+ return result;
+ }
+
+ TPrefixIterator& operator++() {
+ result = Next();
+ return *this;
+ }
+
+ size_t GetPrefixLen() const {
+ return prefixLen;
+ }
+
+ void GetValue(TData& to) const {
+ Trie.Packer.UnpackLeaf(valuepos, to);
+ }
+};
+
+template <class TTrie>
+bool TPrefixIterator<TTrie>::Next() {
+ using namespace NCompactTrie;
+ if (!key || datapos == dataend)
+ return false;
+
+ if ((key == keyend - keylen) && !valuepos && Trie.EmptyValue) {
+ valuepos = Trie.EmptyValue;
+ return true;
+ }
+
+ while (datapos && key != keyend) {
+ TSymbol label = *(key++);
+ if (!Advance(datapos, dataend, valuepos, label, Packer)) {
+ return false;
+ }
+ if (valuepos) { // There is a value at the end of this symbol.
+ prefixLen = keylen - (keyend - key);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+template <class TTrie>
+TPrefixIterator<TTrie> MakePrefixIterator(const TTrie& trie, const typename TTrie::TSymbol* key, size_t keylen) {
+ return TPrefixIterator<TTrie>(trie, key, keylen);
+}
diff --git a/library/cpp/containers/comptrie/protopacker.h b/library/cpp/containers/comptrie/protopacker.h
new file mode 100644
index 0000000000..3e15866dc5
--- /dev/null
+++ b/library/cpp/containers/comptrie/protopacker.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <util/stream/mem.h>
+#include <util/ysaveload.h>
+
+template <class Proto>
+class TProtoPacker {
+public:
+ TProtoPacker() = default;
+
+ void UnpackLeaf(const char* p, Proto& entry) const {
+ TMemoryInput in(p + sizeof(ui32), SkipLeaf(p) - sizeof(ui32));
+ entry.ParseFromArcadiaStream(&in);
+ }
+ void PackLeaf(char* p, const Proto& entry, size_t size) const {
+ TMemoryOutput out(p, size + sizeof(ui32));
+ Save<ui32>(&out, size);
+ entry.SerializeToArcadiaStream(&out);
+ }
+ size_t MeasureLeaf(const Proto& entry) const {
+ return entry.ByteSize() + sizeof(ui32);
+ }
+ size_t SkipLeaf(const char* p) const {
+ TMemoryInput in(p, sizeof(ui32));
+ ui32 size;
+ Load<ui32>(&in, size);
+ return size;
+ }
+};
diff --git a/library/cpp/containers/comptrie/search_iterator.cpp b/library/cpp/containers/comptrie/search_iterator.cpp
new file mode 100644
index 0000000000..eb91523574
--- /dev/null
+++ b/library/cpp/containers/comptrie/search_iterator.cpp
@@ -0,0 +1 @@
+#include "search_iterator.h"
diff --git a/library/cpp/containers/comptrie/search_iterator.h b/library/cpp/containers/comptrie/search_iterator.h
new file mode 100644
index 0000000000..247f7e5936
--- /dev/null
+++ b/library/cpp/containers/comptrie/search_iterator.h
@@ -0,0 +1,140 @@
+#pragma once
+
+#include "comptrie_trie.h"
+#include "first_symbol_iterator.h"
+
+#include <util/str_stl.h>
+#include <util/digest/numeric.h>
+#include <util/digest/multi.h>
+
+// Iterator for incremental searching.
+// All Advance() methods shift the iterator using specifed key/char.
+// The subsequent Advance() call starts searching from the previous state.
+// The Advance() returns 'true' if specified key part exists in the trie and
+// returns 'false' for unsuccessful search. In case of 'false' result
+// all subsequent calls also will return 'false'.
+// If current iterator state is final then GetValue() method returns 'true' and
+// associated value.
+
+template <class TTrie>
+class TSearchIterator {
+public:
+ using TData = typename TTrie::TData;
+ using TSymbol = typename TTrie::TSymbol;
+ using TKeyBuf = typename TTrie::TKeyBuf;
+
+ TSearchIterator() = default;
+
+ explicit TSearchIterator(const TTrie& trie)
+ : Trie(&trie)
+ , DataPos(trie.DataHolder.AsCharPtr())
+ , DataEnd(DataPos + trie.DataHolder.Length())
+ , ValuePos(trie.EmptyValue)
+ {
+ }
+
+ explicit TSearchIterator(const TTrie& trie, const TTrie& subTrie)
+ : Trie(&trie)
+ , DataPos(subTrie.Data().AsCharPtr())
+ , DataEnd(trie.DataHolder.AsCharPtr() + trie.DataHolder.Length())
+ , ValuePos(subTrie.EmptyValue)
+ {
+ }
+
+ bool operator==(const TSearchIterator& other) const {
+ Y_ASSERT(Trie && other.Trie);
+ return Trie == other.Trie &&
+ DataPos == other.DataPos &&
+ DataEnd == other.DataEnd &&
+ ValuePos == other.ValuePos;
+ }
+ bool operator!=(const TSearchIterator& other) const {
+ return !(*this == other);
+ }
+
+ inline bool Advance(TSymbol label) {
+ Y_ASSERT(Trie);
+ if (DataPos == nullptr || DataPos >= DataEnd) {
+ return false;
+ }
+ return NCompactTrie::Advance(DataPos, DataEnd, ValuePos, label, Trie->Packer);
+ }
+ inline bool Advance(const TKeyBuf& key) {
+ return Advance(key.data(), key.size());
+ }
+ bool Advance(const TSymbol* key, size_t keylen);
+ bool GetValue(TData* value = nullptr) const;
+ bool HasValue() const;
+ inline size_t GetHash() const;
+
+private:
+ const TTrie* Trie = nullptr;
+ const char* DataPos = nullptr;
+ const char* DataEnd = nullptr;
+ const char* ValuePos = nullptr;
+};
+
+template <class TTrie>
+inline TSearchIterator<TTrie> MakeSearchIterator(const TTrie& trie) {
+ return TSearchIterator<TTrie>(trie);
+}
+
+template <class TTrie>
+struct THash<TSearchIterator<TTrie>> {
+ inline size_t operator()(const TSearchIterator<TTrie>& item) {
+ return item.GetHash();
+ }
+};
+
+//----------------------------------------------------------------------------
+
+template <class TTrie>
+bool TSearchIterator<TTrie>::Advance(const TSymbol* key, size_t keylen) {
+ Y_ASSERT(Trie);
+ if (!key || DataPos == nullptr || DataPos >= DataEnd) {
+ return false;
+ }
+ if (!keylen) {
+ return true;
+ }
+
+ const TSymbol* keyend = key + keylen;
+ while (key != keyend && DataPos != nullptr) {
+ if (!NCompactTrie::Advance(DataPos, DataEnd, ValuePos, *(key++), Trie->Packer)) {
+ return false;
+ }
+ if (key == keyend) {
+ return true;
+ }
+ }
+ return false;
+}
+
+template <class TTrie>
+bool TSearchIterator<TTrie>::GetValue(TData* value) const {
+ Y_ASSERT(Trie);
+ bool result = false;
+ if (value) {
+ if (ValuePos) {
+ result = true;
+ Trie->Packer.UnpackLeaf(ValuePos, *value);
+ }
+ }
+ return result;
+}
+
+template <class TTrie>
+bool TSearchIterator<TTrie>::HasValue() const {
+ Y_ASSERT(Trie);
+ return ValuePos;
+}
+
+template <class TTrie>
+inline size_t TSearchIterator<TTrie>::GetHash() const {
+ Y_ASSERT(Trie);
+ return MultiHash(
+ static_cast<const void*>(Trie),
+ static_cast<const void*>(DataPos),
+ static_cast<const void*>(DataEnd),
+ static_cast<const void*>(ValuePos));
+}
diff --git a/library/cpp/containers/comptrie/set.h b/library/cpp/containers/comptrie/set.h
new file mode 100644
index 0000000000..acd43338f0
--- /dev/null
+++ b/library/cpp/containers/comptrie/set.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include "comptrie_trie.h"
+
+template <typename T = char>
+class TCompactTrieSet: public TCompactTrie<T, ui8, TNullPacker<ui8>> {
+public:
+ typedef TCompactTrie<T, ui8, TNullPacker<ui8>> TBase;
+
+ using typename TBase::TBuilder;
+ using typename TBase::TKey;
+ using typename TBase::TKeyBuf;
+ using typename TBase::TSymbol;
+
+ TCompactTrieSet() = default;
+
+ explicit TCompactTrieSet(const TBlob& data)
+ : TBase(data)
+ {
+ }
+
+ template <typename D>
+ explicit TCompactTrieSet(const TCompactTrie<T, D, TNullPacker<D>>& trie)
+ : TBase(trie.Data()) // should be binary compatible for any D
+ {
+ }
+
+ TCompactTrieSet(const char* data, size_t len)
+ : TBase(data, len)
+ {
+ }
+
+ bool Has(const typename TBase::TKeyBuf& key) const {
+ return TBase::Find(key.data(), key.size());
+ }
+
+ bool FindTails(const typename TBase::TKeyBuf& key, TCompactTrieSet<T>& res) const {
+ return TBase::FindTails(key, res);
+ }
+};
diff --git a/library/cpp/containers/comptrie/ut/ya.make b/library/cpp/containers/comptrie/ut/ya.make
new file mode 100644
index 0000000000..c4f4666009
--- /dev/null
+++ b/library/cpp/containers/comptrie/ut/ya.make
@@ -0,0 +1,9 @@
+UNITTEST_FOR(library/cpp/containers/comptrie)
+
+OWNER(alzobnin)
+
+SRCS(
+ comptrie_ut.cpp
+)
+
+END()
diff --git a/library/cpp/containers/comptrie/write_trie_backwards.cpp b/library/cpp/containers/comptrie/write_trie_backwards.cpp
new file mode 100644
index 0000000000..fd8c28b0ed
--- /dev/null
+++ b/library/cpp/containers/comptrie/write_trie_backwards.cpp
@@ -0,0 +1,110 @@
+#include "write_trie_backwards.h"
+
+#include "comptrie_impl.h"
+#include "leaf_skipper.h"
+
+#include <util/generic/buffer.h>
+#include <util/generic/vector.h>
+
+namespace NCompactTrie {
+ size_t WriteTrieBackwards(IOutputStream& os, TReverseNodeEnumerator& enumerator, bool verbose) {
+ if (verbose) {
+ Cerr << "Writing down the trie..." << Endl;
+ }
+
+ // Rewrite everything from the back, removing unused pieces.
+ const size_t chunksize = 0x10000;
+ TVector<char*> resultData;
+
+ resultData.push_back(new char[chunksize]);
+ char* chunkend = resultData.back() + chunksize;
+
+ size_t resultLength = 0;
+ size_t chunkLength = 0;
+
+ size_t counter = 0;
+ TBuffer bufferHolder;
+ while (enumerator.Move()) {
+ if (verbose)
+ ShowProgress(++counter);
+
+ size_t bufferLength = 64 + enumerator.GetLeafLength(); // never know how big leaf data can be
+ bufferHolder.Clear();
+ bufferHolder.Resize(bufferLength);
+ char* buffer = bufferHolder.Data();
+
+ size_t nodelength = enumerator.RecreateNode(buffer, resultLength);
+ Y_ASSERT(nodelength <= bufferLength);
+
+ resultLength += nodelength;
+
+ if (chunkLength + nodelength <= chunksize) {
+ chunkLength += nodelength;
+ memcpy(chunkend - chunkLength, buffer, nodelength);
+ } else { // allocate a new chunk
+ memcpy(chunkend - chunksize, buffer + nodelength - (chunksize - chunkLength), chunksize - chunkLength);
+ chunkLength = chunkLength + nodelength - chunksize;
+
+ resultData.push_back(new char[chunksize]);
+ chunkend = resultData.back() + chunksize;
+
+ while (chunkLength > chunksize) { // allocate a new chunks
+ chunkLength -= chunksize;
+ memcpy(chunkend - chunksize, buffer + chunkLength, chunksize);
+
+ resultData.push_back(new char[chunksize]);
+ chunkend = resultData.back() + chunksize;
+ }
+
+ memcpy(chunkend - chunkLength, buffer, chunkLength);
+ }
+ }
+
+ if (verbose)
+ Cerr << counter << Endl;
+
+ // Write the whole thing down
+ while (!resultData.empty()) {
+ char* chunk = resultData.back();
+ os.Write(chunk + chunksize - chunkLength, chunkLength);
+ chunkLength = chunksize;
+ delete[] chunk;
+ resultData.pop_back();
+ }
+
+ return resultLength;
+ }
+
+ size_t WriteTrieBackwardsNoAlloc(IOutputStream& os, TReverseNodeEnumerator& enumerator, TOpaqueTrie& trie, EMinimizeMode mode) {
+ char* data = const_cast<char*>(trie.Data);
+ char* end = data + trie.Length;
+ char* pos = end;
+
+ TVector<char> buf(64);
+ while (enumerator.Move()) {
+ size_t nodeLength = enumerator.RecreateNode(nullptr, end - pos);
+ if (nodeLength > buf.size())
+ buf.resize(nodeLength);
+
+ size_t realLength = enumerator.RecreateNode(buf.data(), end - pos);
+ Y_ASSERT(realLength == nodeLength);
+
+ pos -= nodeLength;
+ memcpy(pos, buf.data(), nodeLength);
+ }
+
+ switch (mode) {
+ case MM_NOALLOC:
+ os.Write(pos, end - pos);
+ break;
+ case MM_INPLACE:
+ memmove(data, pos, end - pos);
+ break;
+ default:
+ Y_VERIFY(false);
+ }
+
+ return end - pos;
+ }
+
+}
diff --git a/library/cpp/containers/comptrie/write_trie_backwards.h b/library/cpp/containers/comptrie/write_trie_backwards.h
new file mode 100644
index 0000000000..634e6b811a
--- /dev/null
+++ b/library/cpp/containers/comptrie/write_trie_backwards.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "minimize.h"
+
+#include <util/generic/vector.h>
+#include <util/stream/output.h>
+#include <cstddef>
+
+namespace NCompactTrie {
+ class TReverseNodeEnumerator {
+ public:
+ virtual ~TReverseNodeEnumerator() = default;
+ virtual bool Move() = 0;
+ virtual size_t GetLeafLength() const = 0;
+ virtual size_t RecreateNode(char* buffer, size_t resultLength) = 0;
+ };
+
+ struct TOpaqueTrie;
+
+ size_t WriteTrieBackwards(IOutputStream& os, TReverseNodeEnumerator& enumerator, bool verbose);
+ size_t WriteTrieBackwardsNoAlloc(IOutputStream& os, TReverseNodeEnumerator& enumerator, TOpaqueTrie& trie, EMinimizeMode mode);
+
+}
diff --git a/library/cpp/containers/comptrie/writeable_node.cpp b/library/cpp/containers/comptrie/writeable_node.cpp
new file mode 100644
index 0000000000..404003dbbd
--- /dev/null
+++ b/library/cpp/containers/comptrie/writeable_node.cpp
@@ -0,0 +1,96 @@
+#include "writeable_node.h"
+#include "node.h"
+#include "comptrie_impl.h"
+
+namespace NCompactTrie {
+ TWriteableNode::TWriteableNode()
+ : LeafPos(nullptr)
+ , LeafLength(0)
+ , ForwardOffset(NPOS)
+ , LeftOffset(NPOS)
+ , RightOffset(NPOS)
+ , Label(0)
+ {
+ }
+
+ static size_t GetOffsetFromEnd(const TNode& node, size_t absOffset) {
+ return absOffset ? absOffset - node.GetOffset() - node.GetCoreLength() : NPOS;
+ }
+
+ TWriteableNode::TWriteableNode(const TNode& node, const char* data)
+ : LeafPos(node.IsFinal() ? data + node.GetLeafOffset() : nullptr)
+ , LeafLength(node.GetLeafLength())
+ , ForwardOffset(GetOffsetFromEnd(node, node.GetForwardOffset()))
+ , LeftOffset(GetOffsetFromEnd(node, node.GetLeftOffset()))
+ , RightOffset(GetOffsetFromEnd(node, node.GetRightOffset()))
+ , Label(node.GetLabel())
+ {
+ }
+
+ size_t TWriteableNode::Measure() const {
+ size_t len = 2 + LeafLength;
+ size_t fwdLen = 0;
+ size_t lastLen = 0;
+ size_t lastFwdLen = 0;
+ // Now, increase all the offsets by the length and recalculate everything, until it converges
+ do {
+ lastLen = len;
+ lastFwdLen = fwdLen;
+
+ len = 2 + LeafLength;
+ len += MeasureOffset(LeftOffset != NPOS ? LeftOffset + lastLen : 0);
+ len += MeasureOffset(RightOffset != NPOS ? RightOffset + lastLen : 0);
+
+ // Relative forward offset of 0 means we don't need extra length for an epsilon link.
+ // But an epsilon link means we need an extra 1 for the flags and the forward offset is measured
+ // from the start of the epsilon link, not from the start of our node.
+ if (ForwardOffset != NPOS && ForwardOffset != 0) {
+ fwdLen = MeasureOffset(ForwardOffset + lastFwdLen) + 1;
+ len += fwdLen;
+ }
+
+ } while (lastLen != len || lastFwdLen != fwdLen);
+
+ return len;
+ }
+
+ size_t TWriteableNode::Pack(char* buffer) const {
+ const size_t length = Measure();
+
+ char flags = 0;
+ if (LeafPos) {
+ flags |= MT_FINAL;
+ }
+ if (ForwardOffset != NPOS) {
+ flags |= MT_NEXT;
+ }
+
+ const size_t leftOffset = LeftOffset != NPOS ? LeftOffset + length : 0;
+ const size_t rightOffset = RightOffset != NPOS ? RightOffset + length : 0;
+ const size_t leftOffsetSize = MeasureOffset(leftOffset);
+ const size_t rightOffsetSize = MeasureOffset(rightOffset);
+ flags |= (leftOffsetSize << MT_LEFTSHIFT);
+ flags |= (rightOffsetSize << MT_RIGHTSHIFT);
+
+ buffer[0] = flags;
+ buffer[1] = Label;
+ size_t usedLen = 2;
+ usedLen += PackOffset(buffer + usedLen, leftOffset);
+ usedLen += PackOffset(buffer + usedLen, rightOffset);
+
+ if (LeafPos && LeafLength) {
+ memcpy(buffer + usedLen, LeafPos, LeafLength);
+ usedLen += LeafLength;
+ }
+
+ if (ForwardOffset != NPOS && ForwardOffset != 0) {
+ const size_t fwdOffset = ForwardOffset + length - usedLen;
+ size_t fwdOffsetSize = MeasureOffset(fwdOffset);
+ buffer[usedLen++] = (char)(fwdOffsetSize & MT_SIZEMASK);
+ usedLen += PackOffset(buffer + usedLen, fwdOffset);
+ }
+ Y_ASSERT(usedLen == length);
+ return usedLen;
+ }
+
+}
diff --git a/library/cpp/containers/comptrie/writeable_node.h b/library/cpp/containers/comptrie/writeable_node.h
new file mode 100644
index 0000000000..5454e579ef
--- /dev/null
+++ b/library/cpp/containers/comptrie/writeable_node.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <cstddef>
+
+namespace NCompactTrie {
+ class TNode;
+
+ class TWriteableNode {
+ public:
+ const char* LeafPos;
+ size_t LeafLength;
+
+ size_t ForwardOffset;
+ size_t LeftOffset;
+ size_t RightOffset;
+ char Label;
+
+ TWriteableNode();
+ TWriteableNode(const TNode& node, const char* data);
+
+ // When you call this, the offsets should be relative to the end of the node. Use NPOS to indicate an absent offset.
+ size_t Pack(char* buffer) const;
+ size_t Measure() const;
+ };
+
+}
diff --git a/library/cpp/containers/comptrie/ya.make b/library/cpp/containers/comptrie/ya.make
new file mode 100644
index 0000000000..81352da4b2
--- /dev/null
+++ b/library/cpp/containers/comptrie/ya.make
@@ -0,0 +1,35 @@
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+ array_with_size.h
+ chunked_helpers_trie.h
+ comptrie.h
+ comptrie_packer.h
+ comptrie_trie.h
+ first_symbol_iterator.h
+ key_selector.h
+ leaf_skipper.h
+ set.h
+ comptrie.cpp
+ comptrie_builder.cpp
+ comptrie_impl.cpp
+ make_fast_layout.cpp
+ minimize.cpp
+ node.cpp
+ opaque_trie_iterator.cpp
+ prefix_iterator.cpp
+ search_iterator.cpp
+ write_trie_backwards.cpp
+ writeable_node.cpp
+)
+
+PEERDIR(
+ library/cpp/packers
+ library/cpp/containers/compact_vector
+ library/cpp/on_disk/chunks
+ util/draft
+)
+
+END()