aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/containers/comptrie/minimize.cpp
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/containers/comptrie/minimize.cpp
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/containers/comptrie/minimize.cpp')
-rw-r--r--library/cpp/containers/comptrie/minimize.cpp359
1 files changed, 359 insertions, 0 deletions
diff --git a/library/cpp/containers/comptrie/minimize.cpp b/library/cpp/containers/comptrie/minimize.cpp
new file mode 100644
index 0000000000..80d0b25217
--- /dev/null
+++ b/library/cpp/containers/comptrie/minimize.cpp
@@ -0,0 +1,359 @@
+#include "minimize.h"
+#include "node.h"
+#include "writeable_node.h"
+#include "write_trie_backwards.h"
+#include "comptrie_impl.h"
+
+#include <util/generic/hash.h>
+#include <util/generic/algorithm.h>
+
+namespace NCompactTrie {
+ // Minimize the trie. The result is equivalent to the original
+ // trie, except that it takes less space (and has marginally lower
+ // performance, because of eventual epsilon links).
+ // The algorithm is as follows: starting from the largest pieces, we find
+ // nodes that have identical continuations (Daciuk's right language),
+ // and repack the trie. Repacking is done in-place, so memory is less
+ // of an issue; however, it may take considerable time.
+
+ // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
+ // Because of non-local structure and epsilon links, it won't work
+ // as you expect it to, and can destroy the trie in the making.
+
+ namespace {
+ using TOffsetList = TVector<size_t>;
+ using TPieceIndex = THashMap<size_t, TOffsetList>;
+
+ using TSizePair = std::pair<size_t, size_t>;
+ using TSizePairVector = TVector<TSizePair>;
+ using TSizePairVectorVector = TVector<TSizePairVector>;
+
+ class TOffsetMap {
+ protected:
+ TSizePairVectorVector Data;
+
+ public:
+ TOffsetMap() {
+ Data.reserve(0x10000);
+ }
+
+ void Add(size_t key, size_t value) {
+ size_t hikey = key & 0xFFFF;
+
+ if (Data.size() <= hikey)
+ Data.resize(hikey + 1);
+
+ TSizePairVector& sublist = Data[hikey];
+
+ for (auto& it : sublist) {
+ if (it.first == key) {
+ it.second = value;
+
+ return;
+ }
+ }
+
+ sublist.push_back(TSizePair(key, value));
+ }
+
+ bool Contains(size_t key) const {
+ return (Get(key) != 0);
+ }
+
+ size_t Get(size_t key) const {
+ size_t hikey = key & 0xFFFF;
+
+ if (Data.size() <= hikey)
+ return 0;
+
+ const TSizePairVector& sublist = Data[hikey];
+
+ for (const auto& it : sublist) {
+ if (it.first == key)
+ return it.second;
+ }
+
+ return 0;
+ }
+ };
+
+ class TOffsetDeltas {
+ protected:
+ TSizePairVector Data;
+
+ public:
+ void Add(size_t key, size_t value) {
+ if (Data.empty()) {
+ if (key == value)
+ return; // no offset
+ } else {
+ TSizePair last = Data.back();
+
+ if (key <= last.first) {
+ Cerr << "Trouble: elements to offset delta list added in wrong order" << Endl;
+
+ return;
+ }
+
+ if (last.first + value == last.second + key)
+ return; // same offset
+ }
+
+ Data.push_back(TSizePair(key, value));
+ }
+
+ size_t Get(size_t key) const {
+ if (Data.empty())
+ return key; // difference is zero;
+
+ if (key < Data.front().first)
+ return key;
+
+ // Binary search for the highest entry in the list that does not exceed the key
+ size_t from = 0;
+ size_t to = Data.size() - 1;
+
+ while (from < to) {
+ size_t midpoint = (from + to + 1) / 2;
+
+ if (key < Data[midpoint].first)
+ to = midpoint - 1;
+ else
+ from = midpoint;
+ }
+
+ TSizePair entry = Data[from];
+
+ return key - entry.first + entry.second;
+ }
+ };
+
+ class TPieceComparer {
+ private:
+ const char* Data;
+ const size_t Length;
+
+ public:
+ TPieceComparer(const char* buf, size_t len)
+ : Data(buf)
+ , Length(len)
+ {
+ }
+
+ bool operator()(size_t p1, const size_t p2) {
+ int res = memcmp(Data + p1, Data + p2, Length);
+
+ if (res)
+ return (res > 0);
+
+ return (p1 > p2); // the pieces are sorted in the reverse order of appearance
+ }
+ };
+
+ struct TBranchPoint {
+ TNode Node;
+ int Selector;
+
+ public:
+ TBranchPoint()
+ : Selector(0)
+ {
+ }
+
+ TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction)
+ : Node(data, offset, skipFunction)
+ , Selector(0)
+ {
+ }
+
+ bool IsFinal() const {
+ return Node.IsFinal();
+ }
+
+ // NextNode returns child nodes, starting from the last node: Right, then Left, then Forward
+ size_t NextNode(const TOffsetMap& mergedNodes) {
+ while (Selector < 3) {
+ size_t nextOffset = 0;
+
+ switch (++Selector) {
+ case 1:
+ if (Node.GetRightOffset())
+ nextOffset = Node.GetRightOffset();
+ break;
+
+ case 2:
+ if (Node.GetLeftOffset())
+ nextOffset = Node.GetLeftOffset();
+ break;
+
+ case 3:
+ if (Node.GetForwardOffset())
+ nextOffset = Node.GetForwardOffset();
+ break;
+
+ default:
+ break;
+ }
+
+ if (nextOffset && !mergedNodes.Contains(nextOffset))
+ return nextOffset;
+ }
+ return 0;
+ }
+ };
+
+ class TMergingReverseNodeEnumerator: public TReverseNodeEnumerator {
+ private:
+ bool Fresh;
+ TOpaqueTrie Trie;
+ const TOffsetMap& MergeMap;
+ TVector<TBranchPoint> Trace;
+ TOffsetDeltas OffsetIndex;
+
+ public:
+ TMergingReverseNodeEnumerator(const TOpaqueTrie& trie, const TOffsetMap& mergers)
+ : Fresh(true)
+ , Trie(trie)
+ , MergeMap(mergers)
+ {
+ }
+
+ bool Move() override {
+ if (Fresh) {
+ Trace.push_back(TBranchPoint(Trie.Data, 0, Trie.SkipFunction));
+ Fresh = false;
+ } else {
+ if (Trace.empty())
+ return false;
+
+ Trace.pop_back();
+
+ if (Trace.empty())
+ return false;
+ }
+
+ size_t nextnode = Trace.back().NextNode(MergeMap);
+
+ while (nextnode) {
+ Trace.push_back(TBranchPoint(Trie.Data, nextnode, Trie.SkipFunction));
+ nextnode = Trace.back().NextNode(MergeMap);
+ }
+
+ return (!Trace.empty());
+ }
+
+ const TNode& Get() const {
+ return Trace.back().Node;
+ }
+ size_t GetLeafLength() const override {
+ return Get().GetLeafLength();
+ }
+
+ // Returns recalculated offset from the end of the current node
+ size_t PrepareOffset(size_t absoffset, size_t minilength) {
+ if (!absoffset)
+ return NPOS;
+
+ if (MergeMap.Contains(absoffset))
+ absoffset = MergeMap.Get(absoffset);
+ return minilength - OffsetIndex.Get(Trie.Length - absoffset);
+ }
+
+ size_t RecreateNode(char* buffer, size_t resultLength) override {
+ TWriteableNode newNode(Get(), Trie.Data);
+ newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength);
+ newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength);
+ newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength);
+
+ if (!buffer)
+ return newNode.Measure();
+
+ const size_t len = newNode.Pack(buffer);
+ OffsetIndex.Add(Trie.Length - Get().GetOffset(), resultLength + len);
+
+ return len;
+ }
+ };
+
+ }
+
+ static void AddPiece(TPieceIndex& index, size_t offset, size_t len) {
+ index[len].push_back(offset);
+ }
+
+ static TOffsetMap FindEquivalentSubtries(const TOpaqueTrie& trie, bool verbose, size_t minMergeSize) {
+ // Tree nodes, arranged by span length.
+ // When all nodes of a given size are considered, they pop off the queue.
+ TPieceIndex subtries;
+ TOffsetMap merger;
+ // Start walking the trie from head.
+ AddPiece(subtries, 0, trie.Length);
+
+ size_t counter = 0;
+ // Now consider all nodes with sizeable continuations
+ for (size_t curlen = trie.Length; curlen >= minMergeSize && !subtries.empty(); curlen--) {
+ TPieceIndex::iterator iit = subtries.find(curlen);
+
+ if (iit == subtries.end())
+ continue; // fast forward to the next available length value
+
+ TOffsetList& batch = iit->second;
+ TPieceComparer comparer(trie.Data, curlen);
+ Sort(batch.begin(), batch.end(), comparer);
+
+ TOffsetList::iterator it = batch.begin();
+ while (it != batch.end()) {
+ if (verbose)
+ ShowProgress(++counter);
+
+ size_t offset = *it;
+
+ // Fill the array with the subnodes of the element
+ TNode node(trie.Data, offset, trie.SkipFunction);
+ size_t end = offset + curlen;
+ if (size_t rightOffset = node.GetRightOffset()) {
+ AddPiece(subtries, rightOffset, end - rightOffset);
+ end = rightOffset;
+ }
+ if (size_t leftOffset = node.GetLeftOffset()) {
+ AddPiece(subtries, leftOffset, end - leftOffset);
+ end = leftOffset;
+ }
+ if (size_t forwardOffset = node.GetForwardOffset()) {
+ AddPiece(subtries, forwardOffset, end - forwardOffset);
+ }
+
+ while (++it != batch.end()) {
+ // Find next different; until then, just add the offsets to the list of merged nodes.
+ size_t nextoffset = *it;
+
+ if (memcmp(trie.Data + offset, trie.Data + nextoffset, curlen))
+ break;
+
+ merger.Add(nextoffset, offset);
+ }
+ }
+
+ subtries.erase(curlen);
+ }
+ if (verbose) {
+ Cerr << counter << Endl;
+ }
+ return merger;
+ }
+
+ size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode) {
+ if (!trie.Data || !trie.Length) {
+ return 0;
+ }
+
+ TOffsetMap merger = FindEquivalentSubtries(trie, verbose, minMergeSize);
+ TMergingReverseNodeEnumerator enumerator(trie, merger);
+
+ if (mode == MM_DEFAULT)
+ return WriteTrieBackwards(os, enumerator, verbose);
+ else
+ return WriteTrieBackwardsNoAlloc(os, enumerator, trie, mode);
+ }
+
+}