author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/containers/comptrie/minimize.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz | |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/containers/comptrie/minimize.cpp')
-rw-r--r-- | library/cpp/containers/comptrie/minimize.cpp | 359 |
1 file changed, 359 insertions, 0 deletions
```diff
diff --git a/library/cpp/containers/comptrie/minimize.cpp b/library/cpp/containers/comptrie/minimize.cpp
new file mode 100644
index 0000000000..80d0b25217
--- /dev/null
+++ b/library/cpp/containers/comptrie/minimize.cpp
@@ -0,0 +1,359 @@
+#include "minimize.h"
+#include "node.h"
+#include "writeable_node.h"
+#include "write_trie_backwards.h"
+#include "comptrie_impl.h"
+
+#include <util/generic/hash.h>
+#include <util/generic/algorithm.h>
+
+namespace NCompactTrie {
+    // Minimize the trie. The result is equivalent to the original
+    // trie, except that it takes less space (and has marginally lower
+    // performance, because of eventual epsilon links).
+    // The algorithm is as follows: starting from the largest pieces, we find
+    // nodes that have identical continuations (Daciuk's right language),
+    // and repack the trie. Repacking is done in-place, so memory is less
+    // of an issue; however, it may take considerable time.
+
+    // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
+    // Because of non-local structure and epsilon links, it won't work
+    // as you expect it to, and can destroy the trie in the making.
+
+    namespace {
+        using TOffsetList = TVector<size_t>;
+        using TPieceIndex = THashMap<size_t, TOffsetList>;
+
+        using TSizePair = std::pair<size_t, size_t>;
+        using TSizePairVector = TVector<TSizePair>;
+        using TSizePairVectorVector = TVector<TSizePairVector>;
+
+        class TOffsetMap {
+        protected:
+            TSizePairVectorVector Data;
+
+        public:
+            TOffsetMap() {
+                Data.reserve(0x10000);
+            }
+
+            void Add(size_t key, size_t value) {
+                size_t hikey = key & 0xFFFF;
+
+                if (Data.size() <= hikey)
+                    Data.resize(hikey + 1);
+
+                TSizePairVector& sublist = Data[hikey];
+
+                for (auto& it : sublist) {
+                    if (it.first == key) {
+                        it.second = value;
+
+                        return;
+                    }
+                }
+
+                sublist.push_back(TSizePair(key, value));
+            }
+
+            bool Contains(size_t key) const {
+                return (Get(key) != 0);
+            }
+
+            size_t Get(size_t key) const {
+                size_t hikey = key & 0xFFFF;
+
+                if (Data.size() <= hikey)
+                    return 0;
+
+                const TSizePairVector& sublist = Data[hikey];
+
+                for (const auto& it : sublist) {
+                    if (it.first == key)
+                        return it.second;
+                }
+
+                return 0;
+            }
+        };
+
+        class TOffsetDeltas {
+        protected:
+            TSizePairVector Data;
+
+        public:
+            void Add(size_t key, size_t value) {
+                if (Data.empty()) {
+                    if (key == value)
+                        return; // no offset
+                } else {
+                    TSizePair last = Data.back();
+
+                    if (key <= last.first) {
+                        Cerr << "Trouble: elements to offset delta list added in wrong order" << Endl;
+
+                        return;
+                    }
+
+                    if (last.first + value == last.second + key)
+                        return; // same offset
+                }
+
+                Data.push_back(TSizePair(key, value));
+            }
+
+            size_t Get(size_t key) const {
+                if (Data.empty())
+                    return key; // difference is zero
+
+                if (key < Data.front().first)
+                    return key;
+
+                // Binary search for the highest entry in the list that does not exceed the key
+                size_t from = 0;
+                size_t to = Data.size() - 1;
+
+                while (from < to) {
+                    size_t midpoint = (from + to + 1) / 2;
+
+                    if (key < Data[midpoint].first)
+                        to = midpoint - 1;
+                    else
+                        from = midpoint;
+                }
+
+                TSizePair entry = Data[from];
+
+                return key - entry.first + entry.second;
+            }
+        };
+
+        class TPieceComparer {
+        private:
+            const char* Data;
+            const size_t Length;
+
+        public:
+            TPieceComparer(const char* buf, size_t len)
+                : Data(buf)
+                , Length(len)
+            {
+            }
+
+            bool operator()(size_t p1, const size_t p2) {
+                int res = memcmp(Data + p1, Data + p2, Length);
+
+                if (res)
+                    return (res > 0);
+
+                return (p1 > p2); // the pieces are sorted in the reverse order of appearance
+            }
+        };
+
+        struct TBranchPoint {
+            TNode Node;
+            int Selector;
+
+        public:
+            TBranchPoint()
+                : Selector(0)
+            {
+            }
+
+            TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction)
+                : Node(data, offset, skipFunction)
+                , Selector(0)
+            {
+            }
+
+            bool IsFinal() const {
+                return Node.IsFinal();
+            }
+
+            // NextNode returns child nodes, starting from the last node: Right, then Left, then Forward
+            size_t NextNode(const TOffsetMap& mergedNodes) {
+                while (Selector < 3) {
+                    size_t nextOffset = 0;
+
+                    switch (++Selector) {
+                        case 1:
+                            if (Node.GetRightOffset())
+                                nextOffset = Node.GetRightOffset();
+                            break;
+
+                        case 2:
+                            if (Node.GetLeftOffset())
+                                nextOffset = Node.GetLeftOffset();
+                            break;
+
+                        case 3:
+                            if (Node.GetForwardOffset())
+                                nextOffset = Node.GetForwardOffset();
+                            break;
+
+                        default:
+                            break;
+                    }
+
+                    if (nextOffset && !mergedNodes.Contains(nextOffset))
+                        return nextOffset;
+                }
+                return 0;
+            }
+        };
+
+        class TMergingReverseNodeEnumerator: public TReverseNodeEnumerator {
+        private:
+            bool Fresh;
+            TOpaqueTrie Trie;
+            const TOffsetMap& MergeMap;
+            TVector<TBranchPoint> Trace;
+            TOffsetDeltas OffsetIndex;
+
+        public:
+            TMergingReverseNodeEnumerator(const TOpaqueTrie& trie, const TOffsetMap& mergers)
+                : Fresh(true)
+                , Trie(trie)
+                , MergeMap(mergers)
+            {
+            }
+
+            bool Move() override {
+                if (Fresh) {
+                    Trace.push_back(TBranchPoint(Trie.Data, 0, Trie.SkipFunction));
+                    Fresh = false;
+                } else {
+                    if (Trace.empty())
+                        return false;
+
+                    Trace.pop_back();
+
+                    if (Trace.empty())
+                        return false;
+                }
+
+                size_t nextnode = Trace.back().NextNode(MergeMap);
+
+                while (nextnode) {
+                    Trace.push_back(TBranchPoint(Trie.Data, nextnode, Trie.SkipFunction));
+                    nextnode = Trace.back().NextNode(MergeMap);
+                }
+
+                return (!Trace.empty());
+            }
+
+            const TNode& Get() const {
+                return Trace.back().Node;
+            }
+            size_t GetLeafLength() const override {
+                return Get().GetLeafLength();
+            }
+
+            // Returns recalculated offset from the end of the current node
+            size_t PrepareOffset(size_t absoffset, size_t minilength) {
+                if (!absoffset)
+                    return NPOS;
+
+                if (MergeMap.Contains(absoffset))
+                    absoffset = MergeMap.Get(absoffset);
+                return minilength - OffsetIndex.Get(Trie.Length - absoffset);
+            }
+
+            size_t RecreateNode(char* buffer, size_t resultLength) override {
+                TWriteableNode newNode(Get(), Trie.Data);
+                newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength);
+                newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength);
+                newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength);
+
+                if (!buffer)
+                    return newNode.Measure();
+
+                const size_t len = newNode.Pack(buffer);
+                OffsetIndex.Add(Trie.Length - Get().GetOffset(), resultLength + len);
+
+                return len;
+            }
+        };
+
+    }
+
+    static void AddPiece(TPieceIndex& index, size_t offset, size_t len) {
+        index[len].push_back(offset);
+    }
+
+    static TOffsetMap FindEquivalentSubtries(const TOpaqueTrie& trie, bool verbose, size_t minMergeSize) {
+        // Tree nodes, arranged by span length.
+        // When all nodes of a given size are considered, they pop off the queue.
+        TPieceIndex subtries;
+        TOffsetMap merger;
+        // Start walking the trie from head.
+        AddPiece(subtries, 0, trie.Length);
+
+        size_t counter = 0;
+        // Now consider all nodes with sizeable continuations
+        for (size_t curlen = trie.Length; curlen >= minMergeSize && !subtries.empty(); curlen--) {
+            TPieceIndex::iterator iit = subtries.find(curlen);
+
+            if (iit == subtries.end())
+                continue; // fast forward to the next available length value
+
+            TOffsetList& batch = iit->second;
+            TPieceComparer comparer(trie.Data, curlen);
+            Sort(batch.begin(), batch.end(), comparer);
+
+            TOffsetList::iterator it = batch.begin();
+            while (it != batch.end()) {
+                if (verbose)
+                    ShowProgress(++counter);
+
+                size_t offset = *it;
+
+                // Fill the array with the subnodes of the element
+                TNode node(trie.Data, offset, trie.SkipFunction);
+                size_t end = offset + curlen;
+                if (size_t rightOffset = node.GetRightOffset()) {
+                    AddPiece(subtries, rightOffset, end - rightOffset);
+                    end = rightOffset;
+                }
+                if (size_t leftOffset = node.GetLeftOffset()) {
+                    AddPiece(subtries, leftOffset, end - leftOffset);
+                    end = leftOffset;
+                }
+                if (size_t forwardOffset = node.GetForwardOffset()) {
+                    AddPiece(subtries, forwardOffset, end - forwardOffset);
+                }
+
+                while (++it != batch.end()) {
+                    // Find next different; until then, just add the offsets to the list of merged nodes.
+                    size_t nextoffset = *it;
+
+                    if (memcmp(trie.Data + offset, trie.Data + nextoffset, curlen))
+                        break;
+
+                    merger.Add(nextoffset, offset);
+                }
+            }
+
+            subtries.erase(curlen);
+        }
+        if (verbose) {
+            Cerr << counter << Endl;
+        }
+        return merger;
+    }
+
+    size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode) {
+        if (!trie.Data || !trie.Length) {
+            return 0;
+        }
+
+        TOffsetMap merger = FindEquivalentSubtries(trie, verbose, minMergeSize);
+        TMergingReverseNodeEnumerator enumerator(trie, merger);
+
+        if (mode == MM_DEFAULT)
+            return WriteTrieBackwards(os, enumerator, verbose);
+        else
+            return WriteTrieBackwardsNoAlloc(os, enumerator, trie, mode);
+    }
+
+}
```
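The core of the pass in `FindEquivalentSubtries` is a duplicate-span merge: offsets are sorted by the bytes they point at (via `TPieceComparer`), so identical serialized subtries become adjacent, and every later duplicate is redirected to one canonical copy. A standalone sketch of that pattern, with hypothetical names and standard containers in place of `TOffsetMap` (illustrative only, not code from the commit):

```cpp
#include <algorithm>
#include <cstring>
#include <map>
#include <vector>

// Toy version of the duplicate-span merge: sort offsets by the bytes they
// reference (mirroring TPieceComparer's ordering), then map every duplicate
// span onto the first member of its run, as merger.Add does above.
std::map<size_t, size_t> MergeEqualSpans(const char* data,
                                         std::vector<size_t> offsets,
                                         size_t spanLen) {
    std::sort(offsets.begin(), offsets.end(), [&](size_t a, size_t b) {
        int res = std::memcmp(data + a, data + b, spanLen);
        return res ? res > 0 : a > b; // equal spans sort by descending offset
    });

    std::map<size_t, size_t> merged; // duplicate offset -> canonical offset
    size_t i = 0;
    while (i < offsets.size()) {
        size_t canon = offsets[i];
        size_t j = i + 1;
        while (j < offsets.size() &&
               std::memcmp(data + canon, data + offsets[j], spanLen) == 0) {
            merged[offsets[j]] = canon; // same bytes: keep only one copy
            ++j;
        }
        i = j;
    }
    return merged;
}
```

In the real pass this runs once per span length, largest first, and the children of a merged duplicate are never enqueued, so whole subtries are merged before their pieces could be considered separately.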
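The hand-rolled binary search in `TOffsetDeltas::Get` is a predecessor query: find the last recorded `(oldOffset, newOffset)` pair whose key does not exceed the lookup key, then apply that pair's shift. The same lookup expressed with `std::upper_bound`, as a point of comparison (a sketch with hypothetical names, not code from the commit):

```cpp
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Entries are kept sorted by .first, as TOffsetDeltas::Add guarantees.
size_t GetShifted(const std::vector<std::pair<size_t, size_t>>& data, size_t key) {
    // First entry whose key is strictly greater than the lookup key...
    auto it = std::upper_bound(
        data.begin(), data.end(), key,
        [](size_t k, const std::pair<size_t, size_t>& e) { return k < e.first; });
    if (it == data.begin())
        return key; // no entry at or below key: the offset is unchanged
    --it;           // ...so its predecessor is the highest entry <= key
    return key - it->first + it->second;
}
```

Both forms return the key unchanged when no delta entry applies, matching the two early returns in the original.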
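Finally, a minimal sketch of how the entry point might be driven. The `TOpaqueTrie` constructor arguments are assumed from the `Data`/`Length`/`SkipFunction` members used throughout the file; the buffer, skipper, function name, and merge threshold are hypothetical:

```cpp
#include <util/generic/string.h>
#include <util/stream/str.h>

// Hypothetical driver: minimize a freshly packed trie blob into a TString.
// Per the IMPORTANT note in the file, the input must not already be
// minimized or laid out with fast layout.
TString MinimizeToString(const char* packedData, size_t packedSize,
                         const NCompactTrie::ILeafSkipper& skipper) {
    TString result;
    TStringOutput out(result);
    NCompactTrie::TOpaqueTrie trie(packedData, packedSize, skipper); // assumed ctor
    NCompactTrie::RawCompactTrieMinimizeImpl(out, trie, /*verbose=*/false,
                                             /*minMergeSize=*/8, // illustrative threshold
                                             NCompactTrie::MM_DEFAULT);
    return result;
}
```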