diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:15 +0300 |
commit | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch) | |
tree | da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/containers/comptrie/minimize.cpp | |
parent | 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff) | |
download | ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/containers/comptrie/minimize.cpp')
-rw-r--r-- | library/cpp/containers/comptrie/minimize.cpp | 678 |
1 file changed, 339 insertions, 339 deletions
#include <util/generic/algorithm.h>

namespace NCompactTrie {
    // Minimize the trie. The result is equivalent to the original
    // trie, except that it takes less space (and has marginally lower
    // performance, because of eventual epsilon links).
    // The algorithm is as follows: starting from the largest pieces, we find
    // nodes that have identical continuations (Daciuk's right language),
    // and repack the trie. Repacking is done in-place, so memory is less
    // of an issue; however, it may take considerable time.

    // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
    // Because of non-local structure and epsilon links, it won't work
    // as you expect it to, and can destroy the trie in the making.

    namespace {
        // Offsets of trie pieces, grouped by piece length (TPieceIndex maps length -> offsets).
        using TOffsetList = TVector<size_t>;
        using TPieceIndex = THashMap<size_t, TOffsetList>;

        using TSizePair = std::pair<size_t, size_t>;
        using TSizePairVector = TVector<TSizePair>;
        using TSizePairVectorVector = TVector<TSizePair>::value_type == TSizePair ? TVector<TSizePairVector> : TVector<TSizePairVector>;

        // A simple open hash map from size_t keys to size_t values.
        // Buckets are selected by the key's low 16 bits; each bucket is a
        // linearly scanned vector of (key, value) pairs.
        // NOTE(review): a value of 0 is indistinguishable from "absent"
        // (see Contains/Get) — callers must never map a key to 0. This holds
        // here because offset 0 is the trie root and is never a merge target.
        class TOffsetMap {
        protected:
            TSizePairVectorVector Data;

        public:
            TOffsetMap() {
                // Pre-size for the full 16-bit bucket space to avoid rehash-like resizes.
                Data.reserve(0x10000);
            }

            // Insert or overwrite the mapping key -> value.
            void Add(size_t key, size_t value) {
                // Despite the name, this is the LOW 16 bits of the key.
                size_t hikey = key & 0xFFFF;

                if (Data.size() <= hikey)
                    Data.resize(hikey + 1);

                TSizePairVector& sublist = Data[hikey];

                for (auto& it : sublist) {
                    if (it.first == key) {
                        it.second = value;

                        return;
                    }
                }

                sublist.push_back(TSizePair(key, value));
            }

            // True iff the key is mapped (to a nonzero value — see class note).
            bool Contains(size_t key) const {
                return (Get(key) != 0);
            }

            // Returns the mapped value, or 0 when the key is absent.
            size_t Get(size_t key) const {
                // Despite the name, this is the LOW 16 bits of the key.
                size_t hikey = key & 0xFFFF;

                if (Data.size() <= hikey)
                    return 0;

                const TSizePairVector& sublist = Data[hikey];

                for (const auto& it : sublist) {
                    if (it.first == key)
                        return it.second;
                }

                return 0;
            }
        };

        // A compressed monotone mapping old-position -> new-position.
        // Only stores entries where the delta (value - key) changes; lookups
        // interpolate from the nearest preceding entry. Keys must be added
        // in strictly increasing order.
        class TOffsetDeltas {
        protected:
            TSizePairVector Data;

        public:
            // Record that old position `key` now lives at new position `value`.
            void Add(size_t key, size_t value) {
                if (Data.empty()) {
                    if (key == value)
                        return; // no offset
                } else {
                    TSizePair last = Data.back();

                    if (key <= last.first) {
                        // Out-of-order insertion would corrupt the binary search; drop it.
                        Cerr << "Trouble: elements to offset delta list added in wrong order" << Endl;

                        return;
                    }

                    if (last.first + value == last.second + key)
                        return; // same offset
                }

                Data.push_back(TSizePair(key, value));
            }

            // Translate an old position to its new position.
            size_t Get(size_t key) const {
                if (Data.empty())
                    return key; // difference is zero;

                if (key < Data.front().first)
                    return key;

                // Binary search for the highest entry in the list that does not exceed the key
                size_t from = 0;
                size_t to = Data.size() - 1;

                while (from < to) {
                    size_t midpoint = (from + to + 1) / 2;

                    if (key < Data[midpoint].first)
                        to = midpoint - 1;
                    else
                        from = midpoint;
                }

                TSizePair entry = Data[from];

                return key - entry.first + entry.second;
            }
        };

        // Comparison functor ordering piece offsets by the raw bytes of the
        // fixed-length piece they start (descending), with descending offset
        // as the tie-breaker so equal pieces are visited in reverse order
        // of appearance.
        class TPieceComparer {
        private:
            const char* Data;
            const size_t Length;

        public:
            TPieceComparer(const char* buf, size_t len)
                : Data(buf)
                , Length(len)
            {
            }

            bool operator()(size_t p1, const size_t p2) {
                int res = memcmp(Data + p1, Data + p2, Length);

                if (res)
                    return (res > 0);

                return (p1 > p2); // the pieces are sorted in the reverse order of appearance
            }
        };

        // A node on the DFS stack together with a cursor (Selector) recording
        // which of its children have already been visited.
        struct TBranchPoint {
            TNode Node;
            int Selector;

        public:
            TBranchPoint()
                : Selector(0)
            {
            }

            TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction)
                : Node(data, offset, skipFunction)
                , Selector(0)
            {
            }

            bool IsFinal() const {
                return Node.IsFinal();
            }

            // NextNode returns child nodes, starting from the last node: Right, then Left, then Forward
            // Children already merged into another node (present in mergedNodes) are skipped.
            // Returns 0 when all children are exhausted.
            size_t NextNode(const TOffsetMap& mergedNodes) {
                while (Selector < 3) {
                    size_t nextOffset = 0;

                    switch (++Selector) {
                        case 1:
                            if (Node.GetRightOffset())
                                nextOffset = Node.GetRightOffset();
                            break;

                        case 2:
                            if (Node.GetLeftOffset())
                                nextOffset = Node.GetLeftOffset();
                            break;

                        case 3:
                            if (Node.GetForwardOffset())
                                nextOffset = Node.GetForwardOffset();
                            break;

                        default:
                            break;
                    }

                    if (nextOffset && !mergedNodes.Contains(nextOffset))
                        return nextOffset;
                }
                return 0;
            }
        };

        // Enumerates trie nodes in reverse (post-order DFS from the root),
        // skipping nodes that were merged away, and rewrites each surviving
        // node with child offsets recalculated for the repacked layout.
        class TMergingReverseNodeEnumerator: public TReverseNodeEnumerator {
        private:
            bool Fresh;                    // true until the first Move()
            TOpaqueTrie Trie;
            const TOffsetMap& MergeMap;    // duplicate-node offset -> representative offset
            TVector<TBranchPoint> Trace;   // explicit DFS stack
            TOffsetDeltas OffsetIndex;     // old end-relative position -> new position
        public:
            TMergingReverseNodeEnumerator(const TOpaqueTrie& trie, const TOffsetMap& mergers)
                : Fresh(true)
                , Trie(trie)
                , MergeMap(mergers)
            {
            }

            // Advance to the next node in post-order; returns false when done.
            bool Move() override {
                if (Fresh) {
                    Trace.push_back(TBranchPoint(Trie.Data, 0, Trie.SkipFunction));
                    Fresh = false;
                } else {
                    if (Trace.empty())
                        return false;

                    Trace.pop_back();

                    if (Trace.empty())
                        return false;
                }

                // Descend to the deepest unvisited node (post-order: children first).
                size_t nextnode = Trace.back().NextNode(MergeMap);

                while (nextnode) {
                    Trace.push_back(TBranchPoint(Trie.Data, nextnode, Trie.SkipFunction));
                    nextnode = Trace.back().NextNode(MergeMap);
                }

                return (!Trace.empty());
            }

            const TNode& Get() const {
                return Trace.back().Node;
            }
            size_t GetLeafLength() const override {
                return Get().GetLeafLength();
            }

            // Returns recalculated offset from the end of the current node
            size_t PrepareOffset(size_t absoffset, size_t minilength) {
                if (!absoffset)
                    return NPOS;

                // Redirect links into merged-away nodes to their representatives.
                if (MergeMap.Contains(absoffset))
                    absoffset = MergeMap.Get(absoffset);
                return minilength - OffsetIndex.Get(Trie.Length - absoffset);
            }

            // Measure (buffer == nullptr) or pack the current node into `buffer`,
            // given that `resultLength` bytes of output have been produced so far.
            // Returns the node's packed length.
            size_t RecreateNode(char* buffer, size_t resultLength) override {
                TWriteableNode newNode(Get(), Trie.Data);
                newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength);
                newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength);
                newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength);

                if (!buffer)
                    return newNode.Measure();

                const size_t len = newNode.Pack(buffer);
                // Remember where this node landed so parents can fix their links.
                OffsetIndex.Add(Trie.Length - Get().GetOffset(), resultLength + len);

                return len;
            }
        };
    }

    // Register a trie piece [offset, offset + len) in the length-keyed index.
    static void AddPiece(TPieceIndex& index, size_t offset, size_t len) {
        index[len].push_back(offset);
    }

    // Find byte-identical subtries of length >= minMergeSize and return a map
    // from each duplicate's offset to the representative occurrence it can be
    // merged with. Pieces are processed longest-first so larger merges win.
    static TOffsetMap FindEquivalentSubtries(const TOpaqueTrie& trie, bool verbose, size_t minMergeSize) {
        // Tree nodes, arranged by span length.
        // When all nodes of a given size are considered, they pop off the queue.
        TPieceIndex subtries;
        TOffsetMap merger;
        // Start walking the trie from head.
        AddPiece(subtries, 0, trie.Length);

        size_t counter = 0;
        // Now consider all nodes with sizeable continuations
        for (size_t curlen = trie.Length; curlen >= minMergeSize && !subtries.empty(); curlen--) {
            TPieceIndex::iterator iit = subtries.find(curlen);

            if (iit == subtries.end())
                continue; // fast forward to the next available length value

            // Sort so that identical pieces become adjacent.
            TOffsetList& batch = iit->second;
            TPieceComparer comparer(trie.Data, curlen);
            Sort(batch.begin(), batch.end(), comparer);

            TOffsetList::iterator it = batch.begin();
            while (it != batch.end()) {
                if (verbose)
                    ShowProgress(++counter);

                size_t offset = *it;

                // Fill the array with the subnodes of the element
                // Each child's span runs from its offset up to the start of the
                // previously recorded sibling (children are laid out contiguously).
                TNode node(trie.Data, offset, trie.SkipFunction);
                size_t end = offset + curlen;
                if (size_t rightOffset = node.GetRightOffset()) {
                    AddPiece(subtries, rightOffset, end - rightOffset);
                    end = rightOffset;
                }
                if (size_t leftOffset = node.GetLeftOffset()) {
                    AddPiece(subtries, leftOffset, end - leftOffset);
                    end = leftOffset;
                }
                if (size_t forwardOffset = node.GetForwardOffset()) {
                    AddPiece(subtries, forwardOffset, end - forwardOffset);
                }

                while (++it != batch.end()) {
                    // Find next different; until then, just add the offsets to the list of merged nodes.
                    size_t nextoffset = *it;

                    if (memcmp(trie.Data + offset, trie.Data + nextoffset, curlen))
                        break;

                    merger.Add(nextoffset, offset);
                }
            }

            subtries.erase(curlen);
        }
        if (verbose) {
            Cerr << counter << Endl;
        }
        return merger;
    }

    // Entry point: minimize `trie` and write the repacked result to `os`.
    // Returns the number of bytes written, or 0 for an empty trie.
    size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode) {
        if (!trie.Data || !trie.Length) {
            return 0;
        }

        TOffsetMap merger = FindEquivalentSubtries(trie, verbose, minMergeSize);
        TMergingReverseNodeEnumerator enumerator(trie, merger);

        if (mode == MM_DEFAULT)
            return WriteTrieBackwards(os, enumerator, verbose);
        else
            return WriteTrieBackwardsNoAlloc(os, enumerator, trie, mode);
    }
}