diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:15 +0300 |
commit | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch) | |
tree | da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/containers/comptrie/minimize.cpp | |
parent | 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff) | |
download | ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/containers/comptrie/minimize.cpp')
-rw-r--r-- | library/cpp/containers/comptrie/minimize.cpp | 678 |
1 file changed, 339 insertions, 339 deletions
#include <util/generic/algorithm.h>

namespace NCompactTrie {
    // Minimize the trie. The result is equivalent to the original
    // trie, except that it takes less space (and has marginally lower
    // performance, because of eventual epsilon links).
    // The algorithm is as follows: starting from the largest pieces, we find
    // nodes that have identical continuations (Daciuk's right language),
    // and repack the trie. Repacking is done in-place, so memory is less
    // of an issue; however, it may take considerable time.

    // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
    // Because of non-local structure and epsilon links, it won't work
    // as you expect it to, and can destroy the trie in the making.

    namespace {
        // Offsets of trie pieces, grouped by piece length (TPieceIndex maps length -> offsets).
        using TOffsetList = TVector<size_t>;
        using TPieceIndex = THashMap<size_t, TOffsetList>;

        using TSizePair = std::pair<size_t, size_t>;
        using TSizePairVector = TVector<TSizePair>;
        using TSizePairVectorVector = TVector<TSizePair>::value_type == TSizePair ? TVector<TSizePairVector> : TVector<TSizePairVector>;

        // A simple open hash map from size_t keys to size_t values.
        // Buckets are selected by the key's low 16 bits; each bucket is a
        // linearly scanned vector of (key, value) pairs.
        // NOTE(review): a value of 0 is indistinguishable from "absent"
        // (see Contains/Get) — callers must never map a key to 0. This holds
        // here because offset 0 is the trie root and is never a merge target.
        class TOffsetMap {
        protected:
            TSizePairVectorVector Data;

        public:
            TOffsetMap() {
                // Pre-size for the full 16-bit bucket space to avoid rehash-like resizes.
                Data.reserve(0x10000);
            }

            // Insert or overwrite the mapping key -> value.
            void Add(size_t key, size_t value) {
                // Despite the name, this is the LOW 16 bits of the key.
                size_t hikey = key & 0xFFFF;

                if (Data.size() <= hikey)
                    Data.resize(hikey + 1);

                TSizePairVector& sublist = Data[hikey];

                for (auto& it : sublist) {
                    if (it.first == key) {
                        it.second = value;

                        return;
                    }
                }

                sublist.push_back(TSizePair(key, value));
            }

            // True iff the key is mapped (to a nonzero value — see class note).
            bool Contains(size_t key) const {
                return (Get(key) != 0);
            }

            // Returns the mapped value, or 0 when the key is absent.
            size_t Get(size_t key) const {
                // Despite the name, this is the LOW 16 bits of the key.
                size_t hikey = key & 0xFFFF;

                if (Data.size() <= hikey)
                    return 0;

                const TSizePairVector& sublist = Data[hikey];

                for (const auto& it : sublist) {
                    if (it.first == key)
                        return it.second;
                }

                return 0;
            }
        };

        // A compressed monotone mapping old-position -> new-position.
        // Only stores entries where the delta (value - key) changes; lookups
        // interpolate from the nearest preceding entry. Keys must be added
        // in strictly increasing order.
        class TOffsetDeltas {
        protected:
            TSizePairVector Data;

        public:
            // Record that old position `key` now lives at new position `value`.
            void Add(size_t key, size_t value) {
                if (Data.empty()) {
                    if (key == value)
                        return; // no offset
                } else {
                    TSizePair last = Data.back();

                    if (key <= last.first) {
                        // Out-of-order insertion would corrupt the binary search; drop it.
                        Cerr << "Trouble: elements to offset delta list added in wrong order" << Endl;

                        return;
                    }

                    if (last.first + value == last.second + key)
                        return; // same offset
                }

                Data.push_back(TSizePair(key, value));
            }

            // Translate an old position to its new position.
            size_t Get(size_t key) const {
                if (Data.empty())
                    return key; // difference is zero;

                if (key < Data.front().first)
                    return key;

                // Binary search for the highest entry in the list that does not exceed the key
                size_t from = 0;
                size_t to = Data.size() - 1;

                while (from < to) {
                    size_t midpoint = (from + to + 1) / 2;

                    if (key < Data[midpoint].first)
                        to = midpoint - 1;
                    else
                        from = midpoint;
                }

                TSizePair entry = Data[from];

                return key - entry.first + entry.second;
            }
        };

        // Comparison functor ordering piece offsets by the raw bytes of the
        // fixed-length piece they start (descending), with descending offset
        // as the tie-breaker so equal pieces are visited in reverse order
        // of appearance.
        class TPieceComparer {
        private:
            const char* Data;
            const size_t Length;

        public:
            TPieceComparer(const char* buf, size_t len)
                : Data(buf)
                , Length(len)
            {
            }

            bool operator()(size_t p1, const size_t p2) {
                int res = memcmp(Data + p1, Data + p2, Length);

                if (res)
                    return (res > 0);

                return (p1 > p2); // the pieces are sorted in the reverse order of appearance
            }
        };

        // A node on the DFS stack together with a cursor (Selector) recording
        // which of its children have already been visited.
        struct TBranchPoint {
            TNode Node;
            int Selector;

        public:
            TBranchPoint()
                : Selector(0)
            {
            }

            TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction)
                : Node(data, offset, skipFunction)
                , Selector(0)
            {
            }

            bool IsFinal() const {
                return Node.IsFinal();
            }

            // NextNode returns child nodes, starting from the last node: Right, then Left, then Forward
            // Children already merged into another node (present in mergedNodes) are skipped.
            // Returns 0 when all children are exhausted.
            size_t NextNode(const TOffsetMap& mergedNodes) {
                while (Selector < 3) {
                    size_t nextOffset = 0;

                    switch (++Selector) {
                        case 1:
                            if (Node.GetRightOffset())
                                nextOffset = Node.GetRightOffset();
                            break;

                        case 2:
                            if (Node.GetLeftOffset())
                                nextOffset = Node.GetLeftOffset();
                            break;

                        case 3:
                            if (Node.GetForwardOffset())
                                nextOffset = Node.GetForwardOffset();
                            break;

                        default:
                            break;
                    }

                    if (nextOffset && !mergedNodes.Contains(nextOffset))
                        return nextOffset;
                }
                return 0;
            }
        };

        // Enumerates trie nodes in reverse (post-order DFS from the root),
        // skipping nodes that were merged away, and rewrites each surviving
        // node with child offsets recalculated for the repacked layout.
        class TMergingReverseNodeEnumerator: public TReverseNodeEnumerator {
        private:
            bool Fresh;                    // true until the first Move()
            TOpaqueTrie Trie;
            const TOffsetMap& MergeMap;    // duplicate-node offset -> representative offset
            TVector<TBranchPoint> Trace;   // explicit DFS stack
            TOffsetDeltas OffsetIndex;     // old end-relative position -> new position
        public:
            TMergingReverseNodeEnumerator(const TOpaqueTrie& trie, const TOffsetMap& mergers)
                : Fresh(true)
                , Trie(trie)
                , MergeMap(mergers)
            {
            }

            // Advance to the next node in post-order; returns false when done.
            bool Move() override {
                if (Fresh) {
                    Trace.push_back(TBranchPoint(Trie.Data, 0, Trie.SkipFunction));
                    Fresh = false;
                } else {
                    if (Trace.empty())
                        return false;

                    Trace.pop_back();

                    if (Trace.empty())
                        return false;
                }

                // Descend to the deepest unvisited node (post-order: children first).
                size_t nextnode = Trace.back().NextNode(MergeMap);

                while (nextnode) {
                    Trace.push_back(TBranchPoint(Trie.Data, nextnode, Trie.SkipFunction));
                    nextnode = Trace.back().NextNode(MergeMap);
                }

                return (!Trace.empty());
            }

            const TNode& Get() const {
                return Trace.back().Node;
            }
            size_t GetLeafLength() const override {
                return Get().GetLeafLength();
            }

            // Returns recalculated offset from the end of the current node
            size_t PrepareOffset(size_t absoffset, size_t minilength) {
                if (!absoffset)
                    return NPOS;

                // Redirect links into merged-away nodes to their representatives.
                if (MergeMap.Contains(absoffset))
                    absoffset = MergeMap.Get(absoffset);
                return minilength - OffsetIndex.Get(Trie.Length - absoffset);
            }

            // Measure (buffer == nullptr) or pack the current node into `buffer`,
            // given that `resultLength` bytes of output have been produced so far.
            // Returns the node's packed length.
            size_t RecreateNode(char* buffer, size_t resultLength) override {
                TWriteableNode newNode(Get(), Trie.Data);
                newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength);
                newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength);
                newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength);

                if (!buffer)
                    return newNode.Measure();

                const size_t len = newNode.Pack(buffer);
                // Remember where this node landed so parents can fix their links.
                OffsetIndex.Add(Trie.Length - Get().GetOffset(), resultLength + len);

                return len;
            }
        };
    }

    // Register a trie piece [offset, offset + len) in the length-keyed index.
    static void AddPiece(TPieceIndex& index, size_t offset, size_t len) {
        index[len].push_back(offset);
    }

    // Find byte-identical subtries of length >= minMergeSize and return a map
    // from each duplicate's offset to the representative occurrence it can be
    // merged with. Pieces are processed longest-first so larger merges win.
    static TOffsetMap FindEquivalentSubtries(const TOpaqueTrie& trie, bool verbose, size_t minMergeSize) {
        // Tree nodes, arranged by span length.
        // When all nodes of a given size are considered, they pop off the queue.
        TPieceIndex subtries;
        TOffsetMap merger;
        // Start walking the trie from head.
        AddPiece(subtries, 0, trie.Length);

        size_t counter = 0;
        // Now consider all nodes with sizeable continuations
        for (size_t curlen = trie.Length; curlen >= minMergeSize && !subtries.empty(); curlen--) {
            TPieceIndex::iterator iit = subtries.find(curlen);

            if (iit == subtries.end())
                continue; // fast forward to the next available length value

            // Sort so that identical pieces become adjacent.
            TOffsetList& batch = iit->second;
            TPieceComparer comparer(trie.Data, curlen);
            Sort(batch.begin(), batch.end(), comparer);

            TOffsetList::iterator it = batch.begin();
            while (it != batch.end()) {
                if (verbose)
                    ShowProgress(++counter);

                size_t offset = *it;

                // Fill the array with the subnodes of the element
                // Each child's span runs from its offset up to the start of the
                // previously recorded sibling (children are laid out contiguously).
                TNode node(trie.Data, offset, trie.SkipFunction);
                size_t end = offset + curlen;
                if (size_t rightOffset = node.GetRightOffset()) {
                    AddPiece(subtries, rightOffset, end - rightOffset);
                    end = rightOffset;
                }
                if (size_t leftOffset = node.GetLeftOffset()) {
                    AddPiece(subtries, leftOffset, end - leftOffset);
                    end = leftOffset;
                }
                if (size_t forwardOffset = node.GetForwardOffset()) {
                    AddPiece(subtries, forwardOffset, end - forwardOffset);
                }

                while (++it != batch.end()) {
                    // Find next different; until then, just add the offsets to the list of merged nodes.
                    size_t nextoffset = *it;

                    if (memcmp(trie.Data + offset, trie.Data + nextoffset, curlen))
                        break;

                    merger.Add(nextoffset, offset);
                }
            }

            subtries.erase(curlen);
        }
        if (verbose) {
            Cerr << counter << Endl;
        }
        return merger;
    }

    // Entry point: minimize `trie` and write the repacked result to `os`.
    // Returns the number of bytes written, or 0 for an empty trie.
    size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode) {
        if (!trie.Data || !trie.Length) {
            return 0;
        }

        TOffsetMap merger = FindEquivalentSubtries(trie, verbose, minMergeSize);
        TMergingReverseNodeEnumerator enumerator(trie, merger);

        if (mode == MM_DEFAULT)
            return WriteTrieBackwards(os, enumerator, verbose);
        else
            return WriteTrieBackwardsNoAlloc(os, enumerator, trie, mode);
    }
}