author      Anton Samokhvalov <pg83@yandex.ru>           2022-02-10 16:45:15 +0300
committer   Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:45:15 +0300
commit      72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch)
tree        da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/containers/comptrie/minimize.cpp
parent      778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff)
download    ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/containers/comptrie/minimize.cpp')
-rw-r--r--  library/cpp/containers/comptrie/minimize.cpp  678
1 file changed, 339 insertions(+), 339 deletions(-)
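
The code re-annotated below minimizes a trie by merging equal subtrees: equal-sized byte ranges of the serialized trie are sorted by content and every duplicate is redirected to a single canonical copy. As a rough standalone illustration of that merging step, here is a sketch in plain C++; MergeDuplicatePieces and everything else in it is hypothetical, not the library's API:

    #include <algorithm>
    #include <cstddef>
    #include <cstring>
    #include <unordered_map>
    #include <vector>

    // Map every duplicate piece of length `len` to its first (canonical) copy,
    // comparing raw bytes with memcmp the same way FindEquivalentSubtries does.
    std::unordered_map<size_t, size_t> MergeDuplicatePieces(
            const char* data, std::vector<size_t> offsets, size_t len) {
        // Sort so that pieces with identical bytes become adjacent.
        std::sort(offsets.begin(), offsets.end(), [&](size_t a, size_t b) {
            int res = std::memcmp(data + a, data + b, len);
            return res ? res < 0 : a < b;
        });
        std::unordered_map<size_t, size_t> merged; // duplicate -> canonical offset
        for (size_t i = 1; i < offsets.size(); ++i) {
            if (std::memcmp(data + offsets[i - 1], data + offsets[i], len) == 0) {
                size_t prev = offsets[i - 1];
                merged[offsets[i]] = merged.count(prev) ? merged[prev] : prev;
            }
        }
        return merged;
    }
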
diff --git a/library/cpp/containers/comptrie/minimize.cpp b/library/cpp/containers/comptrie/minimize.cpp
index 80d0b25217..f53511428f 100644
--- a/library/cpp/containers/comptrie/minimize.cpp
+++ b/library/cpp/containers/comptrie/minimize.cpp
@@ -8,352 +8,352 @@
#include <util/generic/algorithm.h>
namespace NCompactTrie {
-    // Minimize the trie. The result is equivalent to the original
-    // trie, except that it takes less space (and has marginally lower
-    // performance, because of the epsilon links that may be introduced).
- // The algorithm is as follows: starting from the largest pieces, we find
- // nodes that have identical continuations (Daciuk's right language),
- // and repack the trie. Repacking is done in-place, so memory is less
- // of an issue; however, it may take considerable time.
-
-    // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
-    // Because of their non-local structure and epsilon links, it won't work
-    // as you expect it to, and may corrupt the trie in the process.
-
- namespace {
- using TOffsetList = TVector<size_t>;
- using TPieceIndex = THashMap<size_t, TOffsetList>;
-
- using TSizePair = std::pair<size_t, size_t>;
- using TSizePairVector = TVector<TSizePair>;
- using TSizePairVectorVector = TVector<TSizePairVector>;
-
- class TOffsetMap {
- protected:
- TSizePairVectorVector Data;
-
- public:
- TOffsetMap() {
- Data.reserve(0x10000);
+    // Minimize the trie. The result is equivalent to the original
+    // trie, except that it takes less space (and has marginally lower
+    // performance, because of the epsilon links that may be introduced).
+ // The algorithm is as follows: starting from the largest pieces, we find
+ // nodes that have identical continuations (Daciuk's right language),
+ // and repack the trie. Repacking is done in-place, so memory is less
+ // of an issue; however, it may take considerable time.
+
+    // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
+    // Because of their non-local structure and epsilon links, it won't work
+    // as you expect it to, and may corrupt the trie in the process.
+
+ namespace {
+ using TOffsetList = TVector<size_t>;
+ using TPieceIndex = THashMap<size_t, TOffsetList>;
+
+ using TSizePair = std::pair<size_t, size_t>;
+ using TSizePairVector = TVector<TSizePair>;
+ using TSizePairVectorVector = TVector<TSizePairVector>;
+
+ class TOffsetMap {
+ protected:
+ TSizePairVectorVector Data;
+
+ public:
+ TOffsetMap() {
+ Data.reserve(0x10000);
+ }
+
+ void Add(size_t key, size_t value) {
+ size_t hikey = key & 0xFFFF;
+
+ if (Data.size() <= hikey)
+ Data.resize(hikey + 1);
+
+ TSizePairVector& sublist = Data[hikey];
+
+ for (auto& it : sublist) {
+ if (it.first == key) {
+ it.second = value;
+
+ return;
+ }
+ }
+
+ sublist.push_back(TSizePair(key, value));
+ }
+
+ bool Contains(size_t key) const {
+ return (Get(key) != 0);
}
-
- void Add(size_t key, size_t value) {
- size_t hikey = key & 0xFFFF;
-
- if (Data.size() <= hikey)
- Data.resize(hikey + 1);
-
- TSizePairVector& sublist = Data[hikey];
-
- for (auto& it : sublist) {
- if (it.first == key) {
- it.second = value;
-
- return;
- }
- }
-
- sublist.push_back(TSizePair(key, value));
- }
-
- bool Contains(size_t key) const {
- return (Get(key) != 0);
- }
-
- size_t Get(size_t key) const {
- size_t hikey = key & 0xFFFF;
-
- if (Data.size() <= hikey)
- return 0;
-
- const TSizePairVector& sublist = Data[hikey];
-
- for (const auto& it : sublist) {
- if (it.first == key)
- return it.second;
- }
-
- return 0;
- }
- };
-
- class TOffsetDeltas {
- protected:
- TSizePairVector Data;
-
- public:
- void Add(size_t key, size_t value) {
- if (Data.empty()) {
- if (key == value)
- return; // no offset
- } else {
- TSizePair last = Data.back();
-
- if (key <= last.first) {
-                        Cerr << "Trouble: elements added to the offset delta list in the wrong order" << Endl;
-
- return;
- }
-
- if (last.first + value == last.second + key)
- return; // same offset
- }
-
- Data.push_back(TSizePair(key, value));
- }
-
- size_t Get(size_t key) const {
- if (Data.empty())
-                    return key; // the difference is zero
-
- if (key < Data.front().first)
- return key;
-
- // Binary search for the highest entry in the list that does not exceed the key
- size_t from = 0;
- size_t to = Data.size() - 1;
-
- while (from < to) {
- size_t midpoint = (from + to + 1) / 2;
-
- if (key < Data[midpoint].first)
- to = midpoint - 1;
- else
- from = midpoint;
- }
-
- TSizePair entry = Data[from];
-
- return key - entry.first + entry.second;
- }
- };
-
- class TPieceComparer {
- private:
- const char* Data;
- const size_t Length;
-
- public:
- TPieceComparer(const char* buf, size_t len)
- : Data(buf)
- , Length(len)
- {
+
+ size_t Get(size_t key) const {
+ size_t hikey = key & 0xFFFF;
+
+ if (Data.size() <= hikey)
+ return 0;
+
+ const TSizePairVector& sublist = Data[hikey];
+
+ for (const auto& it : sublist) {
+ if (it.first == key)
+ return it.second;
+ }
+
+ return 0;
+ }
+ };
+
+ class TOffsetDeltas {
+ protected:
+ TSizePairVector Data;
+
+ public:
+ void Add(size_t key, size_t value) {
+ if (Data.empty()) {
+ if (key == value)
+ return; // no offset
+ } else {
+ TSizePair last = Data.back();
+
+ if (key <= last.first) {
+                        Cerr << "Trouble: elements added to the offset delta list in the wrong order" << Endl;
+
+ return;
+ }
+
+ if (last.first + value == last.second + key)
+ return; // same offset
+ }
+
+ Data.push_back(TSizePair(key, value));
}
-
- bool operator()(size_t p1, const size_t p2) {
- int res = memcmp(Data + p1, Data + p2, Length);
-
- if (res)
- return (res > 0);
-
- return (p1 > p2); // the pieces are sorted in the reverse order of appearance
- }
- };
-
- struct TBranchPoint {
- TNode Node;
- int Selector;
-
- public:
- TBranchPoint()
- : Selector(0)
- {
+
+ size_t Get(size_t key) const {
+ if (Data.empty())
+                return key; // the difference is zero
+
+ if (key < Data.front().first)
+ return key;
+
+ // Binary search for the highest entry in the list that does not exceed the key
+ size_t from = 0;
+ size_t to = Data.size() - 1;
+
+ while (from < to) {
+ size_t midpoint = (from + to + 1) / 2;
+
+ if (key < Data[midpoint].first)
+ to = midpoint - 1;
+ else
+ from = midpoint;
+ }
+
+ TSizePair entry = Data[from];
+
+ return key - entry.first + entry.second;
+ }
+ };
+
+ class TPieceComparer {
+ private:
+ const char* Data;
+ const size_t Length;
+
+ public:
+ TPieceComparer(const char* buf, size_t len)
+ : Data(buf)
+ , Length(len)
+ {
+ }
+
+ bool operator()(size_t p1, const size_t p2) {
+ int res = memcmp(Data + p1, Data + p2, Length);
+
+ if (res)
+ return (res > 0);
+
+ return (p1 > p2); // the pieces are sorted in the reverse order of appearance
+ }
+ };
+
+ struct TBranchPoint {
+ TNode Node;
+ int Selector;
+
+ public:
+ TBranchPoint()
+ : Selector(0)
+ {
+ }
+
+ TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction)
+ : Node(data, offset, skipFunction)
+ , Selector(0)
+ {
+ }
+
+ bool IsFinal() const {
+ return Node.IsFinal();
+ }
+
+            // NextNode returns the child nodes one at a time, in reverse order: Right, then Left, then Forward
+ size_t NextNode(const TOffsetMap& mergedNodes) {
+ while (Selector < 3) {
+ size_t nextOffset = 0;
+
+ switch (++Selector) {
+ case 1:
+ if (Node.GetRightOffset())
+ nextOffset = Node.GetRightOffset();
+ break;
+
+ case 2:
+ if (Node.GetLeftOffset())
+ nextOffset = Node.GetLeftOffset();
+ break;
+
+ case 3:
+ if (Node.GetForwardOffset())
+ nextOffset = Node.GetForwardOffset();
+ break;
+
+ default:
+ break;
+ }
+
+ if (nextOffset && !mergedNodes.Contains(nextOffset))
+ return nextOffset;
+ }
+ return 0;
}
-
- TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction)
- : Node(data, offset, skipFunction)
- , Selector(0)
- {
- }
-
- bool IsFinal() const {
- return Node.IsFinal();
- }
-
-            // NextNode returns the child nodes one at a time, in reverse order: Right, then Left, then Forward
- size_t NextNode(const TOffsetMap& mergedNodes) {
- while (Selector < 3) {
- size_t nextOffset = 0;
-
- switch (++Selector) {
- case 1:
- if (Node.GetRightOffset())
- nextOffset = Node.GetRightOffset();
- break;
-
- case 2:
- if (Node.GetLeftOffset())
- nextOffset = Node.GetLeftOffset();
- break;
-
- case 3:
- if (Node.GetForwardOffset())
- nextOffset = Node.GetForwardOffset();
- break;
-
- default:
- break;
- }
-
- if (nextOffset && !mergedNodes.Contains(nextOffset))
- return nextOffset;
- }
- return 0;
- }
- };
-
- class TMergingReverseNodeEnumerator: public TReverseNodeEnumerator {
- private:
- bool Fresh;
- TOpaqueTrie Trie;
- const TOffsetMap& MergeMap;
- TVector<TBranchPoint> Trace;
- TOffsetDeltas OffsetIndex;
-
- public:
- TMergingReverseNodeEnumerator(const TOpaqueTrie& trie, const TOffsetMap& mergers)
- : Fresh(true)
- , Trie(trie)
- , MergeMap(mergers)
- {
- }
-
- bool Move() override {
- if (Fresh) {
- Trace.push_back(TBranchPoint(Trie.Data, 0, Trie.SkipFunction));
- Fresh = false;
- } else {
- if (Trace.empty())
- return false;
-
- Trace.pop_back();
-
- if (Trace.empty())
- return false;
- }
-
- size_t nextnode = Trace.back().NextNode(MergeMap);
-
- while (nextnode) {
- Trace.push_back(TBranchPoint(Trie.Data, nextnode, Trie.SkipFunction));
- nextnode = Trace.back().NextNode(MergeMap);
- }
-
- return (!Trace.empty());
- }
-
- const TNode& Get() const {
- return Trace.back().Node;
- }
- size_t GetLeafLength() const override {
- return Get().GetLeafLength();
- }
-
- // Returns recalculated offset from the end of the current node
- size_t PrepareOffset(size_t absoffset, size_t minilength) {
- if (!absoffset)
- return NPOS;
-
- if (MergeMap.Contains(absoffset))
- absoffset = MergeMap.Get(absoffset);
- return minilength - OffsetIndex.Get(Trie.Length - absoffset);
- }
-
- size_t RecreateNode(char* buffer, size_t resultLength) override {
- TWriteableNode newNode(Get(), Trie.Data);
- newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength);
- newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength);
- newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength);
-
- if (!buffer)
- return newNode.Measure();
-
- const size_t len = newNode.Pack(buffer);
- OffsetIndex.Add(Trie.Length - Get().GetOffset(), resultLength + len);
-
- return len;
- }
- };
-
+ };
+
+ class TMergingReverseNodeEnumerator: public TReverseNodeEnumerator {
+ private:
+ bool Fresh;
+ TOpaqueTrie Trie;
+ const TOffsetMap& MergeMap;
+ TVector<TBranchPoint> Trace;
+ TOffsetDeltas OffsetIndex;
+
+ public:
+ TMergingReverseNodeEnumerator(const TOpaqueTrie& trie, const TOffsetMap& mergers)
+ : Fresh(true)
+ , Trie(trie)
+ , MergeMap(mergers)
+ {
+ }
+
+ bool Move() override {
+ if (Fresh) {
+ Trace.push_back(TBranchPoint(Trie.Data, 0, Trie.SkipFunction));
+ Fresh = false;
+ } else {
+ if (Trace.empty())
+ return false;
+
+ Trace.pop_back();
+
+ if (Trace.empty())
+ return false;
+ }
+
+ size_t nextnode = Trace.back().NextNode(MergeMap);
+
+ while (nextnode) {
+ Trace.push_back(TBranchPoint(Trie.Data, nextnode, Trie.SkipFunction));
+ nextnode = Trace.back().NextNode(MergeMap);
+ }
+
+ return (!Trace.empty());
+ }
+
+ const TNode& Get() const {
+ return Trace.back().Node;
+ }
+ size_t GetLeafLength() const override {
+ return Get().GetLeafLength();
+ }
+
+ // Returns recalculated offset from the end of the current node
+ size_t PrepareOffset(size_t absoffset, size_t minilength) {
+ if (!absoffset)
+ return NPOS;
+
+ if (MergeMap.Contains(absoffset))
+ absoffset = MergeMap.Get(absoffset);
+ return minilength - OffsetIndex.Get(Trie.Length - absoffset);
+ }
+
+ size_t RecreateNode(char* buffer, size_t resultLength) override {
+ TWriteableNode newNode(Get(), Trie.Data);
+ newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength);
+ newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength);
+ newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength);
+
+ if (!buffer)
+ return newNode.Measure();
+
+ const size_t len = newNode.Pack(buffer);
+ OffsetIndex.Add(Trie.Length - Get().GetOffset(), resultLength + len);
+
+ return len;
+ }
+ };
+
}
- static void AddPiece(TPieceIndex& index, size_t offset, size_t len) {
- index[len].push_back(offset);
- }
-
- static TOffsetMap FindEquivalentSubtries(const TOpaqueTrie& trie, bool verbose, size_t minMergeSize) {
-        // Trie nodes, arranged by span length.
-        // Once all nodes of a given size have been processed, they are erased from the index.
- TPieceIndex subtries;
- TOffsetMap merger;
- // Start walking the trie from head.
- AddPiece(subtries, 0, trie.Length);
-
- size_t counter = 0;
- // Now consider all nodes with sizeable continuations
- for (size_t curlen = trie.Length; curlen >= minMergeSize && !subtries.empty(); curlen--) {
- TPieceIndex::iterator iit = subtries.find(curlen);
-
- if (iit == subtries.end())
- continue; // fast forward to the next available length value
-
- TOffsetList& batch = iit->second;
- TPieceComparer comparer(trie.Data, curlen);
- Sort(batch.begin(), batch.end(), comparer);
-
- TOffsetList::iterator it = batch.begin();
- while (it != batch.end()) {
- if (verbose)
- ShowProgress(++counter);
-
- size_t offset = *it;
-
-                // Register the subnodes of this element in the piece index
- TNode node(trie.Data, offset, trie.SkipFunction);
- size_t end = offset + curlen;
- if (size_t rightOffset = node.GetRightOffset()) {
- AddPiece(subtries, rightOffset, end - rightOffset);
- end = rightOffset;
- }
- if (size_t leftOffset = node.GetLeftOffset()) {
- AddPiece(subtries, leftOffset, end - leftOffset);
- end = leftOffset;
- }
- if (size_t forwardOffset = node.GetForwardOffset()) {
- AddPiece(subtries, forwardOffset, end - forwardOffset);
- }
-
- while (++it != batch.end()) {
-                    // Find the next differing piece; until then, just add the offsets to the list of merged nodes.
- size_t nextoffset = *it;
-
- if (memcmp(trie.Data + offset, trie.Data + nextoffset, curlen))
- break;
-
- merger.Add(nextoffset, offset);
- }
- }
-
- subtries.erase(curlen);
- }
- if (verbose) {
- Cerr << counter << Endl;
+ static void AddPiece(TPieceIndex& index, size_t offset, size_t len) {
+ index[len].push_back(offset);
+ }
+
+ static TOffsetMap FindEquivalentSubtries(const TOpaqueTrie& trie, bool verbose, size_t minMergeSize) {
+        // Trie nodes, arranged by span length.
+        // Once all nodes of a given size have been processed, they are erased from the index.
+ TPieceIndex subtries;
+ TOffsetMap merger;
+ // Start walking the trie from head.
+ AddPiece(subtries, 0, trie.Length);
+
+ size_t counter = 0;
+ // Now consider all nodes with sizeable continuations
+ for (size_t curlen = trie.Length; curlen >= minMergeSize && !subtries.empty(); curlen--) {
+ TPieceIndex::iterator iit = subtries.find(curlen);
+
+ if (iit == subtries.end())
+ continue; // fast forward to the next available length value
+
+ TOffsetList& batch = iit->second;
+ TPieceComparer comparer(trie.Data, curlen);
+ Sort(batch.begin(), batch.end(), comparer);
+
+ TOffsetList::iterator it = batch.begin();
+ while (it != batch.end()) {
+ if (verbose)
+ ShowProgress(++counter);
+
+ size_t offset = *it;
+
+                // Register the subnodes of this element in the piece index
+ TNode node(trie.Data, offset, trie.SkipFunction);
+ size_t end = offset + curlen;
+ if (size_t rightOffset = node.GetRightOffset()) {
+ AddPiece(subtries, rightOffset, end - rightOffset);
+ end = rightOffset;
+ }
+ if (size_t leftOffset = node.GetLeftOffset()) {
+ AddPiece(subtries, leftOffset, end - leftOffset);
+ end = leftOffset;
+ }
+ if (size_t forwardOffset = node.GetForwardOffset()) {
+ AddPiece(subtries, forwardOffset, end - forwardOffset);
+ }
+
+ while (++it != batch.end()) {
+                    // Find the next differing piece; until then, just add the offsets to the list of merged nodes.
+ size_t nextoffset = *it;
+
+ if (memcmp(trie.Data + offset, trie.Data + nextoffset, curlen))
+ break;
+
+ merger.Add(nextoffset, offset);
+ }
+ }
+
+ subtries.erase(curlen);
}
- return merger;
+ if (verbose) {
+ Cerr << counter << Endl;
+ }
+ return merger;
}
-
- size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode) {
- if (!trie.Data || !trie.Length) {
- return 0;
- }
-
- TOffsetMap merger = FindEquivalentSubtries(trie, verbose, minMergeSize);
- TMergingReverseNodeEnumerator enumerator(trie, merger);
-
- if (mode == MM_DEFAULT)
- return WriteTrieBackwards(os, enumerator, verbose);
- else
- return WriteTrieBackwardsNoAlloc(os, enumerator, trie, mode);
+
+ size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode) {
+ if (!trie.Data || !trie.Length) {
+ return 0;
+ }
+
+ TOffsetMap merger = FindEquivalentSubtries(trie, verbose, minMergeSize);
+ TMergingReverseNodeEnumerator enumerator(trie, merger);
+
+ if (mode == MM_DEFAULT)
+ return WriteTrieBackwards(os, enumerator, verbose);
+ else
+ return WriteTrieBackwardsNoAlloc(os, enumerator, trie, mode);
}
}
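
The offset remapping performed by TOffsetDeltas above can be reproduced in isolation: a breakpoint (oldPos, newPos) is recorded only where the shift between old and new positions changes, and a lookup binary-searches for the last breakpoint not exceeding the key, then applies that breakpoint's shift. A minimal standalone sketch, with class and member names that are illustrative rather than the library's:

    #include <cstddef>
    #include <utility>
    #include <vector>

    class OffsetDeltas {
        std::vector<std::pair<size_t, size_t>> Breaks; // (old, new), ascending by old
    public:
        void Add(size_t oldPos, size_t newPos) {
            // Record a breakpoint only when the shift actually changes.
            if (Breaks.empty()
                    ? oldPos == newPos
                    : Breaks.back().second + oldPos == Breaks.back().first + newPos)
                return;
            Breaks.emplace_back(oldPos, newPos);
        }
        size_t Get(size_t key) const {
            if (Breaks.empty() || key < Breaks.front().first)
                return key; // before the first breakpoint nothing has moved
            // Find the last breakpoint with old position <= key.
            size_t lo = 0, hi = Breaks.size() - 1;
            while (lo < hi) { // invariant: Breaks[lo].first <= key
                size_t mid = (lo + hi + 1) / 2;
                if (key < Breaks[mid].first)
                    hi = mid - 1;
                else
                    lo = mid;
            }
            return key - Breaks[lo].first + Breaks[lo].second;
        }
    };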
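
Similarly, TOffsetMap is a fixed-fanout open hash: keys are bucketed by their low 16 bits (despite the "hikey" name in the source), each bucket is a short vector scanned linearly, and the value 0 doubles as "absent", which is safe because a real node offset is never 0. A hypothetical standalone equivalent:

    #include <cstddef>
    #include <utility>
    #include <vector>

    class OffsetMap {
        std::vector<std::vector<std::pair<size_t, size_t>>> Buckets;
    public:
        OffsetMap() { Buckets.reserve(0x10000); }
        void Add(size_t key, size_t value) {
            size_t b = key & 0xFFFF; // bucket by the low 16 bits of the key
            if (Buckets.size() <= b)
                Buckets.resize(b + 1);
            for (auto& kv : Buckets[b])
                if (kv.first == key) { kv.second = value; return; }
            Buckets[b].emplace_back(key, value);
        }
        size_t Get(size_t key) const {
            size_t b = key & 0xFFFF;
            if (Buckets.size() <= b)
                return 0;
            for (const auto& kv : Buckets[b])
                if (kv.first == key)
                    return kv.second;
            return 0; // 0 means "not found"
        }
    };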