diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:17 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:17 +0300 |
commit | d3a398281c6fd1d3672036cb2d63f842d2cb28c5 (patch) | |
tree | dd4bd3ca0f36b817e96812825ffaf10d645803f2 /library/cpp/containers/comptrie | |
parent | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (diff) | |
download | ydb-d3a398281c6fd1d3672036cb2d63f842d2cb28c5.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/containers/comptrie')
30 files changed, 1963 insertions, 1963 deletions
diff --git a/library/cpp/containers/comptrie/array_with_size.h b/library/cpp/containers/comptrie/array_with_size.h index 148200fe6a..36e61c7410 100644 --- a/library/cpp/containers/comptrie/array_with_size.h +++ b/library/cpp/containers/comptrie/array_with_size.h @@ -3,19 +3,19 @@ #include <util/generic/ptr.h> #include <util/generic/noncopyable.h> #include <util/generic/utility.h> -#include <util/system/sys_alloc.h> +#include <util/system/sys_alloc.h> template <typename T> -class TArrayWithSizeHolder : TNonCopyable { +class TArrayWithSizeHolder : TNonCopyable { typedef TArrayWithSizeHolder<T> TThis; T* Data; public: - TArrayWithSizeHolder() - : Data(nullptr) - { - } + TArrayWithSizeHolder() + : Data(nullptr) + { + } ~TArrayWithSizeHolder() { if (!Data) @@ -26,7 +26,7 @@ public: } catch (...) { } } - y_deallocate(((size_t*)Data) - 1); + y_deallocate(((size_t*)Data) - 1); } void Swap(TThis& copy) { @@ -37,7 +37,7 @@ public: if (newSize == Size()) return; TThis copy; - copy.Data = (T*)(((size_t*)y_allocate(sizeof(size_t) + sizeof(T) * newSize)) + 1); + copy.Data = (T*)(((size_t*)y_allocate(sizeof(size_t) + sizeof(T) * newSize)) + 1); // does not handle constructor exceptions properly for (size_t i = 0; i < Min(Size(), newSize); ++i) { new (copy.Data + i) T(Data[i]); @@ -45,12 +45,12 @@ public: for (size_t i = Min(Size(), newSize); i < newSize; ++i) { new (copy.Data + i) T; } - ((size_t*)copy.Data)[-1] = newSize; + ((size_t*)copy.Data)[-1] = newSize; Swap(copy); } size_t Size() const { - return Data ? ((size_t*)Data)[-1] : 0; + return Data ? ((size_t*)Data)[-1] : 0; } bool Empty() const { diff --git a/library/cpp/containers/comptrie/chunked_helpers_trie.h b/library/cpp/containers/comptrie/chunked_helpers_trie.h index 6efe55b5f0..cfa35f5ba2 100644 --- a/library/cpp/containers/comptrie/chunked_helpers_trie.h +++ b/library/cpp/containers/comptrie/chunked_helpers_trie.h @@ -13,7 +13,7 @@ public: : Trie(blob) { } - + bool Has(const char* key) const { return Trie.Find(key, strlen(key)); } @@ -23,7 +23,7 @@ public: } }; -template <bool sorted = false> +template <bool sorted = false> class TTrieSetWriter { private: TCompactTrieBuilder<char> Builder; @@ -57,24 +57,24 @@ public: } }; -template <bool isWriter, bool sorted = false> +template <bool isWriter, bool sorted = false> struct TTrieSetG; -template <bool sorted> +template <bool sorted> struct TTrieSetG<false, sorted> { typedef TTrieSet T; }; -template <bool sorted> +template <bool sorted> struct TTrieSetG<true, sorted> { typedef TTrieSetWriter<sorted> T; }; -template <typename T> +template <typename T> class TTrieMap { private: TCompactTrie<char> Trie; - static_assert(sizeof(T) <= sizeof(ui64), "expect sizeof(T) <= sizeof(ui64)"); + static_assert(sizeof(T) <= sizeof(ui64), "expect sizeof(T) <= sizeof(ui64)"); public: TTrieMap(const TBlob& blob) @@ -101,17 +101,17 @@ public: } } - const TCompactTrie<char>& GetTrie() const { + const TCompactTrie<char>& GetTrie() const { return Trie; } }; -template <typename T, bool sorted = false> +template <typename T, bool sorted = false> class TTrieMapWriter { private: typedef TCompactTrieBuilder<char> TBuilder; TBuilder Builder; - static_assert(sizeof(T) <= sizeof(ui64), "expect sizeof(T) <= sizeof(ui64)"); + static_assert(sizeof(T) <= sizeof(ui64), "expect sizeof(T) <= sizeof(ui64)"); #ifndef NDEBUG bool IsSorted; #endif @@ -130,12 +130,12 @@ public: memcpy(&intValue, &value, sizeof(T)); Builder.Add(key, strlen(key), intValue); #ifndef NDEBUG - /* + /* if (!IsSorted) { T test; assert(Get(key, &test) && value == test); } - */ + */ #endif } @@ -177,7 +177,7 @@ public: } }; -template <typename T> +template <typename T> class TTrieSortedMapWriter { private: typedef std::pair<TString, T> TValue; @@ -204,15 +204,15 @@ public: } }; -template <typename X, bool isWriter, bool sorted = false> +template <typename X, bool isWriter, bool sorted = false> struct TTrieMapG; -template <typename X, bool sorted> +template <typename X, bool sorted> struct TTrieMapG<X, false, sorted> { typedef TTrieMap<X> T; }; -template <typename X, bool sorted> +template <typename X, bool sorted> struct TTrieMapG<X, true, sorted> { typedef TTrieMapWriter<X, sorted> T; }; diff --git a/library/cpp/containers/comptrie/comptrie.cpp b/library/cpp/containers/comptrie/comptrie.cpp index 86a006a231..4556e5b571 100644 --- a/library/cpp/containers/comptrie/comptrie.cpp +++ b/library/cpp/containers/comptrie/comptrie.cpp @@ -1,8 +1,8 @@ -#include "comptrie_impl.h" -#include "comptrie.h" -#include "array_with_size.h" -#include "comptrie_trie.h" -#include "comptrie_builder.h" -#include "protopacker.h" -#include "set.h" -#include "chunked_helpers_trie.h" +#include "comptrie_impl.h" +#include "comptrie.h" +#include "array_with_size.h" +#include "comptrie_trie.h" +#include "comptrie_builder.h" +#include "protopacker.h" +#include "set.h" +#include "chunked_helpers_trie.h" diff --git a/library/cpp/containers/comptrie/comptrie_builder.h b/library/cpp/containers/comptrie/comptrie_builder.h index e0364eef9f..cf7d2e39a3 100644 --- a/library/cpp/containers/comptrie/comptrie_builder.h +++ b/library/cpp/containers/comptrie/comptrie_builder.h @@ -26,18 +26,18 @@ enum ECompactTrieBuilderFlags { using TCompactTrieBuilderFlags = ECompactTrieBuilderFlags; -inline TCompactTrieBuilderFlags operator|(TCompactTrieBuilderFlags first, TCompactTrieBuilderFlags second) { +inline TCompactTrieBuilderFlags operator|(TCompactTrieBuilderFlags first, TCompactTrieBuilderFlags second) { return static_cast<TCompactTrieBuilderFlags>(static_cast<int>(first) | second); } -inline TCompactTrieBuilderFlags& operator|=(TCompactTrieBuilderFlags& first, TCompactTrieBuilderFlags second) { +inline TCompactTrieBuilderFlags& operator|=(TCompactTrieBuilderFlags& first, TCompactTrieBuilderFlags second) { return first = first | second; } template <typename T> class TArrayWithSizeHolder; -template <class T = char, class D = ui64, class S = TCompactTriePacker<D>> +template <class T = char, class D = ui64, class S = TCompactTriePacker<D>> class TCompactTrieBuilder { public: typedef T TSymbol; @@ -92,12 +92,12 @@ public: size_t GetEntryCount() const; size_t GetNodeCount() const; - + // Exact output file size in bytes. size_t MeasureByteSize() const { return Impl->MeasureByteSize(); } - + protected: class TCompactTrieBuilderImpl; THolder<TCompactTrieBuilderImpl> Impl; diff --git a/library/cpp/containers/comptrie/comptrie_builder.inl b/library/cpp/containers/comptrie/comptrie_builder.inl index e19f005d7c..f273fa6571 100644 --- a/library/cpp/containers/comptrie/comptrie_builder.inl +++ b/library/cpp/containers/comptrie/comptrie_builder.inl @@ -27,7 +27,7 @@ template <class T, class D, class S> class TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl { protected: - TMemoryPool Pool; + TMemoryPool Pool; size_t PayloadSize; THolder<TFixedSizeAllocator> NodeAllocator; class TNode; @@ -138,7 +138,7 @@ public: Y_ASSERT(reinterpret_cast<ISubtree*>(this) == static_cast<void*>(this)); // This assumption is used in TNode::Subtree() } - iterator Find(char ch); + iterator Find(char ch); const_iterator Find(char ch) const; void Add(const TBlob& s, TNode* node); @@ -422,8 +422,8 @@ public: template <class T, class D, class S> TCompactTrieBuilder<T, D, S>::TCompactTrieBuilder(TCompactTrieBuilderFlags flags, TPacker packer, IAllocator* alloc) : Impl(new TCompactTrieBuilderImpl(flags, packer, alloc)) -{ -} +{ +} template <class T, class D, class S> bool TCompactTrieBuilder<T, D, S>::Add(const TSymbol* key, size_t keylen, const TData& value) { @@ -742,7 +742,7 @@ bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::FindLongestPrefixImp template <class T, class D, class S> void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::Clear() { DestroyNode(Root); - Pool.Clear(); + Pool.Clear(); NodeAllocator.Reset(new TFixedSizeAllocator(sizeof(TNode) + PayloadSize, TDefaultAllocator::Instance())); Root = new (*NodeAllocator) TNode; EntryCount = 0; @@ -980,11 +980,11 @@ ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::ArcSaveAndDestroy(co template <class T, class D, class S> typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::iterator TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::Find(char ch) { - using namespace NCompTriePrivate; - iterator it = LowerBound(this->begin(), this->end(), ch, TCmp()); - + using namespace NCompTriePrivate; + iterator it = LowerBound(this->begin(), this->end(), ch, TCmp()); + if (it != this->end() && it->Label[0] == (unsigned char)ch) { - return it; + return it; } return this->end(); @@ -1005,7 +1005,7 @@ typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet:: template <class T, class D, class S> void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::Add(const TBlob& s, TNode* node) { - using namespace NCompTriePrivate; + using namespace NCompTriePrivate; this->insert(LowerBound(this->begin(), this->end(), s[0], TCmp()), TArc(s, node)); } diff --git a/library/cpp/containers/comptrie/comptrie_impl.cpp b/library/cpp/containers/comptrie/comptrie_impl.cpp index a293b45d1c..a116ab6d1e 100644 --- a/library/cpp/containers/comptrie/comptrie_impl.cpp +++ b/library/cpp/containers/comptrie/comptrie_impl.cpp @@ -6,34 +6,34 @@ // Unpack the leaf value. The algorithm can store up to 8 full bytes in leafs. namespace NCompactTrie { - size_t MeasureOffset(size_t offset) { - int n = 0; - - while (offset) { - offset >>= 8; - ++n; - } - - return n; + size_t MeasureOffset(size_t offset) { + int n = 0; + + while (offset) { + offset >>= 8; + ++n; + } + + return n; } - - size_t PackOffset(char* buffer, size_t offset) { - size_t len = MeasureOffset(offset); - size_t i = len; - - while (i--) { - buffer[i] = (char)(offset & 0xFF); - offset >>= 8; - } - - return len; + + size_t PackOffset(char* buffer, size_t offset) { + size_t len = MeasureOffset(offset); + size_t i = len; + + while (i--) { + buffer[i] = (char)(offset & 0xFF); + offset >>= 8; + } + + return len; } - - void ShowProgress(size_t n) { - if (n % 1000000 == 0) + + void ShowProgress(size_t n) { + if (n % 1000000 == 0) Cerr << n << ", RSS=" << (TRusage::Get().MaxRss >> 20) << "mb" << Endl; - else if (n % 20000 == 0) - Cerr << "."; - } + else if (n % 20000 == 0) + Cerr << "."; + } } diff --git a/library/cpp/containers/comptrie/comptrie_impl.h b/library/cpp/containers/comptrie/comptrie_impl.h index f4b0bf072e..f41c38311a 100644 --- a/library/cpp/containers/comptrie/comptrie_impl.h +++ b/library/cpp/containers/comptrie/comptrie_impl.h @@ -9,10 +9,10 @@ // NCompactTrie namespace NCompactTrie { - const char MT_FINAL = '\x80'; - const char MT_NEXT = '\x40'; + const char MT_FINAL = '\x80'; + const char MT_NEXT = '\x40'; const char MT_SIZEMASK = '\x07'; - const size_t MT_LEFTSHIFT = 3; + const size_t MT_LEFTSHIFT = 3; const size_t MT_RIGHTSHIFT = 0; Y_FORCE_INLINE size_t UnpackOffset(const char* p, size_t len); @@ -72,22 +72,22 @@ namespace NCompTriePrivate { typedef TUtf32String TResult; }; -} +} -namespace NCompTriePrivate { - struct TCmp { - template <class T> - inline bool operator()(const T& l, const T& r) { +namespace NCompTriePrivate { + struct TCmp { + template <class T> + inline bool operator()(const T& l, const T& r) { return (unsigned char)(l.Label[0]) < (unsigned char)(r.Label[0]); - } - - template <class T> - inline bool operator()(const T& l, char r) { + } + + template <class T> + inline bool operator()(const T& l, char r) { return (unsigned char)(l.Label[0]) < (unsigned char)r; - } - }; -} - + } + }; +} + namespace NCompactTrie { static inline ui64 ArcSaveOffset(size_t offset, IOutputStream& os) { using namespace NCompactTrie; @@ -184,7 +184,7 @@ namespace NCompactTrie { // Returns true if the symbol was succesfully found in the trie, false otherwise. template <typename TSymbol, class TPacker> Y_FORCE_INLINE bool Advance(const char*& datapos, const char* const dataend, const char*& value, - TSymbol label, TPacker packer) { + TSymbol label, TPacker packer) { Y_ASSERT(datapos < dataend); char flags = MT_NEXT; for (int i = (int)ExtraBits<TSymbol>(); i >= 0; i -= 8) { diff --git a/library/cpp/containers/comptrie/comptrie_trie.h b/library/cpp/containers/comptrie/comptrie_trie.h index 1fa8794897..40ec1e52b3 100644 --- a/library/cpp/containers/comptrie/comptrie_trie.h +++ b/library/cpp/containers/comptrie/comptrie_trie.h @@ -29,7 +29,7 @@ template <class TTrie> class TPrefixIterator; // in case of <char> specialization cannot distinguish between "" and "\0" keys -template <class T = char, class D = ui64, class S = TCompactTriePacker<D>> +template <class T = char, class D = ui64, class S = TCompactTriePacker<D>> class TCompactTrie { public: typedef T TSymbol; @@ -56,12 +56,12 @@ public: TCompactTrie(const char* d, size_t len, TPacker packer); TCompactTrie(const char* d, size_t len) - : TCompactTrie{d, len, TPacker{}} { + : TCompactTrie{d, len, TPacker{}} { } TCompactTrie(const TBlob& data, TPacker packer); explicit TCompactTrie(const TBlob& data) - : TCompactTrie{data, TPacker{}} { + : TCompactTrie{data, TPacker{}} { } // Skipper should be initialized with &Packer, not with &other.Packer, so you have to redefine these. @@ -126,7 +126,7 @@ public: bool FindLongestPrefix(const TKeyBuf& key, size_t* prefixLen, TData* value = nullptr, bool* hasNext = nullptr) const { return FindLongestPrefix(key.data(), key.size(), prefixLen, value, hasNext); } - + // Return trie, containing all tails for the given key inline TCompactTrie<T, D, S> FindTails(const TSymbol* key, size_t keylen) const; TCompactTrie<T, D, S> FindTails(const TKeyBuf& key) const { @@ -146,17 +146,17 @@ public: typedef NCompactTrie::TOpaqueTrieIterator TOpaqueTrieIterator; typedef NCompactTrie::TOpaqueTrie TOpaqueTrie; friend class TCompactTrie; - TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, TPacker packer); // only usable from Begin() and End() methods + TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, TPacker packer); // only usable from Begin() and End() methods TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, const TKeyBuf& key, TPacker packer); // only usable from UpperBound() method public: TConstIterator() = default; - bool IsEmpty() const { - return !Impl; - } // Almost no other method can be called. + bool IsEmpty() const { + return !Impl; + } // Almost no other method can be called. - bool operator==(const TConstIterator& other) const; - bool operator!=(const TConstIterator& other) const; + bool operator==(const TConstIterator& other) const; + bool operator!=(const TConstIterator& other) const; TConstIterator& operator++(); TConstIterator operator++(int /*unused*/); TConstIterator& operator--(); @@ -205,8 +205,8 @@ protected: void LookupPhrases(const char* datapos, size_t len, const TSymbol* key, size_t keylen, TVector<TPhraseMatch>& matches, TSymbol separator) const; }; -template <class T = char, class D = ui64, class S = TCompactTriePacker<D>> -class TCompactTrieHolder: public TCompactTrie<T, D, S>, NNonCopyable::TNonCopyable { +template <class T = char, class D = ui64, class S = TCompactTriePacker<D>> +class TCompactTrieHolder: public TCompactTrie<T, D, S>, NNonCopyable::TNonCopyable { private: typedef TCompactTrie<T, D, S> TBase; TArrayHolder<char> Storage; @@ -239,35 +239,35 @@ TCompactTrie<T, D, S>::TCompactTrie(const char* d, size_t len, TPacker packer) template <class T, class D, class S> TCompactTrie<T, D, S>::TCompactTrie(const char* emptyValue) : EmptyValue(emptyValue) -{ -} +{ +} template <class T, class D, class S> TCompactTrie<T, D, S>::TCompactTrie(const TBlob& data, const char* emptyValue, TPacker packer) : DataHolder(data) , EmptyValue(emptyValue) , Packer(packer) -{ -} +{ +} template <class T, class D, class S> TCompactTrie<T, D, S>::TCompactTrie(const TCompactTrie& other) - : DataHolder(other.DataHolder) - , EmptyValue(other.EmptyValue) - , Packer(other.Packer) -{ -} + : DataHolder(other.DataHolder) + , EmptyValue(other.EmptyValue) + , Packer(other.Packer) +{ +} template <class T, class D, class S> TCompactTrie<T, D, S>::TCompactTrie(TCompactTrie&& other) noexcept - : DataHolder(std::move(other.DataHolder)) - , EmptyValue(std::move(other.EmptyValue)) - , Packer(std::move(other.Packer)) -{ -} + : DataHolder(std::move(other.DataHolder)) + , EmptyValue(std::move(other.EmptyValue)) + , Packer(std::move(other.Packer)) +{ +} template <class T, class D, class S> -TCompactTrie<T, D, S>& TCompactTrie<T, D, S>::operator=(const TCompactTrie& other) { +TCompactTrie<T, D, S>& TCompactTrie<T, D, S>::operator=(const TCompactTrie& other) { if (this != &other) { DataHolder = other.DataHolder; EmptyValue = other.EmptyValue; @@ -529,7 +529,7 @@ bool TCompactTrie<T, D, S>::LookupLongestPrefix(const TSymbol* key, size_t keyle template <class T, class D, class S> void TCompactTrie<T, D, S>::LookupPhrases( const char* datapos, size_t len, const TSymbol* key, size_t keylen, - TVector<TPhraseMatch>& matches, TSymbol separator) const { + TVector<TPhraseMatch>& matches, TSymbol separator) const { using namespace NCompactTrie; matches.clear(); @@ -572,10 +572,10 @@ TCompactTrieHolder<T, D, S>::TCompactTrieHolder(IInputStream& is, size_t len) template <class T, class D, class S> TCompactTrie<T, D, S>::TConstIterator::TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, TPacker packer) - : Packer(packer) - , Impl(new TOpaqueTrieIterator(trie, emptyValue, atend)) -{ -} + : Packer(packer) + , Impl(new TOpaqueTrieIterator(trie, emptyValue, atend)) +{ +} template <class T, class D, class S> TCompactTrie<T, D, S>::TConstIterator::TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, const TKeyBuf& key, TPacker packer) @@ -586,7 +586,7 @@ TCompactTrie<T, D, S>::TConstIterator::TConstIterator(const TOpaqueTrie& trie, c } template <class T, class D, class S> -bool TCompactTrie<T, D, S>::TConstIterator::operator==(const TConstIterator& other) const { +bool TCompactTrie<T, D, S>::TConstIterator::operator==(const TConstIterator& other) const { if (!Impl) return !other.Impl; if (!other.Impl) @@ -595,8 +595,8 @@ bool TCompactTrie<T, D, S>::TConstIterator::operator==(const TConstIterator& oth } template <class T, class D, class S> -bool TCompactTrie<T, D, S>::TConstIterator::operator!=(const TConstIterator& other) const { - return !operator==(other); +bool TCompactTrie<T, D, S>::TConstIterator::operator!=(const TConstIterator& other) const { + return !operator==(other); } template <class T, class D, class S> diff --git a/library/cpp/containers/comptrie/comptrie_ut.cpp b/library/cpp/containers/comptrie/comptrie_ut.cpp index b4f84b3e4b..74bee09b5d 100644 --- a/library/cpp/containers/comptrie/comptrie_ut.cpp +++ b/library/cpp/containers/comptrie/comptrie_ut.cpp @@ -1,12 +1,12 @@ -#include <util/random/shuffle.h> +#include <util/random/shuffle.h> #include <library/cpp/testing/unittest/registar.h> - + #include <util/stream/output.h> #include <utility> - -#include <util/charset/wide.h> + +#include <util/charset/wide.h> #include <util/generic/algorithm.h> -#include <util/generic/buffer.h> +#include <util/generic/buffer.h> #include <util/generic/map.h> #include <util/generic/vector.h> #include <util/generic/ptr.h> @@ -14,100 +14,100 @@ #include <util/folder/dirut.h> -#include <util/random/random.h> +#include <util/random/random.h> #include <util/random/fast.h> - + #include <util/string/hex.h> #include <util/string/cast.h> -#include "comptrie.h" +#include "comptrie.h" #include "set.h" #include "first_symbol_iterator.h" #include "search_iterator.h" #include "pattern_searcher.h" - + #include <array> #include <iterator> -class TCompactTrieTest: public TTestBase { +class TCompactTrieTest: public TTestBase { private: UNIT_TEST_SUITE(TCompactTrieTest); - UNIT_TEST(TestTrie8); - UNIT_TEST(TestTrie16); - UNIT_TEST(TestTrie32); - - UNIT_TEST(TestFastTrie8); - UNIT_TEST(TestFastTrie16); - UNIT_TEST(TestFastTrie32); - - UNIT_TEST(TestMinimizedTrie8); - UNIT_TEST(TestMinimizedTrie16); - UNIT_TEST(TestMinimizedTrie32); - - UNIT_TEST(TestFastMinimizedTrie8); - UNIT_TEST(TestFastMinimizedTrie16); - UNIT_TEST(TestFastMinimizedTrie32); - - UNIT_TEST(TestTrieIterator8); - UNIT_TEST(TestTrieIterator16); - UNIT_TEST(TestTrieIterator32); - - UNIT_TEST(TestMinimizedTrieIterator8); - UNIT_TEST(TestMinimizedTrieIterator16); - UNIT_TEST(TestMinimizedTrieIterator32); - - UNIT_TEST(TestPhraseSearch); - UNIT_TEST(TestAddGet); - UNIT_TEST(TestEmpty); - UNIT_TEST(TestUninitializedNonEmpty); - UNIT_TEST(TestRandom); - UNIT_TEST(TestFindTails); - UNIT_TEST(TestPrefixGrouped); - UNIT_TEST(CrashTestPrefixGrouped); + UNIT_TEST(TestTrie8); + UNIT_TEST(TestTrie16); + UNIT_TEST(TestTrie32); + + UNIT_TEST(TestFastTrie8); + UNIT_TEST(TestFastTrie16); + UNIT_TEST(TestFastTrie32); + + UNIT_TEST(TestMinimizedTrie8); + UNIT_TEST(TestMinimizedTrie16); + UNIT_TEST(TestMinimizedTrie32); + + UNIT_TEST(TestFastMinimizedTrie8); + UNIT_TEST(TestFastMinimizedTrie16); + UNIT_TEST(TestFastMinimizedTrie32); + + UNIT_TEST(TestTrieIterator8); + UNIT_TEST(TestTrieIterator16); + UNIT_TEST(TestTrieIterator32); + + UNIT_TEST(TestMinimizedTrieIterator8); + UNIT_TEST(TestMinimizedTrieIterator16); + UNIT_TEST(TestMinimizedTrieIterator32); + + UNIT_TEST(TestPhraseSearch); + UNIT_TEST(TestAddGet); + UNIT_TEST(TestEmpty); + UNIT_TEST(TestUninitializedNonEmpty); + UNIT_TEST(TestRandom); + UNIT_TEST(TestFindTails); + UNIT_TEST(TestPrefixGrouped); + UNIT_TEST(CrashTestPrefixGrouped); UNIT_TEST(TestMergeFromFile); UNIT_TEST(TestMergeFromBuffer); - UNIT_TEST(TestUnique); - UNIT_TEST(TestAddRetValue); - UNIT_TEST(TestClear); + UNIT_TEST(TestUnique); + UNIT_TEST(TestAddRetValue); + UNIT_TEST(TestClear); - UNIT_TEST(TestIterateEmptyKey); + UNIT_TEST(TestIterateEmptyKey); - UNIT_TEST(TestTrieSet); + UNIT_TEST(TestTrieSet); - UNIT_TEST(TestTrieForVectorInt64); - UNIT_TEST(TestTrieForListInt64); - UNIT_TEST(TestTrieForSetInt64); + UNIT_TEST(TestTrieForVectorInt64); + UNIT_TEST(TestTrieForListInt64); + UNIT_TEST(TestTrieForSetInt64); - UNIT_TEST(TestTrieForVectorStroka); - UNIT_TEST(TestTrieForListStroka); - UNIT_TEST(TestTrieForSetStroka); + UNIT_TEST(TestTrieForVectorStroka); + UNIT_TEST(TestTrieForListStroka); + UNIT_TEST(TestTrieForSetStroka); - UNIT_TEST(TestTrieForVectorWtroka); - UNIT_TEST(TestTrieForVectorFloat); - UNIT_TEST(TestTrieForVectorDouble); + UNIT_TEST(TestTrieForVectorWtroka); + UNIT_TEST(TestTrieForVectorFloat); + UNIT_TEST(TestTrieForVectorDouble); - UNIT_TEST(TestTrieForListVectorInt64); - UNIT_TEST(TestTrieForPairWtrokaVectorInt64); + UNIT_TEST(TestTrieForListVectorInt64); + UNIT_TEST(TestTrieForPairWtrokaVectorInt64); - UNIT_TEST(TestEmptyValueOutOfOrder); - UNIT_TEST(TestFindLongestPrefixWithEmptyValue); + UNIT_TEST(TestEmptyValueOutOfOrder); + UNIT_TEST(TestFindLongestPrefixWithEmptyValue); - UNIT_TEST(TestSearchIterChar); - UNIT_TEST(TestSearchIterWchar); + UNIT_TEST(TestSearchIterChar); + UNIT_TEST(TestSearchIterWchar); UNIT_TEST(TestSearchIterWchar32) - UNIT_TEST(TestCopyAndAssignment); + UNIT_TEST(TestCopyAndAssignment); - UNIT_TEST(TestFirstSymbolIterator8); - UNIT_TEST(TestFirstSymbolIterator16); - UNIT_TEST(TestFirstSymbolIterator32); + UNIT_TEST(TestFirstSymbolIterator8); + UNIT_TEST(TestFirstSymbolIterator16); + UNIT_TEST(TestFirstSymbolIterator32); UNIT_TEST(TestFirstSymbolIteratorChar32); - UNIT_TEST(TestArrayPacker); + UNIT_TEST(TestArrayPacker); - UNIT_TEST(TestBuilderFindLongestPrefix); - UNIT_TEST(TestBuilderFindLongestPrefixWithEmptyValue); + UNIT_TEST(TestBuilderFindLongestPrefix); + UNIT_TEST(TestBuilderFindLongestPrefixWithEmptyValue); UNIT_TEST(TestPatternSearcherEmpty); UNIT_TEST(TestPatternSearcherSimple); @@ -147,7 +147,7 @@ private: TVector<TContainer> GetSampleVectorData(size_t nValues); template <class TContainer> TVector<TContainer> GetSampleTextVectorData(size_t nValues); - template <class T> + template <class T> void CheckEquality(const T& value1, const T& value2) const; template <class TContainer> void TestTrieWithContainers(const TVector<TUtf16String>& keys, const TVector<TContainer>& sampleData, TString methodName); @@ -265,38 +265,38 @@ public: UNIT_TEST_SUITE_REGISTRATION(TCompactTrieTest); -const char* TCompactTrieTest::SampleData[] = { - "", - "a", "b", "c", "d", - "aa", "ab", "ac", "ad", - "aaa", "aab", "aac", "aad", - "aba", "abb", "abc", "abd", - "fba", "fbb", "fbc", "fbd", - "fbbaa", - "c\x85\xA4\xBF" // Just something outside ASCII. +const char* TCompactTrieTest::SampleData[] = { + "", + "a", "b", "c", "d", + "aa", "ab", "ac", "ad", + "aaa", "aab", "aac", "aad", + "aba", "abb", "abc", "abd", + "fba", "fbb", "fbc", "fbd", + "fbbaa", + "c\x85\xA4\xBF" // Just something outside ASCII. }; -template <class T> +template <class T> typename TCompactTrie<T>::TKey MakeWideKey(const char* str, size_t len) { typename TCompactTrie<T>::TKey buffer; for (size_t i = 0; i < len; i++) { unsigned int ch = (str[i] & 0xFF); - buffer.push_back((T)(ch | ch << 8 | ch << 16 | ch << 24)); + buffer.push_back((T)(ch | ch << 8 | ch << 16 | ch << 24)); } return buffer; } -template <class T> +template <class T> typename TCompactTrie<T>::TKey MakeWideKey(const TString& str) { return MakeWideKey<T>(str.c_str(), str.length()); } -template <class T> +template <class T> typename TCompactTrie<T>::TKey MakeWideKey(const TStringBuf& buf) { return MakeWideKey<T>(buf.data(), buf.length()); } -template <class T> +template <class T> void TCompactTrieTest::CreateTrie(IOutputStream& out, bool minimize, bool useFastLayout) { TCompactTrieBuilder<T> builder; @@ -305,18 +305,18 @@ void TCompactTrieTest::CreateTrie(IOutputStream& out, bool minimize, bool useFas builder.Add(MakeWideKey<T>(i, len), len * 2); } - + TBufferOutput tmp2; IOutputStream& currentOutput = useFastLayout ? tmp2 : out; if (minimize) { TBufferOutput buftmp; builder.Save(buftmp); - CompactTrieMinimize<TCompactTriePacker<ui64>>(currentOutput, buftmp.Buffer().Data(), buftmp.Buffer().Size(), false); + CompactTrieMinimize<TCompactTriePacker<ui64>>(currentOutput, buftmp.Buffer().Data(), buftmp.Buffer().Size(), false); } else { builder.Save(currentOutput); } if (useFastLayout) { - CompactTrieMakeFastLayout<TCompactTriePacker<T>>(out, tmp2.Buffer().Data(), tmp2.Buffer().Size(), false); + CompactTrieMakeFastLayout<TCompactTriePacker<T>>(out, tmp2.Buffer().Data(), tmp2.Buffer().Size(), false); } } @@ -337,7 +337,7 @@ static bool LexicographicStep(TString& s) { return true; } -template <class T> +template <class T> void TCompactTrieTest::CheckUpperBound(const char* data, size_t datalen) { TCompactTrie<T> trie(data, datalen); typedef typename TCompactTrie<T>::TKey TKey; @@ -357,7 +357,7 @@ void TCompactTrieTest::CheckUpperBound(const char* data, size_t datalen) { } while (LexicographicStep(key)); } -template <class T> +template <class T> void TCompactTrieTest::CheckData(const char* data, size_t datalen) { TCompactTrie<T> trie(data, datalen); @@ -413,13 +413,13 @@ void TCompactTrieTest::CheckData(const char* data, size_t datalen) { UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value)); UNIT_ASSERT_EQUAL(prefixLen, 3); UNIT_ASSERT_EQUAL(6, value); - + value = 12345678; UNIT_ASSERT(!trie.Find(key, &value)); - UNIT_ASSERT_EQUAL(12345678, value); //Failed Find() should not change value + UNIT_ASSERT_EQUAL(12345678, value); //Failed Find() should not change value } -template <class T> +template <class T> void TCompactTrieTest::CheckIterator(const char* data, size_t datalen) { typedef typename TCompactTrie<T>::TKey TKey; typedef typename TCompactTrie<T>::TValueType TValue; @@ -441,10 +441,10 @@ void TCompactTrieTest::CheckIterator(const char* data, size_t datalen) { received.insert(*it); items.push_back(*it); entry_count++; - it++; + it++; } TMap<TKey, ui64> received2; - for (std::pair<TKey, ui64> x : trie) { + for (std::pair<TKey, ui64> x : trie) { received2.insert(x); } UNIT_ASSERT(entry_count == stored.size()); @@ -485,7 +485,7 @@ void TCompactTrieTest::CheckIterator(const char* data, size_t datalen) { UNIT_ASSERT(revIt != trie.End()); } -template <class T> +template <class T> void TCompactTrieTest::TestTrie(bool minimize, bool useFastLayout) { TBufferOutput bufout; CreateTrie<T>(bufout, minimize, useFastLayout); @@ -493,75 +493,75 @@ void TCompactTrieTest::TestTrie(bool minimize, bool useFastLayout) { CheckUpperBound<T>(bufout.Buffer().Data(), bufout.Buffer().Size()); } -template <class T> +template <class T> void TCompactTrieTest::TestTrieIterator(bool minimize) { TBufferOutput bufout; CreateTrie<T>(bufout, minimize, false); CheckIterator<T>(bufout.Buffer().Data(), bufout.Buffer().Size()); } -void TCompactTrieTest::TestTrie8() { - TestTrie<char>(false, false); -} -void TCompactTrieTest::TestTrie16() { - TestTrie<wchar16>(false, false); -} -void TCompactTrieTest::TestTrie32() { - TestTrie<wchar32>(false, false); -} - -void TCompactTrieTest::TestFastTrie8() { - TestTrie<char>(false, true); -} -void TCompactTrieTest::TestFastTrie16() { - TestTrie<wchar16>(false, true); -} -void TCompactTrieTest::TestFastTrie32() { - TestTrie<wchar32>(false, true); -} - -void TCompactTrieTest::TestMinimizedTrie8() { - TestTrie<char>(true, false); -} -void TCompactTrieTest::TestMinimizedTrie16() { - TestTrie<wchar16>(true, false); -} -void TCompactTrieTest::TestMinimizedTrie32() { - TestTrie<wchar32>(true, false); -} - -void TCompactTrieTest::TestFastMinimizedTrie8() { - TestTrie<char>(true, true); -} -void TCompactTrieTest::TestFastMinimizedTrie16() { - TestTrie<wchar16>(true, true); -} -void TCompactTrieTest::TestFastMinimizedTrie32() { - TestTrie<wchar32>(true, true); -} - -void TCompactTrieTest::TestTrieIterator8() { - TestTrieIterator<char>(false); -} -void TCompactTrieTest::TestTrieIterator16() { - TestTrieIterator<wchar16>(false); -} -void TCompactTrieTest::TestTrieIterator32() { - TestTrieIterator<wchar32>(false); -} - -void TCompactTrieTest::TestMinimizedTrieIterator8() { - TestTrieIterator<char>(true); -} -void TCompactTrieTest::TestMinimizedTrieIterator16() { - TestTrieIterator<wchar16>(true); -} -void TCompactTrieTest::TestMinimizedTrieIterator32() { - TestTrieIterator<wchar32>(true); -} +void TCompactTrieTest::TestTrie8() { + TestTrie<char>(false, false); +} +void TCompactTrieTest::TestTrie16() { + TestTrie<wchar16>(false, false); +} +void TCompactTrieTest::TestTrie32() { + TestTrie<wchar32>(false, false); +} + +void TCompactTrieTest::TestFastTrie8() { + TestTrie<char>(false, true); +} +void TCompactTrieTest::TestFastTrie16() { + TestTrie<wchar16>(false, true); +} +void TCompactTrieTest::TestFastTrie32() { + TestTrie<wchar32>(false, true); +} + +void TCompactTrieTest::TestMinimizedTrie8() { + TestTrie<char>(true, false); +} +void TCompactTrieTest::TestMinimizedTrie16() { + TestTrie<wchar16>(true, false); +} +void TCompactTrieTest::TestMinimizedTrie32() { + TestTrie<wchar32>(true, false); +} + +void TCompactTrieTest::TestFastMinimizedTrie8() { + TestTrie<char>(true, true); +} +void TCompactTrieTest::TestFastMinimizedTrie16() { + TestTrie<wchar16>(true, true); +} +void TCompactTrieTest::TestFastMinimizedTrie32() { + TestTrie<wchar32>(true, true); +} + +void TCompactTrieTest::TestTrieIterator8() { + TestTrieIterator<char>(false); +} +void TCompactTrieTest::TestTrieIterator16() { + TestTrieIterator<wchar16>(false); +} +void TCompactTrieTest::TestTrieIterator32() { + TestTrieIterator<wchar32>(false); +} + +void TCompactTrieTest::TestMinimizedTrieIterator8() { + TestTrieIterator<char>(true); +} +void TCompactTrieTest::TestMinimizedTrieIterator16() { + TestTrieIterator<wchar16>(true); +} +void TCompactTrieTest::TestMinimizedTrieIterator32() { + TestTrieIterator<wchar32>(true); +} void TCompactTrieTest::TestPhraseSearch() { - static const char* phrases[] = {"ab", "ab cd", "ab cd ef"}; + static const char* phrases[] = {"ab", "ab cd", "ab cd ef"}; static const char* const goodphrase = "ab cd ef gh"; static const char* const badphrase = "cd ef gh ab"; TBufferOutput bufout; @@ -651,7 +651,7 @@ static char RandChar() { } static TString RandStr(const size_t max) { - size_t len = RandomNumber<size_t>() % max; + size_t len = RandomNumber<size_t>() % max; TString key; for (size_t j = 0; j < len; ++j) key += RandChar(); @@ -664,7 +664,7 @@ void TCompactTrieTest::TestRandom(const size_t n, const size_t maxKeySize) { TCompactTrieBuilder<char, typename T::TData, T> builder; typedef TMap<TString, typename T::TData> TKeys; TKeys keys; - + typename T::TData dummy; for (size_t i = 0; i < n; ++i) { const TString key = RandStr(maxKeySize); @@ -677,7 +677,7 @@ void TCompactTrieTest::TestRandom(const size_t n, const size_t maxKeySize) { UNIT_ASSERT(dummy == val); } } - + TBufferStream stream; size_t len = builder.Save(stream); TCompactTrie<char, typename T::TData, T> trie(stream.Buffer().Data(), len); @@ -721,12 +721,12 @@ void TCompactTrieTest::TestRandom(const size_t n, const size_t maxKeySize) { } void TCompactTrieTest::TestRandom() { - TestRandom<TIntPacker<ui64>, true>(1000, 1000); - TestRandom<TIntPacker<int>, true>(100, 100); - TestRandom<TDummyPacker<ui64>, true>(0, 0); - TestRandom<TDummyPacker<ui64>, true>(100, 3); - TestRandom<TDummyPacker<ui64>, true>(100, 100); - TestRandom<TStrokaPacker, true>(100, 100); + TestRandom<TIntPacker<ui64>, true>(1000, 1000); + TestRandom<TIntPacker<int>, true>(100, 100); + TestRandom<TDummyPacker<ui64>, true>(0, 0); + TestRandom<TDummyPacker<ui64>, true>(100, 3); + TestRandom<TDummyPacker<ui64>, true>(100, 100); + TestRandom<TStrokaPacker, true>(100, 100); } void TCompactTrieTest::TestFindTailsImpl(const TString& prefix) { @@ -779,14 +779,14 @@ void TCompactTrieTest::TestPrefixGrouped() { TBuffer b1b; TCompactTrieBuilder<char, ui32> b1(CTBF_PREFIX_GROUPED); const char* data[] = { - "Kazan", - "Moscow", - "Monino", - "Murmansk", - "Fryanovo", - "Fryazino", - "Fryazevo", - "Tumen", + "Kazan", + "Moscow", + "Monino", + "Murmansk", + "Fryanovo", + "Fryazino", + "Fryazevo", + "Tumen", }; for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) { @@ -937,15 +937,15 @@ void TCompactTrieTest::TestUnique() { void TCompactTrieTest::TestUniqueImpl(bool isPrefixGrouped) { TCompactTrieBuilder<char, ui32> builder(CTBF_UNIQUE | (isPrefixGrouped ? CTBF_PREFIX_GROUPED : CTBF_NONE)); const char* data[] = { - "Kazan", - "Moscow", - "Monino", - "Murmansk", - "Fryanovo", - "Fryazino", - "Fryazevo", - "Fry", - "Tumen", + "Kazan", + "Moscow", + "Monino", + "Murmansk", + "Fryanovo", + "Fryazino", + "Fryazevo", + "Fry", + "Tumen", }; for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) { UNIT_ASSERT_C(builder.Add(data[i], strlen(data[i]), i + 1), i); @@ -963,15 +963,15 @@ void TCompactTrieTest::TestUniqueImpl(bool isPrefixGrouped) { void TCompactTrieTest::TestAddRetValue() { TCompactTrieBuilder<char, ui32> builder; const char* data[] = { - "Kazan", - "Moscow", - "Monino", - "Murmansk", - "Fryanovo", - "Fryazino", - "Fryazevo", - "Fry", - "Tumen", + "Kazan", + "Moscow", + "Monino", + "Murmansk", + "Fryanovo", + "Fryazino", + "Fryazevo", + "Fry", + "Tumen", }; for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) { UNIT_ASSERT(builder.Add(data[i], strlen(data[i]), i + 1)); @@ -982,7 +982,7 @@ void TCompactTrieTest::TestAddRetValue() { } } -void TCompactTrieTest::TestClear() { +void TCompactTrieTest::TestClear() { TCompactTrieBuilder<char, ui32> builder; const char* data[] = { "Kazan", @@ -1008,14 +1008,14 @@ void TCompactTrieTest::TestFindTails() { TestFindTailsImpl("aa"); TestFindTailsImpl("bb"); TestFindTailsImpl("fb"); - TestFindTailsImpl("fbc"); - TestFindTailsImpl("fbbaa"); + TestFindTailsImpl("fbc"); + TestFindTailsImpl("fbbaa"); } template <class T> -class TCompactTrieTest::TDummyPacker: public TNullPacker<T> { +class TCompactTrieTest::TDummyPacker: public TNullPacker<T> { public: - static T Data(const TString&) { + static T Data(const TString&) { T data; TNullPacker<T>().UnpackLeaf(nullptr, data); return data; @@ -1024,21 +1024,21 @@ public: typedef T TData; }; -class TCompactTrieTest::TStrokaPacker: public TCompactTriePacker<TString> { +class TCompactTrieTest::TStrokaPacker: public TCompactTriePacker<TString> { public: typedef TString TData; - static TString Data(const TString& str) { + static TString Data(const TString& str) { return str; } }; template <class T> -class TCompactTrieTest::TIntPacker: public TCompactTriePacker<T> { +class TCompactTrieTest::TIntPacker: public TCompactTriePacker<T> { public: typedef T TData; - static TData Data(const TString&) { + static TData Data(const TString&) { return RandomNumber<std::make_unsigned_t<T>>(); } }; @@ -1099,7 +1099,7 @@ void TCompactTrieTest::TestTrieSet() { UNIT_ASSERT(!empty.Has("d")); UNIT_ASSERT(!empty.Has("d")); - UNIT_ASSERT(empty.Has("")); // contains only empty string + UNIT_ASSERT(empty.Has("")); // contains only empty string } // Tests for trie with vector (list, set) values @@ -1119,7 +1119,7 @@ TVector<TContainer> TCompactTrieTest::GetSampleVectorData(size_t nValues) { for (size_t i = 0; i < nValues; i++) { data.push_back(TContainer()); for (size_t j = 0; j < i; j++) - data[i].insert(data[i].end(), (typename TContainer::value_type)((j == 3) ? 0 : (1 << (j * 5)))); + data[i].insert(data[i].end(), (typename TContainer::value_type)((j == 3) ? 0 : (1 << (j * 5)))); } return data; } @@ -1135,16 +1135,16 @@ TVector<TContainer> TCompactTrieTest::GetSampleTextVectorData(size_t nValues) { return data; } -template <class T> +template <class T> void TCompactTrieTest::CheckEquality(const T& value1, const T& value2) const { UNIT_ASSERT_VALUES_EQUAL(value1, value2); } -template <> -void TCompactTrieTest::CheckEquality<TVector<i64>>(const TVector<i64>& value1, const TVector<i64>& value2) const { - UNIT_ASSERT_VALUES_EQUAL(value1.size(), value2.size()); - for (size_t i = 0; i < value1.size(); i++) - UNIT_ASSERT_VALUES_EQUAL(value1[i], value2[i]); +template <> +void TCompactTrieTest::CheckEquality<TVector<i64>>(const TVector<i64>& value1, const TVector<i64>& value2) const { + UNIT_ASSERT_VALUES_EQUAL(value1.size(), value2.size()); + for (size_t i = 0; i < value1.size(); i++) + UNIT_ASSERT_VALUES_EQUAL(value1[i], value2[i]); } template <class TContainer> @@ -1171,9 +1171,9 @@ void TCompactTrieTest::TestTrieWithContainers(const TVector<TUtf16String>& keys, unlink(fileName.data()); } -template <> -void TCompactTrieTest::TestTrieWithContainers<std::pair<TUtf16String, TVector<i64>>>(const TVector<TUtf16String>& keys, const TVector<std::pair<TUtf16String, TVector<i64>>>& sampleData, TString methodName) { - typedef std::pair<TUtf16String, TVector<i64>> TContainer; +template <> +void TCompactTrieTest::TestTrieWithContainers<std::pair<TUtf16String, TVector<i64>>>(const TVector<TUtf16String>& keys, const TVector<std::pair<TUtf16String, TVector<i64>>>& sampleData, TString methodName) { + typedef std::pair<TUtf16String, TVector<i64>> TContainer; TString fileName = GetSystemTempDir() + "/TCompactTrieTest-TestTrieWithContainers-" + methodName; TCompactTrieBuilder<wchar16, TContainer> b; @@ -1194,63 +1194,63 @@ void TCompactTrieTest::TestTrieWithContainers<std::pair<TUtf16String, TVector<i6 } void TCompactTrieTest::TestTrieForVectorInt64() { - TestTrieWithContainers<TVector<i64>>(GetSampleKeys(10), GetSampleVectorData<TVector<i64>>(10), "v-i64"); + TestTrieWithContainers<TVector<i64>>(GetSampleKeys(10), GetSampleVectorData<TVector<i64>>(10), "v-i64"); } void TCompactTrieTest::TestTrieForListInt64() { - TestTrieWithContainers<TList<i64>>(GetSampleKeys(10), GetSampleVectorData<TList<i64>>(10), "l-i64"); + TestTrieWithContainers<TList<i64>>(GetSampleKeys(10), GetSampleVectorData<TList<i64>>(10), "l-i64"); } void TCompactTrieTest::TestTrieForSetInt64() { - TestTrieWithContainers<TSet<i64>>(GetSampleKeys(10), GetSampleVectorData<TSet<i64>>(10), "s-i64"); + TestTrieWithContainers<TSet<i64>>(GetSampleKeys(10), GetSampleVectorData<TSet<i64>>(10), "s-i64"); } void TCompactTrieTest::TestTrieForVectorStroka() { - TestTrieWithContainers<TVector<TString>>(GetSampleKeys(10), GetSampleTextVectorData<TVector<TString>>(10), "v-str"); + TestTrieWithContainers<TVector<TString>>(GetSampleKeys(10), GetSampleTextVectorData<TVector<TString>>(10), "v-str"); } void TCompactTrieTest::TestTrieForListStroka() { - TestTrieWithContainers<TList<TString>>(GetSampleKeys(10), GetSampleTextVectorData<TList<TString>>(10), "l-str"); + TestTrieWithContainers<TList<TString>>(GetSampleKeys(10), GetSampleTextVectorData<TList<TString>>(10), "l-str"); } void TCompactTrieTest::TestTrieForSetStroka() { - TestTrieWithContainers<TSet<TString>>(GetSampleKeys(10), GetSampleTextVectorData<TSet<TString>>(10), "s-str"); + TestTrieWithContainers<TSet<TString>>(GetSampleKeys(10), GetSampleTextVectorData<TSet<TString>>(10), "s-str"); } void TCompactTrieTest::TestTrieForVectorWtroka() { - TVector<TVector<TString>> data = GetSampleTextVectorData<TVector<TString>>(10); - TVector<TVector<TUtf16String>> wData; + TVector<TVector<TString>> data = GetSampleTextVectorData<TVector<TString>>(10); + TVector<TVector<TUtf16String>> wData; for (size_t i = 0; i < data.size(); i++) { wData.push_back(TVector<TUtf16String>()); for (size_t j = 0; j < data[i].size(); j++) - wData[i].push_back(UTF8ToWide(data[i][j])); + wData[i].push_back(UTF8ToWide(data[i][j])); } - TestTrieWithContainers<TVector<TUtf16String>>(GetSampleKeys(10), wData, "v-wtr"); + TestTrieWithContainers<TVector<TUtf16String>>(GetSampleKeys(10), wData, "v-wtr"); } void TCompactTrieTest::TestTrieForVectorFloat() { - TestTrieWithContainers<TVector<float>>(GetSampleKeys(10), GetSampleVectorData<TVector<float>>(10), "v-float"); + TestTrieWithContainers<TVector<float>>(GetSampleKeys(10), GetSampleVectorData<TVector<float>>(10), "v-float"); } void TCompactTrieTest::TestTrieForVectorDouble() { - TestTrieWithContainers<TVector<double>>(GetSampleKeys(10), GetSampleVectorData<TVector<double>>(10), "v-double"); + TestTrieWithContainers<TVector<double>>(GetSampleKeys(10), GetSampleVectorData<TVector<double>>(10), "v-double"); } void TCompactTrieTest::TestTrieForListVectorInt64() { TVector<i64> tmp; - tmp.push_back(0); - TList<TVector<i64>> dataElement(5, tmp); - TVector<TList<TVector<i64>>> data(10, dataElement); - TestTrieWithContainers<TList<TVector<i64>>>(GetSampleKeys(10), data, "l-v-i64"); + tmp.push_back(0); + TList<TVector<i64>> dataElement(5, tmp); + TVector<TList<TVector<i64>>> data(10, dataElement); + TestTrieWithContainers<TList<TVector<i64>>>(GetSampleKeys(10), data, "l-v-i64"); } void TCompactTrieTest::TestTrieForPairWtrokaVectorInt64() { TVector<TUtf16String> keys = GetSampleKeys(10); - TVector<TVector<i64>> values = GetSampleVectorData<TVector<i64>>(10); - TVector<std::pair<TUtf16String, TVector<i64>>> data; + TVector<TVector<i64>> values = GetSampleVectorData<TVector<i64>>(10); + TVector<std::pair<TUtf16String, TVector<i64>>> data; for (size_t i = 0; i < 10; i++) data.push_back(std::pair<TUtf16String, TVector<i64>>(keys[i] + u"_v", values[i])); - TestTrieWithContainers<std::pair<TUtf16String, TVector<i64>>>(keys, data, "pair-str-v-i64"); + TestTrieWithContainers<std::pair<TUtf16String, TVector<i64>>>(keys, data, "pair-str-v-i64"); } void TCompactTrieTest::TestEmptyValueOutOfOrder() { @@ -1468,7 +1468,7 @@ void TCompactTrieTest::TestArrayPacker() { void TCompactTrieTest::TestBuilderFindLongestPrefix() { const size_t sizes[] = {10, 100}; - const double branchProbabilities[] = {0.01, 0.1, 0.5, 0.9, 0.99}; + const double branchProbabilities[] = {0.01, 0.1, 0.5, 0.9, 0.99}; for (size_t size : sizes) { for (double branchProbability : branchProbabilities) { TestBuilderFindLongestPrefix(size, branchProbability, false, false); @@ -1498,7 +1498,7 @@ void TCompactTrieTest::TestBuilderFindLongestPrefix(size_t keysCount, double bra if (isPrefixGrouped) Sort(keys.begin(), keys.end()); else - Shuffle(keys.begin(), keys.end()); + Shuffle(keys.begin(), keys.end()); TCompactTrieBuilder<char, TString> builder(isPrefixGrouped ? CTBF_PREFIX_GROUPED : CTBF_NONE); const TString EMPTY_VALUE = "empty"; diff --git a/library/cpp/containers/comptrie/first_symbol_iterator.h b/library/cpp/containers/comptrie/first_symbol_iterator.h index d3c521e7e6..d06135f06f 100644 --- a/library/cpp/containers/comptrie/first_symbol_iterator.h +++ b/library/cpp/containers/comptrie/first_symbol_iterator.h @@ -6,7 +6,7 @@ namespace NCompactTrie { // Iterates over possible first symbols in a trie. // Allows one to get the symbol and the subtrie starting from it. - template <class TTrie> + template <class TTrie> class TFirstSymbolIterator { public: using TSymbol = typename TTrie::TSymbol; @@ -58,4 +58,4 @@ namespace NCompactTrie { TCopyPtr<TOpaqueTrieIterator> Impl; }; -} +} diff --git a/library/cpp/containers/comptrie/key_selector.h b/library/cpp/containers/comptrie/key_selector.h index 80191df744..60466cef71 100644 --- a/library/cpp/containers/comptrie/key_selector.h +++ b/library/cpp/containers/comptrie/key_selector.h @@ -4,24 +4,24 @@ #include <util/generic/string.h> #include <util/generic/strbuf.h> -template <class T> +template <class T> struct TCompactTrieKeySelector { typedef TVector<T> TKey; typedef TVector<T> TKeyBuf; }; -template <class TChar> +template <class TChar> struct TCompactTrieCharKeySelector { typedef TBasicString<TChar> TKey; typedef TBasicStringBuf<TChar> TKeyBuf; }; -template <> -struct TCompactTrieKeySelector<char>: public TCompactTrieCharKeySelector<char> { +template <> +struct TCompactTrieKeySelector<char>: public TCompactTrieCharKeySelector<char> { }; -template <> -struct TCompactTrieKeySelector<wchar16>: public TCompactTrieCharKeySelector<wchar16> { +template <> +struct TCompactTrieKeySelector<wchar16>: public TCompactTrieCharKeySelector<wchar16> { }; template <> diff --git a/library/cpp/containers/comptrie/leaf_skipper.h b/library/cpp/containers/comptrie/leaf_skipper.h index 0bdfa11d63..3959258948 100644 --- a/library/cpp/containers/comptrie/leaf_skipper.h +++ b/library/cpp/containers/comptrie/leaf_skipper.h @@ -9,11 +9,11 @@ namespace NCompactTrie { virtual ~ILeafSkipper() = default; }; - template <class TPacker> - class TPackerLeafSkipper: public ILeafSkipper { + template <class TPacker> + class TPackerLeafSkipper: public ILeafSkipper { private: const TPacker* Packer; - + public: TPackerLeafSkipper(const TPacker* packer) : Packer(packer) @@ -25,9 +25,9 @@ namespace NCompactTrie { } // For test purposes. - const TPacker* GetPacker() const { - return Packer; - } + const TPacker* GetPacker() const { + return Packer; + } }; // The data you need to traverse the trie without unpacking the values. @@ -40,13 +40,13 @@ namespace NCompactTrie { : Data(data) , Length(dataLength) , SkipFunction(skipFunction) - { - } + { + } bool operator==(const TOpaqueTrie& other) const { return Data == other.Data && - Length == other.Length && - &SkipFunction == &other.SkipFunction; + Length == other.Length && + &SkipFunction == &other.SkipFunction; } bool operator!=(const TOpaqueTrie& other) const { diff --git a/library/cpp/containers/comptrie/loader/loader.h b/library/cpp/containers/comptrie/loader/loader.h index d172efa541..ee10e9b451 100644 --- a/library/cpp/containers/comptrie/loader/loader.h +++ b/library/cpp/containers/comptrie/loader/loader.h @@ -9,7 +9,7 @@ template <class TrieType, size_t N> TrieType LoadTrieFromArchive(const TString& key, const unsigned char (&data)[N], - bool ignoreErrors = false) { + bool ignoreErrors = false) { TArchiveReader archive(TBlob::NoCopy(data, sizeof(data))); if (archive.Has(key)) { TAutoPtr<IInputStream> trie = archive.ObjectByKey(key); diff --git a/library/cpp/containers/comptrie/loader/loader_ut.cpp b/library/cpp/containers/comptrie/loader/loader_ut.cpp index 2e771357fe..345063a31e 100644 --- a/library/cpp/containers/comptrie/loader/loader_ut.cpp +++ b/library/cpp/containers/comptrie/loader/loader_ut.cpp @@ -6,25 +6,25 @@ using TDummyTrie = TCompactTrie<char, i32>; namespace { const unsigned char DATA[] = { -#include "data.inc" +#include "data.inc" }; -} +} Y_UNIT_TEST_SUITE(ArchiveLoaderTests) { Y_UNIT_TEST(BaseTest) { - TDummyTrie trie = LoadTrieFromArchive<TDummyTrie>("/dummy.trie", DATA, true); - UNIT_ASSERT_EQUAL(trie.Size(), 3); + TDummyTrie trie = LoadTrieFromArchive<TDummyTrie>("/dummy.trie", DATA, true); + UNIT_ASSERT_EQUAL(trie.Size(), 3); - const TString TrieKyes[3] = { - "zero", "one", "two"}; - i32 val = -1; - for (i32 i = 0; i < 3; ++i) { + const TString TrieKyes[3] = { + "zero", "one", "two"}; + i32 val = -1; + for (i32 i = 0; i < 3; ++i) { UNIT_ASSERT(trie.Find(TrieKyes[i].data(), TrieKyes[i].size(), &val)); - UNIT_ASSERT_EQUAL(i, val); - } + UNIT_ASSERT_EQUAL(i, val); + } - UNIT_CHECK_GENERATED_EXCEPTION( - LoadTrieFromArchive<TDummyTrie>("/noname.trie", DATA), - yexception); + UNIT_CHECK_GENERATED_EXCEPTION( + LoadTrieFromArchive<TDummyTrie>("/noname.trie", DATA), + yexception); } } diff --git a/library/cpp/containers/comptrie/make_fast_layout.cpp b/library/cpp/containers/comptrie/make_fast_layout.cpp index 693384ee7f..ade78d7899 100644 --- a/library/cpp/containers/comptrie/make_fast_layout.cpp +++ b/library/cpp/containers/comptrie/make_fast_layout.cpp @@ -24,444 +24,444 @@ // (there is not much difference between these papers, actually). // namespace NCompactTrie { - static size_t FindSupportingPowerOf2(size_t n) { - size_t result = 1ull << (8 * sizeof(size_t) - 1); - while (result > n) { - result >>= 1; - } - return result; + static size_t FindSupportingPowerOf2(size_t n) { + size_t result = 1ull << (8 * sizeof(size_t) - 1); + while (result > n) { + result >>= 1; + } + return result; } - namespace { - class TTrieNodeSet { - public: + namespace { + class TTrieNodeSet { + public: TTrieNodeSet() = default; - explicit TTrieNodeSet(const TOpaqueTrie& trie) - : Body(trie.Length / (8 * MinNodeSize) + 1, 0) - { - } - - bool Has(size_t offset) const { - const size_t reducedOffset = ReducedOffset(offset); - return OffsetCell(reducedOffset) & OffsetMask(reducedOffset); - } - - void Add(size_t offset) { - const size_t reducedOffset = ReducedOffset(offset); - OffsetCell(reducedOffset) |= OffsetMask(reducedOffset); - } - - void Remove(size_t offset) { - const size_t reducedOffset = ReducedOffset(offset); - OffsetCell(reducedOffset) &= ~OffsetMask(reducedOffset); - } - - void Swap(TTrieNodeSet& other) { - Body.swap(other.Body); - } - - private: - static const size_t MinNodeSize = 2; - TVector<ui8> Body; - - static size_t ReducedOffset(size_t offset) { - return offset / MinNodeSize; - } - static ui8 OffsetMask(size_t reducedOffset) { - return 1 << (reducedOffset % 8); - } - ui8& OffsetCell(size_t reducedOffset) { - return Body.at(reducedOffset / 8); - } - const ui8& OffsetCell(size_t reducedOffset) const { - return Body.at(reducedOffset / 8); - } - }; - - //--------------------------------------------------------------------- - - class TTrieNodeCounts { - public: + explicit TTrieNodeSet(const TOpaqueTrie& trie) + : Body(trie.Length / (8 * MinNodeSize) + 1, 0) + { + } + + bool Has(size_t offset) const { + const size_t reducedOffset = ReducedOffset(offset); + return OffsetCell(reducedOffset) & OffsetMask(reducedOffset); + } + + void Add(size_t offset) { + const size_t reducedOffset = ReducedOffset(offset); + OffsetCell(reducedOffset) |= OffsetMask(reducedOffset); + } + + void Remove(size_t offset) { + const size_t reducedOffset = ReducedOffset(offset); + OffsetCell(reducedOffset) &= ~OffsetMask(reducedOffset); + } + + void Swap(TTrieNodeSet& other) { + Body.swap(other.Body); + } + + private: + static const size_t MinNodeSize = 2; + TVector<ui8> Body; + + static size_t ReducedOffset(size_t offset) { + return offset / MinNodeSize; + } + static ui8 OffsetMask(size_t reducedOffset) { + return 1 << (reducedOffset % 8); + } + ui8& OffsetCell(size_t reducedOffset) { + return Body.at(reducedOffset / 8); + } + const ui8& OffsetCell(size_t reducedOffset) const { + return Body.at(reducedOffset / 8); + } + }; + + //--------------------------------------------------------------------- + + class TTrieNodeCounts { + public: TTrieNodeCounts() = default; - explicit TTrieNodeCounts(const TOpaqueTrie& trie) - : Body(trie.Length / MinNodeSize, 0) - , IsTree(false) - { - } - - size_t Get(size_t offset) const { - return IsTree ? 1 : Body.at(offset / MinNodeSize); - } - - void Inc(size_t offset) { - if (IsTree) { - return; - } - ui8& count = Body.at(offset / MinNodeSize); - if (count != MaxCount) { - ++count; - } - } - - size_t Dec(size_t offset) { - if (IsTree) { - return 0; - } - ui8& count = Body.at(offset / MinNodeSize); - Y_ASSERT(count > 0); - if (count != MaxCount) { - --count; - } - return count; - } - - void Swap(TTrieNodeCounts& other) { - Body.swap(other.Body); - ::DoSwap(IsTree, other.IsTree); - } - - void SetTreeMode() { - IsTree = true; - Body = TVector<ui8>(); - } - - private: - static const size_t MinNodeSize = 2; - static const ui8 MaxCount = 255; - - TVector<ui8> Body; - bool IsTree = false; - }; - - //---------------------------------------------------------- - - class TOffsetIndex { - public: - // In all methods: - // Key --- offset from the beginning of the old trie. - // Value --- offset from the end of the new trie. - - explicit TOffsetIndex(TTrieNodeCounts& counts) { - ParentCounts.Swap(counts); - } - - void Add(size_t key, size_t value) { - Data[key] = value; - } - - size_t Size() const { - return Data.size(); - } - - size_t Get(size_t key) { - auto pos = Data.find(key); - if (pos == Data.end()) { - ythrow yexception() << "Bad node walking order: trying to get node offset too early or too many times!"; - } - size_t result = pos->second; - if (ParentCounts.Dec(key) == 0) { - // We don't need this offset any more. - Data.erase(pos); - } - return result; - } - - private: - TTrieNodeCounts ParentCounts; - THashMap<size_t, size_t> Data; - }; - - //--------------------------------------------------------------------------------------- - - class TTrieMeasurer { - public: - TTrieMeasurer(const TOpaqueTrie& trie, bool verbose); - void Measure(); - - size_t GetDepth() const { - return Depth; - } - - size_t GetNodeCount() const { - return NodeCount; - } - - size_t GetUnminimizedNodeCount() const { - return UnminimizedNodeCount; - } - - bool IsTree() const { - return NodeCount == UnminimizedNodeCount; - } - - TTrieNodeCounts& GetParentCounts() { - return ParentCounts; - } - - const TOpaqueTrie& GetTrie() const { - return Trie; - } - - private: - const TOpaqueTrie& Trie; - size_t Depth; - TTrieNodeCounts ParentCounts; - size_t NodeCount; - size_t UnminimizedNodeCount; - const bool Verbose; - - // returns depth, increments NodeCount. - size_t MeasureSubtrie(size_t rootOffset, bool isNewPath); - }; - - TTrieMeasurer::TTrieMeasurer(const TOpaqueTrie& trie, bool verbose) - : Trie(trie) - , Depth(0) - , ParentCounts(trie) - , NodeCount(0) - , UnminimizedNodeCount(0) - , Verbose(verbose) - { - Y_ASSERT(Trie.Data); - } - - void TTrieMeasurer::Measure() { - if (Verbose) { - Cerr << "Measuring the trie..." << Endl; - } - NodeCount = 0; - UnminimizedNodeCount = 0; - Depth = MeasureSubtrie(0, true); - if (IsTree()) { - ParentCounts.SetTreeMode(); - } - if (Verbose) { - Cerr << "Unminimized node count: " << UnminimizedNodeCount << Endl; - Cerr << "Trie depth: " << Depth << Endl; - Cerr << "Node count: " << NodeCount << Endl; - } - } - - // A chain of nodes linked by forward links - // is considered one node with many left and right children - // for depth measuring here and in - // TVanEmdeBoasReverseNodeEnumerator::FindDescendants. - size_t TTrieMeasurer::MeasureSubtrie(size_t rootOffset, bool isNewPath) { - Y_ASSERT(rootOffset < Trie.Length); - TNode node(Trie.Data, rootOffset, Trie.SkipFunction); - size_t depth = 0; - for (;;) { - ++UnminimizedNodeCount; - if (Verbose) { - ShowProgress(UnminimizedNodeCount); - } - if (isNewPath) { - if (ParentCounts.Get(node.GetOffset()) > 0) { - isNewPath = false; - } else { - ++NodeCount; - } - ParentCounts.Inc(node.GetOffset()); - } - if (node.GetLeftOffset()) { - depth = Max(depth, 1 + MeasureSubtrie(node.GetLeftOffset(), isNewPath)); - } - if (node.GetRightOffset()) { - depth = Max(depth, 1 + MeasureSubtrie(node.GetRightOffset(), isNewPath)); - } - if (node.GetForwardOffset()) { - node = TNode(Trie.Data, node.GetForwardOffset(), Trie.SkipFunction); - } else { - break; - } - } - return depth; + explicit TTrieNodeCounts(const TOpaqueTrie& trie) + : Body(trie.Length / MinNodeSize, 0) + , IsTree(false) + { + } + + size_t Get(size_t offset) const { + return IsTree ? 1 : Body.at(offset / MinNodeSize); + } + + void Inc(size_t offset) { + if (IsTree) { + return; + } + ui8& count = Body.at(offset / MinNodeSize); + if (count != MaxCount) { + ++count; + } + } + + size_t Dec(size_t offset) { + if (IsTree) { + return 0; + } + ui8& count = Body.at(offset / MinNodeSize); + Y_ASSERT(count > 0); + if (count != MaxCount) { + --count; + } + return count; + } + + void Swap(TTrieNodeCounts& other) { + Body.swap(other.Body); + ::DoSwap(IsTree, other.IsTree); + } + + void SetTreeMode() { + IsTree = true; + Body = TVector<ui8>(); + } + + private: + static const size_t MinNodeSize = 2; + static const ui8 MaxCount = 255; + + TVector<ui8> Body; + bool IsTree = false; + }; + + //---------------------------------------------------------- + + class TOffsetIndex { + public: + // In all methods: + // Key --- offset from the beginning of the old trie. + // Value --- offset from the end of the new trie. + + explicit TOffsetIndex(TTrieNodeCounts& counts) { + ParentCounts.Swap(counts); + } + + void Add(size_t key, size_t value) { + Data[key] = value; + } + + size_t Size() const { + return Data.size(); + } + + size_t Get(size_t key) { + auto pos = Data.find(key); + if (pos == Data.end()) { + ythrow yexception() << "Bad node walking order: trying to get node offset too early or too many times!"; + } + size_t result = pos->second; + if (ParentCounts.Dec(key) == 0) { + // We don't need this offset any more. + Data.erase(pos); + } + return result; + } + + private: + TTrieNodeCounts ParentCounts; + THashMap<size_t, size_t> Data; + }; + + //--------------------------------------------------------------------------------------- + + class TTrieMeasurer { + public: + TTrieMeasurer(const TOpaqueTrie& trie, bool verbose); + void Measure(); + + size_t GetDepth() const { + return Depth; + } + + size_t GetNodeCount() const { + return NodeCount; + } + + size_t GetUnminimizedNodeCount() const { + return UnminimizedNodeCount; + } + + bool IsTree() const { + return NodeCount == UnminimizedNodeCount; + } + + TTrieNodeCounts& GetParentCounts() { + return ParentCounts; + } + + const TOpaqueTrie& GetTrie() const { + return Trie; + } + + private: + const TOpaqueTrie& Trie; + size_t Depth; + TTrieNodeCounts ParentCounts; + size_t NodeCount; + size_t UnminimizedNodeCount; + const bool Verbose; + + // returns depth, increments NodeCount. + size_t MeasureSubtrie(size_t rootOffset, bool isNewPath); + }; + + TTrieMeasurer::TTrieMeasurer(const TOpaqueTrie& trie, bool verbose) + : Trie(trie) + , Depth(0) + , ParentCounts(trie) + , NodeCount(0) + , UnminimizedNodeCount(0) + , Verbose(verbose) + { + Y_ASSERT(Trie.Data); } - - //-------------------------------------------------------------------------------------- - - using TLevelNodes = TVector<size_t>; - - struct TLevel { - size_t Depth; - TLevelNodes Nodes; - - explicit TLevel(size_t depth) - : Depth(depth) - { - } - }; - - //---------------------------------------------------------------------------------------- - - class TVanEmdeBoasReverseNodeEnumerator: public TReverseNodeEnumerator { - public: - TVanEmdeBoasReverseNodeEnumerator(TTrieMeasurer& measurer, bool verbose) - : Fresh(true) - , Trie(measurer.GetTrie()) - , Depth(measurer.GetDepth()) - , MaxIndexSize(0) - , BackIndex(measurer.GetParentCounts()) - , ProcessedNodes(measurer.GetTrie()) - , Verbose(verbose) - { - } - - bool Move() override { - if (!Fresh) { - CentralWord.pop_back(); - } - if (CentralWord.empty()) { - return MoveCentralWordStart(); - } - return true; - } - - const TNode& Get() const { - return CentralWord.back(); - } - - size_t GetLeafLength() const override { - return Get().GetLeafLength(); - } - - // Returns recalculated offset from the end of the current node. - size_t PrepareOffset(size_t absoffset, size_t resultLength) { - if (!absoffset) - return NPOS; - return resultLength - BackIndex.Get(absoffset); - } - - size_t RecreateNode(char* buffer, size_t resultLength) override { - TWriteableNode newNode(Get(), Trie.Data); - newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength); - newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength); - newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength); - - const size_t len = newNode.Pack(buffer); - ProcessedNodes.Add(Get().GetOffset()); - BackIndex.Add(Get().GetOffset(), resultLength + len); - MaxIndexSize = Max(MaxIndexSize, BackIndex.Size()); - return len; - } - - private: - bool Fresh; - TOpaqueTrie Trie; - size_t Depth; - size_t MaxIndexSize; - - TVector<TLevel> Trace; - TOffsetIndex BackIndex; - TVector<TNode> CentralWord; - TTrieNodeSet ProcessedNodes; - - const bool Verbose; - - private: - bool IsVisited(size_t offset) const { - return ProcessedNodes.Has(offset); - } - - void AddCascade(size_t step) { - Y_ASSERT(!(step & (step - 1))); // Should be a power of 2. - while (step > 0) { - size_t root = Trace.back().Nodes.back(); - TLevel level(Trace.back().Depth + step); - Trace.push_back(level); - size_t actualStep = FindSupportingPowerOf2(FindDescendants(root, step, Trace.back().Nodes)); - if (actualStep != step) { - // Retry with a smaller step. - Y_ASSERT(actualStep < step); - step = actualStep; - Trace.pop_back(); - } else { - step /= 2; - } - } - } - - void FillCentralWord() { - CentralWord.clear(); - CentralWord.push_back(TNode(Trie.Data, Trace.back().Nodes.back(), Trie.SkipFunction)); - // Do not check for epsilon links, as the traversal order now is different. - while (CentralWord.back().GetForwardOffset() && !IsVisited(CentralWord.back().GetForwardOffset())) { - CentralWord.push_back(TNode(Trie.Data, CentralWord.back().GetForwardOffset(), Trie.SkipFunction)); - } - } - - bool MoveCentralWordStart() { - do { - if (Fresh) { - TLevel root(0); - Trace.push_back(root); - Trace.back().Nodes.push_back(0); - const size_t sectionDepth = FindSupportingPowerOf2(Depth); - AddCascade(sectionDepth); - Fresh = false; - } else { - Trace.back().Nodes.pop_back(); - if (Trace.back().Nodes.empty() && Trace.size() == 1) { - if (Verbose) { - Cerr << "Max index size: " << MaxIndexSize << Endl; - Cerr << "Current index size: " << BackIndex.Size() << Endl; - } - // We just popped the root. - return false; - } - size_t lastStep = Trace.back().Depth - Trace[Trace.size() - 2].Depth; - if (Trace.back().Nodes.empty()) { - Trace.pop_back(); - } - AddCascade(lastStep / 2); - } - } while (IsVisited(Trace.back().Nodes.back())); - FillCentralWord(); - return true; - } - - // Returns the maximal depth it has reached while searching. - // This is a method because it needs OffsetIndex to skip processed nodes. - size_t FindDescendants(size_t rootOffset, size_t depth, TLevelNodes& result) const { - if (depth == 0) { - result.push_back(rootOffset); - return 0; - } - size_t actualDepth = 0; - TNode node(Trie.Data, rootOffset, Trie.SkipFunction); - for (;;) { - if (node.GetLeftOffset() && !IsVisited(node.GetLeftOffset())) { - actualDepth = Max(actualDepth, - 1 + FindDescendants(node.GetLeftOffset(), depth - 1, result)); + + void TTrieMeasurer::Measure() { + if (Verbose) { + Cerr << "Measuring the trie..." << Endl; + } + NodeCount = 0; + UnminimizedNodeCount = 0; + Depth = MeasureSubtrie(0, true); + if (IsTree()) { + ParentCounts.SetTreeMode(); + } + if (Verbose) { + Cerr << "Unminimized node count: " << UnminimizedNodeCount << Endl; + Cerr << "Trie depth: " << Depth << Endl; + Cerr << "Node count: " << NodeCount << Endl; + } + } + + // A chain of nodes linked by forward links + // is considered one node with many left and right children + // for depth measuring here and in + // TVanEmdeBoasReverseNodeEnumerator::FindDescendants. + size_t TTrieMeasurer::MeasureSubtrie(size_t rootOffset, bool isNewPath) { + Y_ASSERT(rootOffset < Trie.Length); + TNode node(Trie.Data, rootOffset, Trie.SkipFunction); + size_t depth = 0; + for (;;) { + ++UnminimizedNodeCount; + if (Verbose) { + ShowProgress(UnminimizedNodeCount); + } + if (isNewPath) { + if (ParentCounts.Get(node.GetOffset()) > 0) { + isNewPath = false; + } else { + ++NodeCount; + } + ParentCounts.Inc(node.GetOffset()); + } + if (node.GetLeftOffset()) { + depth = Max(depth, 1 + MeasureSubtrie(node.GetLeftOffset(), isNewPath)); + } + if (node.GetRightOffset()) { + depth = Max(depth, 1 + MeasureSubtrie(node.GetRightOffset(), isNewPath)); + } + if (node.GetForwardOffset()) { + node = TNode(Trie.Data, node.GetForwardOffset(), Trie.SkipFunction); + } else { + break; + } + } + return depth; + } + + //-------------------------------------------------------------------------------------- + + using TLevelNodes = TVector<size_t>; + + struct TLevel { + size_t Depth; + TLevelNodes Nodes; + + explicit TLevel(size_t depth) + : Depth(depth) + { + } + }; + + //---------------------------------------------------------------------------------------- + + class TVanEmdeBoasReverseNodeEnumerator: public TReverseNodeEnumerator { + public: + TVanEmdeBoasReverseNodeEnumerator(TTrieMeasurer& measurer, bool verbose) + : Fresh(true) + , Trie(measurer.GetTrie()) + , Depth(measurer.GetDepth()) + , MaxIndexSize(0) + , BackIndex(measurer.GetParentCounts()) + , ProcessedNodes(measurer.GetTrie()) + , Verbose(verbose) + { + } + + bool Move() override { + if (!Fresh) { + CentralWord.pop_back(); + } + if (CentralWord.empty()) { + return MoveCentralWordStart(); + } + return true; + } + + const TNode& Get() const { + return CentralWord.back(); + } + + size_t GetLeafLength() const override { + return Get().GetLeafLength(); + } + + // Returns recalculated offset from the end of the current node. + size_t PrepareOffset(size_t absoffset, size_t resultLength) { + if (!absoffset) + return NPOS; + return resultLength - BackIndex.Get(absoffset); + } + + size_t RecreateNode(char* buffer, size_t resultLength) override { + TWriteableNode newNode(Get(), Trie.Data); + newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength); + newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength); + newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength); + + const size_t len = newNode.Pack(buffer); + ProcessedNodes.Add(Get().GetOffset()); + BackIndex.Add(Get().GetOffset(), resultLength + len); + MaxIndexSize = Max(MaxIndexSize, BackIndex.Size()); + return len; + } + + private: + bool Fresh; + TOpaqueTrie Trie; + size_t Depth; + size_t MaxIndexSize; + + TVector<TLevel> Trace; + TOffsetIndex BackIndex; + TVector<TNode> CentralWord; + TTrieNodeSet ProcessedNodes; + + const bool Verbose; + + private: + bool IsVisited(size_t offset) const { + return ProcessedNodes.Has(offset); + } + + void AddCascade(size_t step) { + Y_ASSERT(!(step & (step - 1))); // Should be a power of 2. + while (step > 0) { + size_t root = Trace.back().Nodes.back(); + TLevel level(Trace.back().Depth + step); + Trace.push_back(level); + size_t actualStep = FindSupportingPowerOf2(FindDescendants(root, step, Trace.back().Nodes)); + if (actualStep != step) { + // Retry with a smaller step. + Y_ASSERT(actualStep < step); + step = actualStep; + Trace.pop_back(); + } else { + step /= 2; } - if (node.GetRightOffset() && !IsVisited(node.GetRightOffset())) { - actualDepth = Max(actualDepth, - 1 + FindDescendants(node.GetRightOffset(), depth - 1, result)); - } - if (node.GetForwardOffset() && !IsVisited(node.GetForwardOffset())) { - node = TNode(Trie.Data, node.GetForwardOffset(), Trie.SkipFunction); - } else { - break; - } } - return actualDepth; } - }; - } // Anonymous namespace. - - //----------------------------------------------------------------------------------- - - size_t RawCompactTrieFastLayoutImpl(IOutputStream& os, const TOpaqueTrie& trie, bool verbose) { - if (!trie.Data || !trie.Length) { + void FillCentralWord() { + CentralWord.clear(); + CentralWord.push_back(TNode(Trie.Data, Trace.back().Nodes.back(), Trie.SkipFunction)); + // Do not check for epsilon links, as the traversal order now is different. + while (CentralWord.back().GetForwardOffset() && !IsVisited(CentralWord.back().GetForwardOffset())) { + CentralWord.push_back(TNode(Trie.Data, CentralWord.back().GetForwardOffset(), Trie.SkipFunction)); + } + } + + bool MoveCentralWordStart() { + do { + if (Fresh) { + TLevel root(0); + Trace.push_back(root); + Trace.back().Nodes.push_back(0); + const size_t sectionDepth = FindSupportingPowerOf2(Depth); + AddCascade(sectionDepth); + Fresh = false; + } else { + Trace.back().Nodes.pop_back(); + if (Trace.back().Nodes.empty() && Trace.size() == 1) { + if (Verbose) { + Cerr << "Max index size: " << MaxIndexSize << Endl; + Cerr << "Current index size: " << BackIndex.Size() << Endl; + } + // We just popped the root. + return false; + } + size_t lastStep = Trace.back().Depth - Trace[Trace.size() - 2].Depth; + if (Trace.back().Nodes.empty()) { + Trace.pop_back(); + } + AddCascade(lastStep / 2); + } + } while (IsVisited(Trace.back().Nodes.back())); + FillCentralWord(); + return true; + } + + // Returns the maximal depth it has reached while searching. + // This is a method because it needs OffsetIndex to skip processed nodes. + size_t FindDescendants(size_t rootOffset, size_t depth, TLevelNodes& result) const { + if (depth == 0) { + result.push_back(rootOffset); + return 0; + } + size_t actualDepth = 0; + TNode node(Trie.Data, rootOffset, Trie.SkipFunction); + for (;;) { + if (node.GetLeftOffset() && !IsVisited(node.GetLeftOffset())) { + actualDepth = Max(actualDepth, + 1 + FindDescendants(node.GetLeftOffset(), depth - 1, result)); + } + if (node.GetRightOffset() && !IsVisited(node.GetRightOffset())) { + actualDepth = Max(actualDepth, + 1 + FindDescendants(node.GetRightOffset(), depth - 1, result)); + } + if (node.GetForwardOffset() && !IsVisited(node.GetForwardOffset())) { + node = TNode(Trie.Data, node.GetForwardOffset(), Trie.SkipFunction); + } else { + break; + } + } + return actualDepth; + } + }; + + } // Anonymous namespace. + + //----------------------------------------------------------------------------------- + + size_t RawCompactTrieFastLayoutImpl(IOutputStream& os, const TOpaqueTrie& trie, bool verbose) { + if (!trie.Data || !trie.Length) { return 0; } - TTrieMeasurer mes(trie, verbose); - mes.Measure(); - TVanEmdeBoasReverseNodeEnumerator enumerator(mes, verbose); - return WriteTrieBackwards(os, enumerator, verbose); + TTrieMeasurer mes(trie, verbose); + mes.Measure(); + TVanEmdeBoasReverseNodeEnumerator enumerator(mes, verbose); + return WriteTrieBackwards(os, enumerator, verbose); } - + } diff --git a/library/cpp/containers/comptrie/make_fast_layout.h b/library/cpp/containers/comptrie/make_fast_layout.h index cb488ffaa5..b8fab5d65b 100644 --- a/library/cpp/containers/comptrie/make_fast_layout.h +++ b/library/cpp/containers/comptrie/make_fast_layout.h @@ -6,15 +6,15 @@ class IOutputStream; namespace NCompactTrie { - // Return value: size of the resulting trie. - size_t RawCompactTrieFastLayoutImpl(IOutputStream& os, const NCompactTrie::TOpaqueTrie& trie, bool verbose); + // Return value: size of the resulting trie. + size_t RawCompactTrieFastLayoutImpl(IOutputStream& os, const NCompactTrie::TOpaqueTrie& trie, bool verbose); + + // Return value: size of the resulting trie. + template <class TPacker> + size_t CompactTrieMakeFastLayoutImpl(IOutputStream& os, const char* data, size_t datalength, bool verbose, const TPacker* packer) { + TPackerLeafSkipper<TPacker> skipper(packer); + TOpaqueTrie trie(data, datalength, skipper); + return RawCompactTrieFastLayoutImpl(os, trie, verbose); + } - // Return value: size of the resulting trie. - template <class TPacker> - size_t CompactTrieMakeFastLayoutImpl(IOutputStream& os, const char* data, size_t datalength, bool verbose, const TPacker* packer) { - TPackerLeafSkipper<TPacker> skipper(packer); - TOpaqueTrie trie(data, datalength, skipper); - return RawCompactTrieFastLayoutImpl(os, trie, verbose); - } - } diff --git a/library/cpp/containers/comptrie/minimize.cpp b/library/cpp/containers/comptrie/minimize.cpp index f53511428f..80d0b25217 100644 --- a/library/cpp/containers/comptrie/minimize.cpp +++ b/library/cpp/containers/comptrie/minimize.cpp @@ -8,352 +8,352 @@ #include <util/generic/algorithm.h> namespace NCompactTrie { - // Minimize the trie. The result is equivalent to the original - // trie, except that it takes less space (and has marginally lower - // performance, because of eventual epsilon links). - // The algorithm is as follows: starting from the largest pieces, we find - // nodes that have identical continuations (Daciuk's right language), - // and repack the trie. Repacking is done in-place, so memory is less - // of an issue; however, it may take considerable time. - - // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout. - // Because of non-local structure and epsilon links, it won't work - // as you expect it to, and can destroy the trie in the making. - - namespace { - using TOffsetList = TVector<size_t>; - using TPieceIndex = THashMap<size_t, TOffsetList>; - - using TSizePair = std::pair<size_t, size_t>; - using TSizePairVector = TVector<TSizePair>; - using TSizePairVectorVector = TVector<TSizePairVector>; - - class TOffsetMap { - protected: - TSizePairVectorVector Data; - - public: - TOffsetMap() { - Data.reserve(0x10000); - } - - void Add(size_t key, size_t value) { - size_t hikey = key & 0xFFFF; - - if (Data.size() <= hikey) - Data.resize(hikey + 1); - - TSizePairVector& sublist = Data[hikey]; - - for (auto& it : sublist) { - if (it.first == key) { - it.second = value; - - return; - } - } - - sublist.push_back(TSizePair(key, value)); - } - - bool Contains(size_t key) const { - return (Get(key) != 0); + // Minimize the trie. The result is equivalent to the original + // trie, except that it takes less space (and has marginally lower + // performance, because of eventual epsilon links). + // The algorithm is as follows: starting from the largest pieces, we find + // nodes that have identical continuations (Daciuk's right language), + // and repack the trie. Repacking is done in-place, so memory is less + // of an issue; however, it may take considerable time. + + // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout. + // Because of non-local structure and epsilon links, it won't work + // as you expect it to, and can destroy the trie in the making. + + namespace { + using TOffsetList = TVector<size_t>; + using TPieceIndex = THashMap<size_t, TOffsetList>; + + using TSizePair = std::pair<size_t, size_t>; + using TSizePairVector = TVector<TSizePair>; + using TSizePairVectorVector = TVector<TSizePairVector>; + + class TOffsetMap { + protected: + TSizePairVectorVector Data; + + public: + TOffsetMap() { + Data.reserve(0x10000); } - - size_t Get(size_t key) const { - size_t hikey = key & 0xFFFF; - - if (Data.size() <= hikey) - return 0; - - const TSizePairVector& sublist = Data[hikey]; - - for (const auto& it : sublist) { - if (it.first == key) - return it.second; - } - - return 0; - } - }; - - class TOffsetDeltas { - protected: - TSizePairVector Data; - - public: - void Add(size_t key, size_t value) { - if (Data.empty()) { - if (key == value) - return; // no offset - } else { - TSizePair last = Data.back(); - - if (key <= last.first) { - Cerr << "Trouble: elements to offset delta list added in wrong order" << Endl; - - return; - } - - if (last.first + value == last.second + key) - return; // same offset - } - - Data.push_back(TSizePair(key, value)); + + void Add(size_t key, size_t value) { + size_t hikey = key & 0xFFFF; + + if (Data.size() <= hikey) + Data.resize(hikey + 1); + + TSizePairVector& sublist = Data[hikey]; + + for (auto& it : sublist) { + if (it.first == key) { + it.second = value; + + return; + } + } + + sublist.push_back(TSizePair(key, value)); + } + + bool Contains(size_t key) const { + return (Get(key) != 0); + } + + size_t Get(size_t key) const { + size_t hikey = key & 0xFFFF; + + if (Data.size() <= hikey) + return 0; + + const TSizePairVector& sublist = Data[hikey]; + + for (const auto& it : sublist) { + if (it.first == key) + return it.second; + } + + return 0; + } + }; + + class TOffsetDeltas { + protected: + TSizePairVector Data; + + public: + void Add(size_t key, size_t value) { + if (Data.empty()) { + if (key == value) + return; // no offset + } else { + TSizePair last = Data.back(); + + if (key <= last.first) { + Cerr << "Trouble: elements to offset delta list added in wrong order" << Endl; + + return; + } + + if (last.first + value == last.second + key) + return; // same offset + } + + Data.push_back(TSizePair(key, value)); + } + + size_t Get(size_t key) const { + if (Data.empty()) + return key; // difference is zero; + + if (key < Data.front().first) + return key; + + // Binary search for the highest entry in the list that does not exceed the key + size_t from = 0; + size_t to = Data.size() - 1; + + while (from < to) { + size_t midpoint = (from + to + 1) / 2; + + if (key < Data[midpoint].first) + to = midpoint - 1; + else + from = midpoint; + } + + TSizePair entry = Data[from]; + + return key - entry.first + entry.second; + } + }; + + class TPieceComparer { + private: + const char* Data; + const size_t Length; + + public: + TPieceComparer(const char* buf, size_t len) + : Data(buf) + , Length(len) + { } - - size_t Get(size_t key) const { - if (Data.empty()) - return key; // difference is zero; - - if (key < Data.front().first) - return key; - - // Binary search for the highest entry in the list that does not exceed the key - size_t from = 0; - size_t to = Data.size() - 1; - - while (from < to) { - size_t midpoint = (from + to + 1) / 2; - - if (key < Data[midpoint].first) - to = midpoint - 1; - else - from = midpoint; - } - - TSizePair entry = Data[from]; - - return key - entry.first + entry.second; - } - }; - - class TPieceComparer { - private: - const char* Data; - const size_t Length; - - public: - TPieceComparer(const char* buf, size_t len) - : Data(buf) - , Length(len) - { - } - - bool operator()(size_t p1, const size_t p2) { - int res = memcmp(Data + p1, Data + p2, Length); - - if (res) - return (res > 0); - - return (p1 > p2); // the pieces are sorted in the reverse order of appearance - } - }; - - struct TBranchPoint { - TNode Node; - int Selector; - - public: - TBranchPoint() - : Selector(0) - { - } - - TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction) - : Node(data, offset, skipFunction) - , Selector(0) - { - } - - bool IsFinal() const { - return Node.IsFinal(); - } - - // NextNode returns child nodes, starting from the last node: Right, then Left, then Forward - size_t NextNode(const TOffsetMap& mergedNodes) { - while (Selector < 3) { - size_t nextOffset = 0; - - switch (++Selector) { - case 1: - if (Node.GetRightOffset()) - nextOffset = Node.GetRightOffset(); - break; - - case 2: - if (Node.GetLeftOffset()) - nextOffset = Node.GetLeftOffset(); - break; - - case 3: - if (Node.GetForwardOffset()) - nextOffset = Node.GetForwardOffset(); - break; - - default: - break; - } - - if (nextOffset && !mergedNodes.Contains(nextOffset)) - return nextOffset; - } - return 0; + + bool operator()(size_t p1, const size_t p2) { + int res = memcmp(Data + p1, Data + p2, Length); + + if (res) + return (res > 0); + + return (p1 > p2); // the pieces are sorted in the reverse order of appearance + } + }; + + struct TBranchPoint { + TNode Node; + int Selector; + + public: + TBranchPoint() + : Selector(0) + { } - }; - - class TMergingReverseNodeEnumerator: public TReverseNodeEnumerator { - private: - bool Fresh; - TOpaqueTrie Trie; - const TOffsetMap& MergeMap; - TVector<TBranchPoint> Trace; - TOffsetDeltas OffsetIndex; - - public: - TMergingReverseNodeEnumerator(const TOpaqueTrie& trie, const TOffsetMap& mergers) - : Fresh(true) - , Trie(trie) - , MergeMap(mergers) - { - } - - bool Move() override { - if (Fresh) { - Trace.push_back(TBranchPoint(Trie.Data, 0, Trie.SkipFunction)); - Fresh = false; - } else { - if (Trace.empty()) - return false; - - Trace.pop_back(); - - if (Trace.empty()) - return false; - } - - size_t nextnode = Trace.back().NextNode(MergeMap); - - while (nextnode) { - Trace.push_back(TBranchPoint(Trie.Data, nextnode, Trie.SkipFunction)); - nextnode = Trace.back().NextNode(MergeMap); - } - - return (!Trace.empty()); - } - - const TNode& Get() const { - return Trace.back().Node; - } - size_t GetLeafLength() const override { - return Get().GetLeafLength(); - } - - // Returns recalculated offset from the end of the current node - size_t PrepareOffset(size_t absoffset, size_t minilength) { - if (!absoffset) - return NPOS; - - if (MergeMap.Contains(absoffset)) - absoffset = MergeMap.Get(absoffset); - return minilength - OffsetIndex.Get(Trie.Length - absoffset); - } - - size_t RecreateNode(char* buffer, size_t resultLength) override { - TWriteableNode newNode(Get(), Trie.Data); - newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength); - newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength); - newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength); - - if (!buffer) - return newNode.Measure(); - - const size_t len = newNode.Pack(buffer); - OffsetIndex.Add(Trie.Length - Get().GetOffset(), resultLength + len); - - return len; - } - }; - + + TBranchPoint(const char* data, size_t offset, const ILeafSkipper& skipFunction) + : Node(data, offset, skipFunction) + , Selector(0) + { + } + + bool IsFinal() const { + return Node.IsFinal(); + } + + // NextNode returns child nodes, starting from the last node: Right, then Left, then Forward + size_t NextNode(const TOffsetMap& mergedNodes) { + while (Selector < 3) { + size_t nextOffset = 0; + + switch (++Selector) { + case 1: + if (Node.GetRightOffset()) + nextOffset = Node.GetRightOffset(); + break; + + case 2: + if (Node.GetLeftOffset()) + nextOffset = Node.GetLeftOffset(); + break; + + case 3: + if (Node.GetForwardOffset()) + nextOffset = Node.GetForwardOffset(); + break; + + default: + break; + } + + if (nextOffset && !mergedNodes.Contains(nextOffset)) + return nextOffset; + } + return 0; + } + }; + + class TMergingReverseNodeEnumerator: public TReverseNodeEnumerator { + private: + bool Fresh; + TOpaqueTrie Trie; + const TOffsetMap& MergeMap; + TVector<TBranchPoint> Trace; + TOffsetDeltas OffsetIndex; + + public: + TMergingReverseNodeEnumerator(const TOpaqueTrie& trie, const TOffsetMap& mergers) + : Fresh(true) + , Trie(trie) + , MergeMap(mergers) + { + } + + bool Move() override { + if (Fresh) { + Trace.push_back(TBranchPoint(Trie.Data, 0, Trie.SkipFunction)); + Fresh = false; + } else { + if (Trace.empty()) + return false; + + Trace.pop_back(); + + if (Trace.empty()) + return false; + } + + size_t nextnode = Trace.back().NextNode(MergeMap); + + while (nextnode) { + Trace.push_back(TBranchPoint(Trie.Data, nextnode, Trie.SkipFunction)); + nextnode = Trace.back().NextNode(MergeMap); + } + + return (!Trace.empty()); + } + + const TNode& Get() const { + return Trace.back().Node; + } + size_t GetLeafLength() const override { + return Get().GetLeafLength(); + } + + // Returns recalculated offset from the end of the current node + size_t PrepareOffset(size_t absoffset, size_t minilength) { + if (!absoffset) + return NPOS; + + if (MergeMap.Contains(absoffset)) + absoffset = MergeMap.Get(absoffset); + return minilength - OffsetIndex.Get(Trie.Length - absoffset); + } + + size_t RecreateNode(char* buffer, size_t resultLength) override { + TWriteableNode newNode(Get(), Trie.Data); + newNode.ForwardOffset = PrepareOffset(Get().GetForwardOffset(), resultLength); + newNode.LeftOffset = PrepareOffset(Get().GetLeftOffset(), resultLength); + newNode.RightOffset = PrepareOffset(Get().GetRightOffset(), resultLength); + + if (!buffer) + return newNode.Measure(); + + const size_t len = newNode.Pack(buffer); + OffsetIndex.Add(Trie.Length - Get().GetOffset(), resultLength + len); + + return len; + } + }; + } - static void AddPiece(TPieceIndex& index, size_t offset, size_t len) { - index[len].push_back(offset); - } - - static TOffsetMap FindEquivalentSubtries(const TOpaqueTrie& trie, bool verbose, size_t minMergeSize) { - // Tree nodes, arranged by span length. - // When all nodes of a given size are considered, they pop off the queue. - TPieceIndex subtries; - TOffsetMap merger; - // Start walking the trie from head. - AddPiece(subtries, 0, trie.Length); - - size_t counter = 0; - // Now consider all nodes with sizeable continuations - for (size_t curlen = trie.Length; curlen >= minMergeSize && !subtries.empty(); curlen--) { - TPieceIndex::iterator iit = subtries.find(curlen); - - if (iit == subtries.end()) - continue; // fast forward to the next available length value - - TOffsetList& batch = iit->second; - TPieceComparer comparer(trie.Data, curlen); - Sort(batch.begin(), batch.end(), comparer); - - TOffsetList::iterator it = batch.begin(); - while (it != batch.end()) { - if (verbose) - ShowProgress(++counter); - - size_t offset = *it; - - // Fill the array with the subnodes of the element - TNode node(trie.Data, offset, trie.SkipFunction); - size_t end = offset + curlen; - if (size_t rightOffset = node.GetRightOffset()) { - AddPiece(subtries, rightOffset, end - rightOffset); - end = rightOffset; - } - if (size_t leftOffset = node.GetLeftOffset()) { - AddPiece(subtries, leftOffset, end - leftOffset); - end = leftOffset; - } - if (size_t forwardOffset = node.GetForwardOffset()) { - AddPiece(subtries, forwardOffset, end - forwardOffset); - } - - while (++it != batch.end()) { - // Find next different; until then, just add the offsets to the list of merged nodes. - size_t nextoffset = *it; - - if (memcmp(trie.Data + offset, trie.Data + nextoffset, curlen)) - break; - - merger.Add(nextoffset, offset); - } - } - - subtries.erase(curlen); + static void AddPiece(TPieceIndex& index, size_t offset, size_t len) { + index[len].push_back(offset); + } + + static TOffsetMap FindEquivalentSubtries(const TOpaqueTrie& trie, bool verbose, size_t minMergeSize) { + // Tree nodes, arranged by span length. + // When all nodes of a given size are considered, they pop off the queue. + TPieceIndex subtries; + TOffsetMap merger; + // Start walking the trie from head. + AddPiece(subtries, 0, trie.Length); + + size_t counter = 0; + // Now consider all nodes with sizeable continuations + for (size_t curlen = trie.Length; curlen >= minMergeSize && !subtries.empty(); curlen--) { + TPieceIndex::iterator iit = subtries.find(curlen); + + if (iit == subtries.end()) + continue; // fast forward to the next available length value + + TOffsetList& batch = iit->second; + TPieceComparer comparer(trie.Data, curlen); + Sort(batch.begin(), batch.end(), comparer); + + TOffsetList::iterator it = batch.begin(); + while (it != batch.end()) { + if (verbose) + ShowProgress(++counter); + + size_t offset = *it; + + // Fill the array with the subnodes of the element + TNode node(trie.Data, offset, trie.SkipFunction); + size_t end = offset + curlen; + if (size_t rightOffset = node.GetRightOffset()) { + AddPiece(subtries, rightOffset, end - rightOffset); + end = rightOffset; + } + if (size_t leftOffset = node.GetLeftOffset()) { + AddPiece(subtries, leftOffset, end - leftOffset); + end = leftOffset; + } + if (size_t forwardOffset = node.GetForwardOffset()) { + AddPiece(subtries, forwardOffset, end - forwardOffset); + } + + while (++it != batch.end()) { + // Find next different; until then, just add the offsets to the list of merged nodes. + size_t nextoffset = *it; + + if (memcmp(trie.Data + offset, trie.Data + nextoffset, curlen)) + break; + + merger.Add(nextoffset, offset); + } + } + + subtries.erase(curlen); + } + if (verbose) { + Cerr << counter << Endl; } - if (verbose) { - Cerr << counter << Endl; - } - return merger; + return merger; } - - size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode) { - if (!trie.Data || !trie.Length) { - return 0; - } - - TOffsetMap merger = FindEquivalentSubtries(trie, verbose, minMergeSize); - TMergingReverseNodeEnumerator enumerator(trie, merger); - - if (mode == MM_DEFAULT) - return WriteTrieBackwards(os, enumerator, verbose); - else - return WriteTrieBackwardsNoAlloc(os, enumerator, trie, mode); + + size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode) { + if (!trie.Data || !trie.Length) { + return 0; + } + + TOffsetMap merger = FindEquivalentSubtries(trie, verbose, minMergeSize); + TMergingReverseNodeEnumerator enumerator(trie, merger); + + if (mode == MM_DEFAULT) + return WriteTrieBackwards(os, enumerator, verbose); + else + return WriteTrieBackwardsNoAlloc(os, enumerator, trie, mode); } } diff --git a/library/cpp/containers/comptrie/minimize.h b/library/cpp/containers/comptrie/minimize.h index 2464571437..baaa431d04 100644 --- a/library/cpp/containers/comptrie/minimize.h +++ b/library/cpp/containers/comptrie/minimize.h @@ -6,24 +6,24 @@ class IOutputStream; namespace NCompactTrie { - size_t MeasureOffset(size_t offset); + size_t MeasureOffset(size_t offset); - enum EMinimizeMode { - MM_DEFAULT, // alollocate new memory for minimized tree - MM_NOALLOC, // minimize tree in the same buffer - MM_INPLACE // do not write tree to the stream, but move to the buffer beginning - }; + enum EMinimizeMode { + MM_DEFAULT, // alollocate new memory for minimized tree + MM_NOALLOC, // minimize tree in the same buffer + MM_INPLACE // do not write tree to the stream, but move to the buffer beginning + }; - // Return value: size of the minimized trie. - size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode); + // Return value: size of the minimized trie. + size_t RawCompactTrieMinimizeImpl(IOutputStream& os, TOpaqueTrie& trie, bool verbose, size_t minMergeSize, EMinimizeMode mode); - // Return value: size of the minimized trie. - template <class TPacker> - size_t CompactTrieMinimizeImpl(IOutputStream& os, const char* data, size_t datalength, bool verbose, const TPacker* packer, EMinimizeMode mode) { - TPackerLeafSkipper<TPacker> skipper(packer); - size_t minmerge = MeasureOffset(datalength); - TOpaqueTrie trie(data, datalength, skipper); - return RawCompactTrieMinimizeImpl(os, trie, verbose, minmerge, mode); - } + // Return value: size of the minimized trie. + template <class TPacker> + size_t CompactTrieMinimizeImpl(IOutputStream& os, const char* data, size_t datalength, bool verbose, const TPacker* packer, EMinimizeMode mode) { + TPackerLeafSkipper<TPacker> skipper(packer); + size_t minmerge = MeasureOffset(datalength); + TOpaqueTrie trie(data, datalength, skipper); + return RawCompactTrieMinimizeImpl(os, trie, verbose, minmerge, mode); + } } diff --git a/library/cpp/containers/comptrie/node.cpp b/library/cpp/containers/comptrie/node.cpp index cd98e6b4d6..5fd22f15ec 100644 --- a/library/cpp/containers/comptrie/node.cpp +++ b/library/cpp/containers/comptrie/node.cpp @@ -6,74 +6,74 @@ #include <util/generic/yexception.h> namespace NCompactTrie { - TNode::TNode() - : Offset(0) - , LeafLength(0) - , CoreLength(0) - , Label(0) - { - for (auto& offset : Offsets) { - offset = 0; - } + TNode::TNode() + : Offset(0) + , LeafLength(0) + , CoreLength(0) + , Label(0) + { + for (auto& offset : Offsets) { + offset = 0; + } } - // We believe that epsilon links are found only on the forward-position and that afer jumping an epsilon link you come to an ordinary node. - - TNode::TNode(const char* data, size_t offset, const ILeafSkipper& skipFunction) - : Offset(offset) - , LeafLength(0) - , CoreLength(0) - , Label(0) - { - for (auto& anOffset : Offsets) { - anOffset = 0; - } - if (!data) - return; // empty constructor - - const char* datapos = data + offset; - char flags = *(datapos++); - Y_ASSERT(!IsEpsilonLink(flags)); - Label = *(datapos++); + // We believe that epsilon links are found only on the forward-position and that afer jumping an epsilon link you come to an ordinary node. - size_t leftsize = LeftOffsetLen(flags); - size_t& leftOffset = Offsets[D_LEFT]; - leftOffset = UnpackOffset(datapos, leftsize); - if (leftOffset) { - leftOffset += Offset; - } - datapos += leftsize; + TNode::TNode(const char* data, size_t offset, const ILeafSkipper& skipFunction) + : Offset(offset) + , LeafLength(0) + , CoreLength(0) + , Label(0) + { + for (auto& anOffset : Offsets) { + anOffset = 0; + } + if (!data) + return; // empty constructor - size_t rightsize = RightOffsetLen(flags); - size_t& rightOffset = Offsets[D_RIGHT]; - rightOffset = UnpackOffset(datapos, rightsize); - if (rightOffset) { - rightOffset += Offset; - } - datapos += rightsize; - - if (flags & MT_FINAL) { - Offsets[D_FINAL] = datapos - data; - LeafLength = skipFunction.SkipLeaf(datapos); - } + const char* datapos = data + offset; + char flags = *(datapos++); + Y_ASSERT(!IsEpsilonLink(flags)); + Label = *(datapos++); - CoreLength = 2 + leftsize + rightsize + LeafLength; - if (flags & MT_NEXT) { - size_t& forwardOffset = Offsets[D_NEXT]; - forwardOffset = Offset + CoreLength; - // There might be an epsilon link at the forward position. - // If so, set the ForwardOffset to the value that points to the link's end. - const char* forwardPos = data + forwardOffset; - const char forwardFlags = *forwardPos; - if (IsEpsilonLink(forwardFlags)) { - // Jump through the epsilon link. - size_t epsilonOffset = UnpackOffset(forwardPos + 1, forwardFlags & MT_SIZEMASK); - if (!epsilonOffset) { - ythrow yexception() << "Corrupted epsilon link"; - } - forwardOffset += epsilonOffset; + size_t leftsize = LeftOffsetLen(flags); + size_t& leftOffset = Offsets[D_LEFT]; + leftOffset = UnpackOffset(datapos, leftsize); + if (leftOffset) { + leftOffset += Offset; + } + datapos += leftsize; + + size_t rightsize = RightOffsetLen(flags); + size_t& rightOffset = Offsets[D_RIGHT]; + rightOffset = UnpackOffset(datapos, rightsize); + if (rightOffset) { + rightOffset += Offset; + } + datapos += rightsize; + + if (flags & MT_FINAL) { + Offsets[D_FINAL] = datapos - data; + LeafLength = skipFunction.SkipLeaf(datapos); + } + + CoreLength = 2 + leftsize + rightsize + LeafLength; + if (flags & MT_NEXT) { + size_t& forwardOffset = Offsets[D_NEXT]; + forwardOffset = Offset + CoreLength; + // There might be an epsilon link at the forward position. + // If so, set the ForwardOffset to the value that points to the link's end. + const char* forwardPos = data + forwardOffset; + const char forwardFlags = *forwardPos; + if (IsEpsilonLink(forwardFlags)) { + // Jump through the epsilon link. + size_t epsilonOffset = UnpackOffset(forwardPos + 1, forwardFlags & MT_SIZEMASK); + if (!epsilonOffset) { + ythrow yexception() << "Corrupted epsilon link"; + } + forwardOffset += epsilonOffset; } } } - + } diff --git a/library/cpp/containers/comptrie/node.h b/library/cpp/containers/comptrie/node.h index 9a936998b5..d6f4317db0 100644 --- a/library/cpp/containers/comptrie/node.h +++ b/library/cpp/containers/comptrie/node.h @@ -3,78 +3,78 @@ #include <cstddef> namespace NCompactTrie { - class ILeafSkipper; + class ILeafSkipper; - enum TDirection { - D_LEFT, - D_FINAL, - D_NEXT, - D_RIGHT, - D_MAX - }; - - inline TDirection& operator++(TDirection& direction) { - direction = static_cast<TDirection>(direction + 1); - return direction; - } + enum TDirection { + D_LEFT, + D_FINAL, + D_NEXT, + D_RIGHT, + D_MAX + }; - inline TDirection& operator--(TDirection& direction) { - direction = static_cast<TDirection>(direction - 1); - return direction; - } + inline TDirection& operator++(TDirection& direction) { + direction = static_cast<TDirection>(direction + 1); + return direction; + } - class TNode { - public: - TNode(); - // Processes epsilon links and sets ForwardOffset to correct value. Assumes an epsilon link doesn't point to an epsilon link. - TNode(const char* data, size_t offset, const ILeafSkipper& skipFunction); + inline TDirection& operator--(TDirection& direction) { + direction = static_cast<TDirection>(direction - 1); + return direction; + } - size_t GetOffset() const { - return Offset; - } + class TNode { + public: + TNode(); + // Processes epsilon links and sets ForwardOffset to correct value. Assumes an epsilon link doesn't point to an epsilon link. + TNode(const char* data, size_t offset, const ILeafSkipper& skipFunction); - size_t GetLeafOffset() const { - return Offsets[D_FINAL]; - } - size_t GetLeafLength() const { - return LeafLength; - } - size_t GetCoreLength() const { - return CoreLength; - } + size_t GetOffset() const { + return Offset; + } - size_t GetOffsetByDirection(TDirection direction) const { - return Offsets[direction]; - } + size_t GetLeafOffset() const { + return Offsets[D_FINAL]; + } + size_t GetLeafLength() const { + return LeafLength; + } + size_t GetCoreLength() const { + return CoreLength; + } - size_t GetForwardOffset() const { - return Offsets[D_NEXT]; - } - size_t GetLeftOffset() const { - return Offsets[D_LEFT]; - } - size_t GetRightOffset() const { - return Offsets[D_RIGHT]; - } - char GetLabel() const { - return Label; - } + size_t GetOffsetByDirection(TDirection direction) const { + return Offsets[direction]; + } - bool IsFinal() const { - return GetLeafOffset() != 0; - } + size_t GetForwardOffset() const { + return Offsets[D_NEXT]; + } + size_t GetLeftOffset() const { + return Offsets[D_LEFT]; + } + size_t GetRightOffset() const { + return Offsets[D_RIGHT]; + } + char GetLabel() const { + return Label; + } - bool HasEpsilonLinkForward() const { - return GetForwardOffset() > Offset + CoreLength; - } - - private: - size_t Offsets[D_MAX]; - size_t Offset; - size_t LeafLength; - size_t CoreLength; + bool IsFinal() const { + return GetLeafOffset() != 0; + } - char Label; - }; + bool HasEpsilonLinkForward() const { + return GetForwardOffset() > Offset + CoreLength; + } -} + private: + size_t Offsets[D_MAX]; + size_t Offset; + size_t LeafLength; + size_t CoreLength; + + char Label; + }; + +} diff --git a/library/cpp/containers/comptrie/opaque_trie_iterator.cpp b/library/cpp/containers/comptrie/opaque_trie_iterator.cpp index d1916358aa..5fd3914be6 100644 --- a/library/cpp/containers/comptrie/opaque_trie_iterator.cpp +++ b/library/cpp/containers/comptrie/opaque_trie_iterator.cpp @@ -3,229 +3,229 @@ #include "node.h" namespace NCompactTrie { - TOpaqueTrieIterator::TOpaqueTrieIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, - size_t maxKeyLength) - : Trie(trie) - , EmptyValue(emptyValue) - , AtEmptyValue(emptyValue && !atend) - , MaxKeyLength(maxKeyLength) - { - if (!AtEmptyValue && !atend) - Forward(); - } - - bool TOpaqueTrieIterator::operator==(const TOpaqueTrieIterator& rhs) const { - return (Trie == rhs.Trie && - Forks == rhs.Forks && - EmptyValue == rhs.EmptyValue && - AtEmptyValue == rhs.AtEmptyValue && - MaxKeyLength == rhs.MaxKeyLength); - } - - bool TOpaqueTrieIterator::HasMaxKeyLength() const { - return MaxKeyLength != size_t(-1) && MeasureNarrowKey() == MaxKeyLength; - } - - bool TOpaqueTrieIterator::Forward() { - if (AtEmptyValue) { - AtEmptyValue = false; - bool res = Forward(); // TODO delete this after format change - if (res && MeasureNarrowKey() != 0) { - return res; // there was not "\0" key - } - // otherwise we are skipping "\0" key - } - - if (!Trie.Length) - return false; - - if (Forks.Empty()) { - TFork fork(Trie.Data, 0, Trie.Length, Trie.SkipFunction); - Forks.Push(fork); - } else { - TFork* topFork = &Forks.Top(); - while (!topFork->NextDirection()) { - if (topFork->Node.GetOffset() >= Trie.Length) - return false; - Forks.Pop(); - if (Forks.Empty()) - return false; - topFork = &Forks.Top(); - } - } - - Y_ASSERT(!Forks.Empty()); - while (Forks.Top().CurrentDirection != D_FINAL && !HasMaxKeyLength()) { - TFork nextFork = Forks.Top().NextFork(Trie.SkipFunction); - Forks.Push(nextFork); - } - TFork& top = Forks.Top(); - static_assert(D_FINAL < D_NEXT, "relative order of NEXT and FINAL directions has changed"); - if (HasMaxKeyLength() && top.CurrentDirection == D_FINAL && top.HasDirection(D_NEXT)) { - top.NextDirection(); - } - return true; - } - - bool TOpaqueTrieIterator::Backward() { - if (AtEmptyValue) + TOpaqueTrieIterator::TOpaqueTrieIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, + size_t maxKeyLength) + : Trie(trie) + , EmptyValue(emptyValue) + , AtEmptyValue(emptyValue && !atend) + , MaxKeyLength(maxKeyLength) + { + if (!AtEmptyValue && !atend) + Forward(); + } + + bool TOpaqueTrieIterator::operator==(const TOpaqueTrieIterator& rhs) const { + return (Trie == rhs.Trie && + Forks == rhs.Forks && + EmptyValue == rhs.EmptyValue && + AtEmptyValue == rhs.AtEmptyValue && + MaxKeyLength == rhs.MaxKeyLength); + } + + bool TOpaqueTrieIterator::HasMaxKeyLength() const { + return MaxKeyLength != size_t(-1) && MeasureNarrowKey() == MaxKeyLength; + } + + bool TOpaqueTrieIterator::Forward() { + if (AtEmptyValue) { + AtEmptyValue = false; + bool res = Forward(); // TODO delete this after format change + if (res && MeasureNarrowKey() != 0) { + return res; // there was not "\0" key + } + // otherwise we are skipping "\0" key + } + + if (!Trie.Length) return false; - if (!Trie.Length) { - if (EmptyValue) { - // A trie that has only the empty value; - // we are not at the empty value, so move to it. + if (Forks.Empty()) { + TFork fork(Trie.Data, 0, Trie.Length, Trie.SkipFunction); + Forks.Push(fork); + } else { + TFork* topFork = &Forks.Top(); + while (!topFork->NextDirection()) { + if (topFork->Node.GetOffset() >= Trie.Length) + return false; + Forks.Pop(); + if (Forks.Empty()) + return false; + topFork = &Forks.Top(); + } + } + + Y_ASSERT(!Forks.Empty()); + while (Forks.Top().CurrentDirection != D_FINAL && !HasMaxKeyLength()) { + TFork nextFork = Forks.Top().NextFork(Trie.SkipFunction); + Forks.Push(nextFork); + } + TFork& top = Forks.Top(); + static_assert(D_FINAL < D_NEXT, "relative order of NEXT and FINAL directions has changed"); + if (HasMaxKeyLength() && top.CurrentDirection == D_FINAL && top.HasDirection(D_NEXT)) { + top.NextDirection(); + } + return true; + } + + bool TOpaqueTrieIterator::Backward() { + if (AtEmptyValue) + return false; + + if (!Trie.Length) { + if (EmptyValue) { + // A trie that has only the empty value; + // we are not at the empty value, so move to it. AtEmptyValue = true; return true; - } else { - // Empty trie. - return false; + } else { + // Empty trie. + return false; + } + } + + if (Forks.Empty()) { + TFork fork(Trie.Data, 0, Trie.Length, Trie.SkipFunction); + fork.LastDirection(); + Forks.Push(fork); + } else { + TFork* topFork = &Forks.Top(); + while (!topFork->PrevDirection()) { + if (topFork->Node.GetOffset() >= Trie.Length) + return false; + Forks.Pop(); + if (!Forks.Empty()) { + topFork = &Forks.Top(); + } else { + // When there are no more forks, + // we have to iterate over the empty value. + if (!EmptyValue) + return false; + AtEmptyValue = true; + return true; + } } } - if (Forks.Empty()) { - TFork fork(Trie.Data, 0, Trie.Length, Trie.SkipFunction); - fork.LastDirection(); - Forks.Push(fork); - } else { - TFork* topFork = &Forks.Top(); - while (!topFork->PrevDirection()) { - if (topFork->Node.GetOffset() >= Trie.Length) - return false; - Forks.Pop(); - if (!Forks.Empty()) { - topFork = &Forks.Top(); - } else { - // When there are no more forks, - // we have to iterate over the empty value. - if (!EmptyValue) - return false; - AtEmptyValue = true; - return true; - } - } - } - - Y_ASSERT(!Forks.Empty()); - while (Forks.Top().CurrentDirection != D_FINAL && !HasMaxKeyLength()) { - TFork nextFork = Forks.Top().NextFork(Trie.SkipFunction); - nextFork.LastDirection(); - Forks.Push(nextFork); - } - TFork& top = Forks.Top(); - static_assert(D_FINAL < D_NEXT, "relative order of NEXT and FINAL directions has changed"); - if (HasMaxKeyLength() && top.CurrentDirection == D_NEXT && top.HasDirection(D_FINAL)) { - top.PrevDirection(); - } - if (MeasureNarrowKey() == 0) { - // This is the '\0' key, skip it and get to the EmptyValue. - AtEmptyValue = true; - Forks.Clear(); - } - return true; - } - - const char* TOpaqueTrieIterator::GetValuePtr() const { - if (!Forks.Empty()) { - const TFork& lastFork = Forks.Top(); - Y_ASSERT(lastFork.Node.IsFinal() && lastFork.CurrentDirection == D_FINAL); - return Trie.Data + lastFork.GetValueOffset(); - } - Y_ASSERT(AtEmptyValue); - return EmptyValue; - } - - //------------------------------------------------------------------------- - - TString TForkStack::GetKey() const { - if (HasEmptyKey()) { - return TString(); - } - - TString result(Key); - if (TopHasLabelInKey()) { - result.append(Top().GetLabel()); - } - return result; - } - - bool TForkStack::HasEmptyKey() const { - // Special case: if we get a single zero label, treat it as an empty key - // TODO delete this after format change - if (TopHasLabelInKey()) { - return Key.size() == 0 && Top().GetLabel() == '\0'; - } else { - return Key.size() == 1 && Key[0] == '\0'; - } - } - - size_t TForkStack::MeasureKey() const { - size_t result = Key.size() + (TopHasLabelInKey() ? 1 : 0); - if (result == 1 && HasEmptyKey()) { - return 0; - } - return result; - } - - //------------------------------------------------------------------------- - - TFork::TFork(const char* data, size_t offset, size_t limit, const ILeafSkipper& skipper) - : Node(data, offset, skipper) - , Data(data) - , Limit(limit) - , CurrentDirection(TDirection(0)) - { + Y_ASSERT(!Forks.Empty()); + while (Forks.Top().CurrentDirection != D_FINAL && !HasMaxKeyLength()) { + TFork nextFork = Forks.Top().NextFork(Trie.SkipFunction); + nextFork.LastDirection(); + Forks.Push(nextFork); + } + TFork& top = Forks.Top(); + static_assert(D_FINAL < D_NEXT, "relative order of NEXT and FINAL directions has changed"); + if (HasMaxKeyLength() && top.CurrentDirection == D_NEXT && top.HasDirection(D_FINAL)) { + top.PrevDirection(); + } + if (MeasureNarrowKey() == 0) { + // This is the '\0' key, skip it and get to the EmptyValue. + AtEmptyValue = true; + Forks.Clear(); + } + return true; + } + + const char* TOpaqueTrieIterator::GetValuePtr() const { + if (!Forks.Empty()) { + const TFork& lastFork = Forks.Top(); + Y_ASSERT(lastFork.Node.IsFinal() && lastFork.CurrentDirection == D_FINAL); + return Trie.Data + lastFork.GetValueOffset(); + } + Y_ASSERT(AtEmptyValue); + return EmptyValue; + } + + //------------------------------------------------------------------------- + + TString TForkStack::GetKey() const { + if (HasEmptyKey()) { + return TString(); + } + + TString result(Key); + if (TopHasLabelInKey()) { + result.append(Top().GetLabel()); + } + return result; + } + + bool TForkStack::HasEmptyKey() const { + // Special case: if we get a single zero label, treat it as an empty key + // TODO delete this after format change + if (TopHasLabelInKey()) { + return Key.size() == 0 && Top().GetLabel() == '\0'; + } else { + return Key.size() == 1 && Key[0] == '\0'; + } + } + + size_t TForkStack::MeasureKey() const { + size_t result = Key.size() + (TopHasLabelInKey() ? 1 : 0); + if (result == 1 && HasEmptyKey()) { + return 0; + } + return result; + } + + //------------------------------------------------------------------------- + + TFork::TFork(const char* data, size_t offset, size_t limit, const ILeafSkipper& skipper) + : Node(data, offset, skipper) + , Data(data) + , Limit(limit) + , CurrentDirection(TDirection(0)) + { #if COMPTRIE_DATA_CHECK - if (Node.GetOffset() >= Limit - 1) - ythrow yexception() << "gone beyond the limit, data is corrupted"; + if (Node.GetOffset() >= Limit - 1) + ythrow yexception() << "gone beyond the limit, data is corrupted"; #endif - while (CurrentDirection < D_MAX && !HasDirection(CurrentDirection)) { - ++CurrentDirection; - } - } - - bool TFork::operator==(const TFork& rhs) const { - return (Data == rhs.Data && - Node.GetOffset() == rhs.Node.GetOffset() && - CurrentDirection == rhs.CurrentDirection); - } - - inline bool TFork::NextDirection() { - do { - ++CurrentDirection; - } while (CurrentDirection < D_MAX && !HasDirection(CurrentDirection)); - return CurrentDirection < D_MAX; - } - - inline bool TFork::PrevDirection() { - if (CurrentDirection == TDirection(0)) { - return false; - } - do { - --CurrentDirection; - } while (CurrentDirection > 0 && !HasDirection(CurrentDirection)); - return HasDirection(CurrentDirection); - } - - void TFork::LastDirection() { - CurrentDirection = D_MAX; - PrevDirection(); - } - - bool TFork::SetDirection(TDirection direction) { - if (!HasDirection(direction)) { - return false; - } - CurrentDirection = direction; - return true; - } - - char TFork::GetLabel() const { - return Node.GetLabel(); - } - - size_t TFork::GetValueOffset() const { - return Node.GetLeafOffset(); + while (CurrentDirection < D_MAX && !HasDirection(CurrentDirection)) { + ++CurrentDirection; + } + } + + bool TFork::operator==(const TFork& rhs) const { + return (Data == rhs.Data && + Node.GetOffset() == rhs.Node.GetOffset() && + CurrentDirection == rhs.CurrentDirection); + } + + inline bool TFork::NextDirection() { + do { + ++CurrentDirection; + } while (CurrentDirection < D_MAX && !HasDirection(CurrentDirection)); + return CurrentDirection < D_MAX; + } + + inline bool TFork::PrevDirection() { + if (CurrentDirection == TDirection(0)) { + return false; + } + do { + --CurrentDirection; + } while (CurrentDirection > 0 && !HasDirection(CurrentDirection)); + return HasDirection(CurrentDirection); + } + + void TFork::LastDirection() { + CurrentDirection = D_MAX; + PrevDirection(); + } + + bool TFork::SetDirection(TDirection direction) { + if (!HasDirection(direction)) { + return false; + } + CurrentDirection = direction; + return true; + } + + char TFork::GetLabel() const { + return Node.GetLabel(); + } + + size_t TFork::GetValueOffset() const { + return Node.GetLeafOffset(); } } diff --git a/library/cpp/containers/comptrie/opaque_trie_iterator.h b/library/cpp/containers/comptrie/opaque_trie_iterator.h index 8e615d7e70..195da3c191 100644 --- a/library/cpp/containers/comptrie/opaque_trie_iterator.h +++ b/library/cpp/containers/comptrie/opaque_trie_iterator.h @@ -9,258 +9,258 @@ #include <util/generic/yexception.h> namespace NCompactTrie { - class ILeafSkipper; - - class TFork { // Auxiliary class for a branching point in the iterator - public: - TNode Node; - const char* Data; - size_t Limit; // valid data is in range [Data + Node.GetOffset(), Data + Limit) - TDirection CurrentDirection; - - public: - TFork(const char* data, size_t offset, size_t limit, const ILeafSkipper& skipper); - - bool operator==(const TFork& rhs) const; - - bool HasLabelInKey() const { - return CurrentDirection == D_NEXT || CurrentDirection == D_FINAL; - } - - bool NextDirection(); - bool PrevDirection(); - void LastDirection(); - - bool HasDirection(TDirection direction) const { - return Node.GetOffsetByDirection(direction); - } - // If the fork doesn't have the specified direction, - // leaves the direction intact and returns false. - // Otherwise returns true. - bool SetDirection(TDirection direction); - TFork NextFork(const ILeafSkipper& skipper) const; - - char GetLabel() const; - size_t GetValueOffset() const; - }; - - inline TFork TFork::NextFork(const ILeafSkipper& skipper) const { - Y_ASSERT(CurrentDirection != D_FINAL); - size_t offset = Node.GetOffsetByDirection(CurrentDirection); - return TFork(Data, offset, Limit, skipper); + class ILeafSkipper; + + class TFork { // Auxiliary class for a branching point in the iterator + public: + TNode Node; + const char* Data; + size_t Limit; // valid data is in range [Data + Node.GetOffset(), Data + Limit) + TDirection CurrentDirection; + + public: + TFork(const char* data, size_t offset, size_t limit, const ILeafSkipper& skipper); + + bool operator==(const TFork& rhs) const; + + bool HasLabelInKey() const { + return CurrentDirection == D_NEXT || CurrentDirection == D_FINAL; + } + + bool NextDirection(); + bool PrevDirection(); + void LastDirection(); + + bool HasDirection(TDirection direction) const { + return Node.GetOffsetByDirection(direction); + } + // If the fork doesn't have the specified direction, + // leaves the direction intact and returns false. + // Otherwise returns true. + bool SetDirection(TDirection direction); + TFork NextFork(const ILeafSkipper& skipper) const; + + char GetLabel() const; + size_t GetValueOffset() const; + }; + + inline TFork TFork::NextFork(const ILeafSkipper& skipper) const { + Y_ASSERT(CurrentDirection != D_FINAL); + size_t offset = Node.GetOffsetByDirection(CurrentDirection); + return TFork(Data, offset, Limit, skipper); } - //------------------------------------------------------------------------------------------------ - class TForkStack { - public: - void Push(const TFork& fork) { - if (TopHasLabelInKey()) { - Key.push_back(Top().GetLabel()); - } - Forks.push_back(fork); - } - - void Pop() { - Forks.pop_back(); - if (TopHasLabelInKey()) { - Key.pop_back(); - } - } - - TFork& Top() { - return Forks.back(); + //------------------------------------------------------------------------------------------------ + class TForkStack { + public: + void Push(const TFork& fork) { + if (TopHasLabelInKey()) { + Key.push_back(Top().GetLabel()); + } + Forks.push_back(fork); + } + + void Pop() { + Forks.pop_back(); + if (TopHasLabelInKey()) { + Key.pop_back(); + } + } + + TFork& Top() { + return Forks.back(); + } + const TFork& Top() const { + return Forks.back(); + } + + bool Empty() const { + return Forks.empty(); + } + + void Clear() { + Forks.clear(); + Key.clear(); + } + + bool operator==(const TForkStack& other) const { + return Forks == other.Forks; + } + bool operator!=(const TForkStack& other) const { + return !(*this == other); + } + + TString GetKey() const; + size_t MeasureKey() const; + + private: + TVector<TFork> Forks; + TString Key; + + private: + bool TopHasLabelInKey() const { + return !Empty() && Top().HasLabelInKey(); + } + bool HasEmptyKey() const; + }; + + //------------------------------------------------------------------------------------------------ + + template <class TSymbol> + struct TConvertRawKey { + typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey; + static TKey Get(const TString& rawkey) { + TKey result; + const size_t sz = rawkey.size(); + result.reserve(sz / sizeof(TSymbol)); + for (size_t i = 0; i < sz; i += sizeof(TSymbol)) { + TSymbol sym = 0; + for (size_t j = 0; j < sizeof(TSymbol); j++) { + if (sizeof(TSymbol) <= 1) + sym = 0; + else + sym <<= 8; + if (i + j < sz) + sym |= TSymbol(0x00FF & rawkey[i + j]); + } + result.push_back(sym); + } + return result; + } + + static size_t Size(size_t rawsize) { + return rawsize / sizeof(TSymbol); + } + }; + + template <> + struct TConvertRawKey<char> { + static TString Get(const TString& rawkey) { + return rawkey; } - const TFork& Top() const { - return Forks.back(); - } - bool Empty() const { - return Forks.empty(); - } + static size_t Size(size_t rawsize) { + return rawsize; + } + }; + + //------------------------------------------------------------------------------------------------ + class TOpaqueTrieIterator { // Iterator stuff. Stores a stack of visited forks. + public: + TOpaqueTrieIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, + size_t maxKeyLength = size_t(-1)); + + bool operator==(const TOpaqueTrieIterator& rhs) const; + bool operator!=(const TOpaqueTrieIterator& rhs) const { + return !(*this == rhs); + } + + bool Forward(); + bool Backward(); + + template <class TSymbol> + bool UpperBound(const typename TCompactTrieKeySelector<TSymbol>::TKeyBuf& key); // True if matched exactly. - void Clear() { - Forks.clear(); - Key.clear(); + template <class TSymbol> + typename TCompactTrieKeySelector<TSymbol>::TKey GetKey() const { + return TConvertRawKey<TSymbol>::Get(GetNarrowKey()); } - bool operator==(const TForkStack& other) const { - return Forks == other.Forks; - } - bool operator!=(const TForkStack& other) const { - return !(*this == other); - } - - TString GetKey() const; - size_t MeasureKey() const; - - private: - TVector<TFork> Forks; - TString Key; - - private: - bool TopHasLabelInKey() const { - return !Empty() && Top().HasLabelInKey(); - } - bool HasEmptyKey() const; - }; - - //------------------------------------------------------------------------------------------------ - - template <class TSymbol> - struct TConvertRawKey { - typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey; - static TKey Get(const TString& rawkey) { - TKey result; - const size_t sz = rawkey.size(); - result.reserve(sz / sizeof(TSymbol)); - for (size_t i = 0; i < sz; i += sizeof(TSymbol)) { - TSymbol sym = 0; - for (size_t j = 0; j < sizeof(TSymbol); j++) { - if (sizeof(TSymbol) <= 1) - sym = 0; - else - sym <<= 8; - if (i + j < sz) - sym |= TSymbol(0x00FF & rawkey[i + j]); - } - result.push_back(sym); - } - return result; - } - - static size_t Size(size_t rawsize) { - return rawsize / sizeof(TSymbol); - } - }; - - template <> - struct TConvertRawKey<char> { - static TString Get(const TString& rawkey) { - return rawkey; - } - - static size_t Size(size_t rawsize) { - return rawsize; + template <class TSymbol> + size_t MeasureKey() const { + return TConvertRawKey<TSymbol>::Size(MeasureNarrowKey()); } - }; - - //------------------------------------------------------------------------------------------------ - class TOpaqueTrieIterator { // Iterator stuff. Stores a stack of visited forks. - public: - TOpaqueTrieIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, - size_t maxKeyLength = size_t(-1)); - - bool operator==(const TOpaqueTrieIterator& rhs) const; - bool operator!=(const TOpaqueTrieIterator& rhs) const { - return !(*this == rhs); - } - - bool Forward(); - bool Backward(); - - template <class TSymbol> - bool UpperBound(const typename TCompactTrieKeySelector<TSymbol>::TKeyBuf& key); // True if matched exactly. - - template <class TSymbol> - typename TCompactTrieKeySelector<TSymbol>::TKey GetKey() const { - return TConvertRawKey<TSymbol>::Get(GetNarrowKey()); - } - - template <class TSymbol> - size_t MeasureKey() const { - return TConvertRawKey<TSymbol>::Size(MeasureNarrowKey()); - } - - TString GetNarrowKey() const { - return Forks.GetKey(); - } - size_t MeasureNarrowKey() const { - return Forks.MeasureKey(); - } - - const char* GetValuePtr() const; // 0 if none - const TNode& GetNode() const { // Could be called for non-empty key and not AtEnd. - return Forks.Top().Node; - } - const TOpaqueTrie& GetTrie() const { - return Trie; - } - - private: - TOpaqueTrie Trie; - TForkStack Forks; - const char* const EmptyValue; - bool AtEmptyValue; - const size_t MaxKeyLength; - - private: - bool HasMaxKeyLength() const; - - template <class TSymbol> - int LongestPrefix(const typename TCompactTrieKeySelector<TSymbol>::TKeyBuf& key); // Used in UpperBound. - }; + + TString GetNarrowKey() const { + return Forks.GetKey(); + } + size_t MeasureNarrowKey() const { + return Forks.MeasureKey(); + } + + const char* GetValuePtr() const; // 0 if none + const TNode& GetNode() const { // Could be called for non-empty key and not AtEnd. + return Forks.Top().Node; + } + const TOpaqueTrie& GetTrie() const { + return Trie; + } + + private: + TOpaqueTrie Trie; + TForkStack Forks; + const char* const EmptyValue; + bool AtEmptyValue; + const size_t MaxKeyLength; + + private: + bool HasMaxKeyLength() const; + + template <class TSymbol> + int LongestPrefix(const typename TCompactTrieKeySelector<TSymbol>::TKeyBuf& key); // Used in UpperBound. + }; template <class TSymbol> - int TOpaqueTrieIterator::LongestPrefix(const typename TCompactTrieKeySelector<TSymbol>::TKeyBuf& key) { - Forks.Clear(); - TFork next(Trie.Data, 0, Trie.Length, Trie.SkipFunction); - for (size_t i = 0; i < key.size(); i++) { - TSymbol symbol = key[i]; - const bool isLastSymbol = (i + 1 == key.size()); - for (i64 shift = (i64)NCompactTrie::ExtraBits<TSymbol>(); shift >= 0; shift -= 8) { - const unsigned char label = (unsigned char)(symbol >> shift); - const bool isLastByte = (isLastSymbol && shift == 0); - do { - Forks.Push(next); - TFork& top = Forks.Top(); - if (label < (unsigned char)top.GetLabel()) { - if (!top.SetDirection(D_LEFT)) - return 1; - } else if (label > (unsigned char)top.GetLabel()) { - if (!top.SetDirection(D_RIGHT)) { - Forks.Pop(); // We don't pass this fork on the way to the upper bound. - return -1; - } - } else if (isLastByte) { // Here and below label == top.GetLabel(). - if (top.SetDirection(D_FINAL)) { - return 0; // Skip the NextFork() call at the end of the cycle. - } else { - top.SetDirection(D_NEXT); - return 1; - } - } else if (!top.SetDirection(D_NEXT)) { - top.SetDirection(D_FINAL); + int TOpaqueTrieIterator::LongestPrefix(const typename TCompactTrieKeySelector<TSymbol>::TKeyBuf& key) { + Forks.Clear(); + TFork next(Trie.Data, 0, Trie.Length, Trie.SkipFunction); + for (size_t i = 0; i < key.size(); i++) { + TSymbol symbol = key[i]; + const bool isLastSymbol = (i + 1 == key.size()); + for (i64 shift = (i64)NCompactTrie::ExtraBits<TSymbol>(); shift >= 0; shift -= 8) { + const unsigned char label = (unsigned char)(symbol >> shift); + const bool isLastByte = (isLastSymbol && shift == 0); + do { + Forks.Push(next); + TFork& top = Forks.Top(); + if (label < (unsigned char)top.GetLabel()) { + if (!top.SetDirection(D_LEFT)) + return 1; + } else if (label > (unsigned char)top.GetLabel()) { + if (!top.SetDirection(D_RIGHT)) { + Forks.Pop(); // We don't pass this fork on the way to the upper bound. + return -1; + } + } else if (isLastByte) { // Here and below label == top.GetLabel(). + if (top.SetDirection(D_FINAL)) { + return 0; // Skip the NextFork() call at the end of the cycle. + } else { + top.SetDirection(D_NEXT); + return 1; + } + } else if (!top.SetDirection(D_NEXT)) { + top.SetDirection(D_FINAL); return -1; } - next = top.NextFork(Trie.SkipFunction); - } while (Forks.Top().CurrentDirection != D_NEXT); // Proceed to the next byte. - } + next = top.NextFork(Trie.SkipFunction); + } while (Forks.Top().CurrentDirection != D_NEXT); // Proceed to the next byte. + } } - // We get here only if the key was empty. - Forks.Push(next); - return 1; + // We get here only if the key was empty. + Forks.Push(next); + return 1; } - template <class TSymbol> - bool TOpaqueTrieIterator::UpperBound(const typename TCompactTrieKeySelector<TSymbol>::TKeyBuf& key) { - Forks.Clear(); - if (key.empty() && EmptyValue) { - AtEmptyValue = true; - return true; - } else { - AtEmptyValue = false; + template <class TSymbol> + bool TOpaqueTrieIterator::UpperBound(const typename TCompactTrieKeySelector<TSymbol>::TKeyBuf& key) { + Forks.Clear(); + if (key.empty() && EmptyValue) { + AtEmptyValue = true; + return true; + } else { + AtEmptyValue = false; + } + const int defect = LongestPrefix<TSymbol>(key); + if (defect > 0) { + // Continue the constructed forks with the smallest key possible. + while (Forks.Top().CurrentDirection != D_FINAL) { + TFork next = Forks.Top().NextFork(Trie.SkipFunction); + Forks.Push(next); + } + } else if (defect < 0) { + Forward(); } - const int defect = LongestPrefix<TSymbol>(key); - if (defect > 0) { - // Continue the constructed forks with the smallest key possible. - while (Forks.Top().CurrentDirection != D_FINAL) { - TFork next = Forks.Top().NextFork(Trie.SkipFunction); - Forks.Push(next); - } - } else if (defect < 0) { - Forward(); - } - return defect == 0; + return defect == 0; } - + } diff --git a/library/cpp/containers/comptrie/prefix_iterator.h b/library/cpp/containers/comptrie/prefix_iterator.h index eb2b2d6878..b369bb4f42 100644 --- a/library/cpp/containers/comptrie/prefix_iterator.h +++ b/library/cpp/containers/comptrie/prefix_iterator.h @@ -1,5 +1,5 @@ #pragma once - + #include "comptrie_trie.h" // Iterates over all prefixes of the given key in the trie. @@ -39,11 +39,11 @@ public: result = Next(); } - operator bool() const { + operator bool() const { return result; } - TPrefixIterator& operator++() { + TPrefixIterator& operator++() { result = Next(); return *this; } diff --git a/library/cpp/containers/comptrie/protopacker.h b/library/cpp/containers/comptrie/protopacker.h index 5bedde270d..3e15866dc5 100644 --- a/library/cpp/containers/comptrie/protopacker.h +++ b/library/cpp/containers/comptrie/protopacker.h @@ -3,7 +3,7 @@ #include <util/stream/mem.h> #include <util/ysaveload.h> -template <class Proto> +template <class Proto> class TProtoPacker { public: TProtoPacker() = default; diff --git a/library/cpp/containers/comptrie/search_iterator.h b/library/cpp/containers/comptrie/search_iterator.h index ad6bcfdf60..247f7e5936 100644 --- a/library/cpp/containers/comptrie/search_iterator.h +++ b/library/cpp/containers/comptrie/search_iterator.h @@ -1,5 +1,5 @@ #pragma once - + #include "comptrie_trie.h" #include "first_symbol_iterator.h" @@ -74,7 +74,7 @@ private: const char* ValuePos = nullptr; }; -template <class TTrie> +template <class TTrie> inline TSearchIterator<TTrie> MakeSearchIterator(const TTrie& trie) { return TSearchIterator<TTrie>(trie); } diff --git a/library/cpp/containers/comptrie/set.h b/library/cpp/containers/comptrie/set.h index ffea92485c..acd43338f0 100644 --- a/library/cpp/containers/comptrie/set.h +++ b/library/cpp/containers/comptrie/set.h @@ -1,16 +1,16 @@ -#pragma once - +#pragma once + #include "comptrie_trie.h" template <typename T = char> -class TCompactTrieSet: public TCompactTrie<T, ui8, TNullPacker<ui8>> { +class TCompactTrieSet: public TCompactTrie<T, ui8, TNullPacker<ui8>> { public: - typedef TCompactTrie<T, ui8, TNullPacker<ui8>> TBase; + typedef TCompactTrie<T, ui8, TNullPacker<ui8>> TBase; - using typename TBase::TBuilder; + using typename TBase::TBuilder; using typename TBase::TKey; using typename TBase::TKeyBuf; - using typename TBase::TSymbol; + using typename TBase::TSymbol; TCompactTrieSet() = default; @@ -20,8 +20,8 @@ public: } template <typename D> - explicit TCompactTrieSet(const TCompactTrie<T, D, TNullPacker<D>>& trie) - : TBase(trie.Data()) // should be binary compatible for any D + explicit TCompactTrieSet(const TCompactTrie<T, D, TNullPacker<D>>& trie) + : TBase(trie.Data()) // should be binary compatible for any D { } diff --git a/library/cpp/containers/comptrie/write_trie_backwards.cpp b/library/cpp/containers/comptrie/write_trie_backwards.cpp index f7459b1c49..fd8c28b0ed 100644 --- a/library/cpp/containers/comptrie/write_trie_backwards.cpp +++ b/library/cpp/containers/comptrie/write_trie_backwards.cpp @@ -7,104 +7,104 @@ #include <util/generic/vector.h> namespace NCompactTrie { - size_t WriteTrieBackwards(IOutputStream& os, TReverseNodeEnumerator& enumerator, bool verbose) { - if (verbose) { - Cerr << "Writing down the trie..." << Endl; - } - - // Rewrite everything from the back, removing unused pieces. - const size_t chunksize = 0x10000; - TVector<char*> resultData; - - resultData.push_back(new char[chunksize]); - char* chunkend = resultData.back() + chunksize; - - size_t resultLength = 0; - size_t chunkLength = 0; - - size_t counter = 0; - TBuffer bufferHolder; - while (enumerator.Move()) { - if (verbose) - ShowProgress(++counter); - - size_t bufferLength = 64 + enumerator.GetLeafLength(); // never know how big leaf data can be - bufferHolder.Clear(); - bufferHolder.Resize(bufferLength); - char* buffer = bufferHolder.Data(); - - size_t nodelength = enumerator.RecreateNode(buffer, resultLength); - Y_ASSERT(nodelength <= bufferLength); - - resultLength += nodelength; - - if (chunkLength + nodelength <= chunksize) { - chunkLength += nodelength; - memcpy(chunkend - chunkLength, buffer, nodelength); - } else { // allocate a new chunk - memcpy(chunkend - chunksize, buffer + nodelength - (chunksize - chunkLength), chunksize - chunkLength); - chunkLength = chunkLength + nodelength - chunksize; - - resultData.push_back(new char[chunksize]); - chunkend = resultData.back() + chunksize; - - while (chunkLength > chunksize) { // allocate a new chunks - chunkLength -= chunksize; - memcpy(chunkend - chunksize, buffer + chunkLength, chunksize); - - resultData.push_back(new char[chunksize]); - chunkend = resultData.back() + chunksize; - } - - memcpy(chunkend - chunkLength, buffer, chunkLength); + size_t WriteTrieBackwards(IOutputStream& os, TReverseNodeEnumerator& enumerator, bool verbose) { + if (verbose) { + Cerr << "Writing down the trie..." << Endl; + } + + // Rewrite everything from the back, removing unused pieces. + const size_t chunksize = 0x10000; + TVector<char*> resultData; + + resultData.push_back(new char[chunksize]); + char* chunkend = resultData.back() + chunksize; + + size_t resultLength = 0; + size_t chunkLength = 0; + + size_t counter = 0; + TBuffer bufferHolder; + while (enumerator.Move()) { + if (verbose) + ShowProgress(++counter); + + size_t bufferLength = 64 + enumerator.GetLeafLength(); // never know how big leaf data can be + bufferHolder.Clear(); + bufferHolder.Resize(bufferLength); + char* buffer = bufferHolder.Data(); + + size_t nodelength = enumerator.RecreateNode(buffer, resultLength); + Y_ASSERT(nodelength <= bufferLength); + + resultLength += nodelength; + + if (chunkLength + nodelength <= chunksize) { + chunkLength += nodelength; + memcpy(chunkend - chunkLength, buffer, nodelength); + } else { // allocate a new chunk + memcpy(chunkend - chunksize, buffer + nodelength - (chunksize - chunkLength), chunksize - chunkLength); + chunkLength = chunkLength + nodelength - chunksize; + + resultData.push_back(new char[chunksize]); + chunkend = resultData.back() + chunksize; + + while (chunkLength > chunksize) { // allocate a new chunks + chunkLength -= chunksize; + memcpy(chunkend - chunksize, buffer + chunkLength, chunksize); + + resultData.push_back(new char[chunksize]); + chunkend = resultData.back() + chunksize; + } + + memcpy(chunkend - chunkLength, buffer, chunkLength); } - } - - if (verbose) - Cerr << counter << Endl; - - // Write the whole thing down - while (!resultData.empty()) { - char* chunk = resultData.back(); - os.Write(chunk + chunksize - chunkLength, chunkLength); - chunkLength = chunksize; - delete[] chunk; - resultData.pop_back(); } - - return resultLength; + + if (verbose) + Cerr << counter << Endl; + + // Write the whole thing down + while (!resultData.empty()) { + char* chunk = resultData.back(); + os.Write(chunk + chunksize - chunkLength, chunkLength); + chunkLength = chunksize; + delete[] chunk; + resultData.pop_back(); + } + + return resultLength; } - size_t WriteTrieBackwardsNoAlloc(IOutputStream& os, TReverseNodeEnumerator& enumerator, TOpaqueTrie& trie, EMinimizeMode mode) { - char* data = const_cast<char*>(trie.Data); - char* end = data + trie.Length; - char* pos = end; - - TVector<char> buf(64); - while (enumerator.Move()) { - size_t nodeLength = enumerator.RecreateNode(nullptr, end - pos); - if (nodeLength > buf.size()) - buf.resize(nodeLength); - - size_t realLength = enumerator.RecreateNode(buf.data(), end - pos); - Y_ASSERT(realLength == nodeLength); - - pos -= nodeLength; - memcpy(pos, buf.data(), nodeLength); - } - - switch (mode) { - case MM_NOALLOC: - os.Write(pos, end - pos); - break; - case MM_INPLACE: - memmove(data, pos, end - pos); - break; - default: - Y_VERIFY(false); - } - - return end - pos; + size_t WriteTrieBackwardsNoAlloc(IOutputStream& os, TReverseNodeEnumerator& enumerator, TOpaqueTrie& trie, EMinimizeMode mode) { + char* data = const_cast<char*>(trie.Data); + char* end = data + trie.Length; + char* pos = end; + + TVector<char> buf(64); + while (enumerator.Move()) { + size_t nodeLength = enumerator.RecreateNode(nullptr, end - pos); + if (nodeLength > buf.size()) + buf.resize(nodeLength); + + size_t realLength = enumerator.RecreateNode(buf.data(), end - pos); + Y_ASSERT(realLength == nodeLength); + + pos -= nodeLength; + memcpy(pos, buf.data(), nodeLength); + } + + switch (mode) { + case MM_NOALLOC: + os.Write(pos, end - pos); + break; + case MM_INPLACE: + memmove(data, pos, end - pos); + break; + default: + Y_VERIFY(false); + } + + return end - pos; } } diff --git a/library/cpp/containers/comptrie/writeable_node.cpp b/library/cpp/containers/comptrie/writeable_node.cpp index d0e986f99a..404003dbbd 100644 --- a/library/cpp/containers/comptrie/writeable_node.cpp +++ b/library/cpp/containers/comptrie/writeable_node.cpp @@ -3,94 +3,94 @@ #include "comptrie_impl.h" namespace NCompactTrie { - TWriteableNode::TWriteableNode() - : LeafPos(nullptr) - , LeafLength(0) - , ForwardOffset(NPOS) - , LeftOffset(NPOS) - , RightOffset(NPOS) - , Label(0) - { - } + TWriteableNode::TWriteableNode() + : LeafPos(nullptr) + , LeafLength(0) + , ForwardOffset(NPOS) + , LeftOffset(NPOS) + , RightOffset(NPOS) + , Label(0) + { + } + + static size_t GetOffsetFromEnd(const TNode& node, size_t absOffset) { + return absOffset ? absOffset - node.GetOffset() - node.GetCoreLength() : NPOS; + } - static size_t GetOffsetFromEnd(const TNode& node, size_t absOffset) { - return absOffset ? absOffset - node.GetOffset() - node.GetCoreLength() : NPOS; - } - - TWriteableNode::TWriteableNode(const TNode& node, const char* data) - : LeafPos(node.IsFinal() ? data + node.GetLeafOffset() : nullptr) - , LeafLength(node.GetLeafLength()) - , ForwardOffset(GetOffsetFromEnd(node, node.GetForwardOffset())) - , LeftOffset(GetOffsetFromEnd(node, node.GetLeftOffset())) - , RightOffset(GetOffsetFromEnd(node, node.GetRightOffset())) - , Label(node.GetLabel()) - { - } + TWriteableNode::TWriteableNode(const TNode& node, const char* data) + : LeafPos(node.IsFinal() ? data + node.GetLeafOffset() : nullptr) + , LeafLength(node.GetLeafLength()) + , ForwardOffset(GetOffsetFromEnd(node, node.GetForwardOffset())) + , LeftOffset(GetOffsetFromEnd(node, node.GetLeftOffset())) + , RightOffset(GetOffsetFromEnd(node, node.GetRightOffset())) + , Label(node.GetLabel()) + { + } - size_t TWriteableNode::Measure() const { - size_t len = 2 + LeafLength; - size_t fwdLen = 0; - size_t lastLen = 0; - size_t lastFwdLen = 0; - // Now, increase all the offsets by the length and recalculate everything, until it converges - do { - lastLen = len; - lastFwdLen = fwdLen; + size_t TWriteableNode::Measure() const { + size_t len = 2 + LeafLength; + size_t fwdLen = 0; + size_t lastLen = 0; + size_t lastFwdLen = 0; + // Now, increase all the offsets by the length and recalculate everything, until it converges + do { + lastLen = len; + lastFwdLen = fwdLen; - len = 2 + LeafLength; - len += MeasureOffset(LeftOffset != NPOS ? LeftOffset + lastLen : 0); - len += MeasureOffset(RightOffset != NPOS ? RightOffset + lastLen : 0); - - // Relative forward offset of 0 means we don't need extra length for an epsilon link. - // But an epsilon link means we need an extra 1 for the flags and the forward offset is measured - // from the start of the epsilon link, not from the start of our node. - if (ForwardOffset != NPOS && ForwardOffset != 0) { - fwdLen = MeasureOffset(ForwardOffset + lastFwdLen) + 1; - len += fwdLen; - } + len = 2 + LeafLength; + len += MeasureOffset(LeftOffset != NPOS ? LeftOffset + lastLen : 0); + len += MeasureOffset(RightOffset != NPOS ? RightOffset + lastLen : 0); - } while (lastLen != len || lastFwdLen != fwdLen); + // Relative forward offset of 0 means we don't need extra length for an epsilon link. + // But an epsilon link means we need an extra 1 for the flags and the forward offset is measured + // from the start of the epsilon link, not from the start of our node. + if (ForwardOffset != NPOS && ForwardOffset != 0) { + fwdLen = MeasureOffset(ForwardOffset + lastFwdLen) + 1; + len += fwdLen; + } + + } while (lastLen != len || lastFwdLen != fwdLen); + + return len; + } - return len; - } + size_t TWriteableNode::Pack(char* buffer) const { + const size_t length = Measure(); - size_t TWriteableNode::Pack(char* buffer) const { - const size_t length = Measure(); + char flags = 0; + if (LeafPos) { + flags |= MT_FINAL; + } + if (ForwardOffset != NPOS) { + flags |= MT_NEXT; + } - char flags = 0; - if (LeafPos) { - flags |= MT_FINAL; - } - if (ForwardOffset != NPOS) { - flags |= MT_NEXT; - } + const size_t leftOffset = LeftOffset != NPOS ? LeftOffset + length : 0; + const size_t rightOffset = RightOffset != NPOS ? RightOffset + length : 0; + const size_t leftOffsetSize = MeasureOffset(leftOffset); + const size_t rightOffsetSize = MeasureOffset(rightOffset); + flags |= (leftOffsetSize << MT_LEFTSHIFT); + flags |= (rightOffsetSize << MT_RIGHTSHIFT); - const size_t leftOffset = LeftOffset != NPOS ? LeftOffset + length : 0; - const size_t rightOffset = RightOffset != NPOS ? RightOffset + length : 0; - const size_t leftOffsetSize = MeasureOffset(leftOffset); - const size_t rightOffsetSize = MeasureOffset(rightOffset); - flags |= (leftOffsetSize << MT_LEFTSHIFT); - flags |= (rightOffsetSize << MT_RIGHTSHIFT); + buffer[0] = flags; + buffer[1] = Label; + size_t usedLen = 2; + usedLen += PackOffset(buffer + usedLen, leftOffset); + usedLen += PackOffset(buffer + usedLen, rightOffset); - buffer[0] = flags; - buffer[1] = Label; - size_t usedLen = 2; - usedLen += PackOffset(buffer + usedLen, leftOffset); - usedLen += PackOffset(buffer + usedLen, rightOffset); + if (LeafPos && LeafLength) { + memcpy(buffer + usedLen, LeafPos, LeafLength); + usedLen += LeafLength; + } - if (LeafPos && LeafLength) { - memcpy(buffer + usedLen, LeafPos, LeafLength); - usedLen += LeafLength; - } - - if (ForwardOffset != NPOS && ForwardOffset != 0) { - const size_t fwdOffset = ForwardOffset + length - usedLen; - size_t fwdOffsetSize = MeasureOffset(fwdOffset); - buffer[usedLen++] = (char)(fwdOffsetSize & MT_SIZEMASK); - usedLen += PackOffset(buffer + usedLen, fwdOffset); - } - Y_ASSERT(usedLen == length); - return usedLen; + if (ForwardOffset != NPOS && ForwardOffset != 0) { + const size_t fwdOffset = ForwardOffset + length - usedLen; + size_t fwdOffsetSize = MeasureOffset(fwdOffset); + buffer[usedLen++] = (char)(fwdOffsetSize & MT_SIZEMASK); + usedLen += PackOffset(buffer + usedLen, fwdOffset); + } + Y_ASSERT(usedLen == length); + return usedLen; } } diff --git a/library/cpp/containers/comptrie/writeable_node.h b/library/cpp/containers/comptrie/writeable_node.h index 585ef0a5e4..5454e579ef 100644 --- a/library/cpp/containers/comptrie/writeable_node.h +++ b/library/cpp/containers/comptrie/writeable_node.h @@ -3,24 +3,24 @@ #include <cstddef> namespace NCompactTrie { - class TNode; + class TNode; - class TWriteableNode { - public: - const char* LeafPos; - size_t LeafLength; + class TWriteableNode { + public: + const char* LeafPos; + size_t LeafLength; - size_t ForwardOffset; - size_t LeftOffset; - size_t RightOffset; - char Label; + size_t ForwardOffset; + size_t LeftOffset; + size_t RightOffset; + char Label; - TWriteableNode(); - TWriteableNode(const TNode& node, const char* data); + TWriteableNode(); + TWriteableNode(const TNode& node, const char* data); - // When you call this, the offsets should be relative to the end of the node. Use NPOS to indicate an absent offset. - size_t Pack(char* buffer) const; - size_t Measure() const; - }; + // When you call this, the offsets should be relative to the end of the node. Use NPOS to indicate an absent offset. + size_t Pack(char* buffer) const; + size_t Measure() const; + }; } diff --git a/library/cpp/containers/comptrie/ya.make b/library/cpp/containers/comptrie/ya.make index 43d6534634..81352da4b2 100644 --- a/library/cpp/containers/comptrie/ya.make +++ b/library/cpp/containers/comptrie/ya.make @@ -1,4 +1,4 @@ -LIBRARY() +LIBRARY() OWNER(velavokr) @@ -12,9 +12,9 @@ SRCS( key_selector.h leaf_skipper.h set.h - comptrie.cpp + comptrie.cpp comptrie_builder.cpp - comptrie_impl.cpp + comptrie_impl.cpp make_fast_layout.cpp minimize.cpp node.cpp |