diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/containers/bitseq | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/containers/bitseq')
-rw-r--r-- | library/cpp/containers/bitseq/bititerator.h | 138 | ||||
-rw-r--r-- | library/cpp/containers/bitseq/bititerator_ut.cpp | 109 | ||||
-rw-r--r-- | library/cpp/containers/bitseq/bitvector.cpp | 1 | ||||
-rw-r--r-- | library/cpp/containers/bitseq/bitvector.h | 158 | ||||
-rw-r--r-- | library/cpp/containers/bitseq/bitvector_ut.cpp | 86 | ||||
-rw-r--r-- | library/cpp/containers/bitseq/readonly_bitvector.cpp | 1 | ||||
-rw-r--r-- | library/cpp/containers/bitseq/readonly_bitvector.h | 76 | ||||
-rw-r--r-- | library/cpp/containers/bitseq/traits.h | 49 | ||||
-rw-r--r-- | library/cpp/containers/bitseq/ut/ya.make | 10 | ||||
-rw-r--r-- | library/cpp/containers/bitseq/ya.make | 15 |
10 files changed, 643 insertions, 0 deletions
diff --git a/library/cpp/containers/bitseq/bititerator.h b/library/cpp/containers/bitseq/bititerator.h new file mode 100644 index 0000000000..52dadd3798 --- /dev/null +++ b/library/cpp/containers/bitseq/bititerator.h @@ -0,0 +1,138 @@ +#pragma once + +#include "traits.h" + +#include <library/cpp/pop_count/popcount.h> + +template <typename T> +class TBitIterator { +public: + using TWord = T; + using TTraits = TBitSeqTraits<TWord>; + +public: + TBitIterator(const T* data = nullptr) + : Current(0) + , Mask(0) + , Data(data) + { + } + + /// Get the word next to the one we are currenlty iterating over. + const TWord* NextWord() const { + return Data; + } + + /// Get the next bit without moving the iterator. + bool Peek() const { + return Mask ? (Current & Mask) : (*Data & 1); + } + + /// Get the next bit and move forward. + /// TODO: Implement inversed iteration as well. + bool Next() { + if (!Mask) { + Current = *Data++; + Mask = 1; + } + const bool bit = Current & Mask; + Mask <<= 1; + return bit; + } + + /// Get the next count bits without moving the iterator. + TWord Peek(ui8 count) const { + if (!count) + return 0; + Y_VERIFY_DEBUG(count <= TTraits::NumBits); + + if (!Mask) + return *Data & TTraits::ElemMask(count); + + auto usedBits = (size_t)PopCount(Mask - 1); + TWord result = Current >> usedBits; + auto leftInCurrent = TTraits::NumBits - usedBits; + if (count <= leftInCurrent) + return result & TTraits::ElemMask(count); + + count -= leftInCurrent; + result |= (*Data & TTraits::ElemMask(count)) << leftInCurrent; + return result; + } + + /// Get the next count bits and move forward by count bits. + TWord Read(ui8 count) { + if (!count) + return 0; + Y_VERIFY_DEBUG(count <= TTraits::NumBits); + + if (!Mask) { + Current = *Data++; + Mask = 1 << count; + return Current & TTraits::ElemMask(count); + } + + auto usedBits = (size_t)PopCount(Mask - 1); + TWord result = Current >> usedBits; + auto leftInCurrent = TTraits::NumBits - usedBits; + if (count < leftInCurrent) { + Mask <<= count; + return result & TTraits::ElemMask(count); + } + + count -= leftInCurrent; + if (count) { + Current = *Data++; + Mask = 1 << count; + result |= (Current & TTraits::ElemMask(count)) << leftInCurrent; + } else { + Mask = 0; + } + + return result; + } + + /// Move the iterator forward by count bits. + void Forward(int count) { + if (!count) + return; + + int leftInCurrent = (size_t)PopCount(~(Mask - 1)); + if (count < leftInCurrent) { + Mask <<= count; + return; + } + + count -= leftInCurrent; + Data += count >> TTraits::DivShift; + auto remainder = count & TTraits::ModMask; + + if (remainder) { + Current = *Data++; + Mask = 1 << remainder; + } else { + Current = 0; + Mask = 0; + } + } + + /// Skip trailing bits of the current word and move by count words forward. + void Align(int count = 0) { + Current = 0; + if (Mask) + Mask = 0; + Data += count; + } + + /// Initialize the iterator. + void Reset(const TWord* data) { + Current = 0; + Mask = 0; + Data = data; + } + +private: + TWord Current; + TWord Mask; + const TWord* Data; +}; diff --git a/library/cpp/containers/bitseq/bititerator_ut.cpp b/library/cpp/containers/bitseq/bititerator_ut.cpp new file mode 100644 index 0000000000..ed0925866f --- /dev/null +++ b/library/cpp/containers/bitseq/bititerator_ut.cpp @@ -0,0 +1,109 @@ +#include "bititerator.h" + +#include <library/cpp/testing/unittest/registar.h> +#include <util/generic/vector.h> + +Y_UNIT_TEST_SUITE(TBitIteratorTest) { + TVector<ui16> GenWords() { + TVector<ui16> words(1, 0); + for (ui16 word = 1; word; ++word) + words.push_back(word); + return words; + } + + template <typename TWord> + void AssertPeekRead(TBitIterator<TWord> & iter, ui8 count, TWord expected) { + auto peek = iter.Peek(count); + auto read = iter.Read(count); + UNIT_ASSERT_EQUAL(peek, read); + UNIT_ASSERT_EQUAL(peek, expected); + } + + Y_UNIT_TEST(TestNextAndPeek) { + const auto& words = GenWords(); + + TBitIterator<ui16> iter(words.data()); + ui16 word = 0; + for (int i = 0; i != (1 << 16); ++i, ++word) { + for (int bit = 0; bit != 16; ++bit) { + auto peek = iter.Peek(); + auto next = iter.Next(); + UNIT_ASSERT_EQUAL(peek, next); + UNIT_ASSERT_EQUAL(peek, (word >> bit) & 1); + } + UNIT_ASSERT_EQUAL(iter.NextWord(), words.data() + i + 1); + } + + UNIT_ASSERT_EQUAL(iter.NextWord(), words.data() + words.size()); + } + + Y_UNIT_TEST(TestAlignedReadAndPeek) { + const auto& words = GenWords(); + + TBitIterator<ui16> iter(words.data()); + ui16 word = 0; + for (int i = 0; i != (1 << 16); ++i, ++word) { + AssertPeekRead(iter, 16, word); + UNIT_ASSERT_EQUAL(iter.NextWord(), words.data() + i + 1); + } + + UNIT_ASSERT_EQUAL(iter.NextWord(), words.data() + words.size()); + } + + Y_UNIT_TEST(TestForward) { + TVector<ui32> words; + words.push_back((1 << 10) | (1 << 20) | (1 << 25)); + words.push_back(1 | (1 << 5) | (1 << 6) | (1 << 30)); + for (int i = 0; i < 3; ++i) + words.push_back(0); + words.push_back(1 << 10); + + TBitIterator<ui32> iter(words.data()); + UNIT_ASSERT(!iter.Next()); + UNIT_ASSERT(!iter.Next()); + UNIT_ASSERT(!iter.Next()); + iter.Forward(6); + UNIT_ASSERT(!iter.Next()); + UNIT_ASSERT(iter.Next()); + UNIT_ASSERT(!iter.Next()); + iter.Forward(8); + UNIT_ASSERT(iter.Next()); + iter.Forward(4); + UNIT_ASSERT(iter.Next()); + iter.Forward(5); + UNIT_ASSERT(!iter.Next()); + UNIT_ASSERT(iter.Next()); + iter.Forward(4); + UNIT_ASSERT(iter.Next()); + + iter.Reset(words.data()); + iter.Forward(38); + UNIT_ASSERT(iter.Next()); + UNIT_ASSERT(!iter.Next()); + UNIT_ASSERT_EQUAL(iter.NextWord(), words.data() + 2); + + iter.Forward(24 + 32 * 3 + 9); + UNIT_ASSERT(!iter.Next()); + UNIT_ASSERT(iter.Next()); + UNIT_ASSERT(!iter.Next()); + UNIT_ASSERT_EQUAL(iter.NextWord(), words.data() + 6); + } + + Y_UNIT_TEST(TestUnalignedReadAndPeek) { + TVector<ui32> words; + words.push_back((1 << 10) | (1 << 20) | (1 << 25)); + words.push_back(1 | (1 << 5) | (1 << 6) | (1 << 30)); + for (int i = 0; i < 5; ++i) + words.push_back(1 | (1 << 10)); + + TBitIterator<ui32> iter(words.data()); + AssertPeekRead(iter, 5, ui32(0)); + AssertPeekRead(iter, 7, ui32(1 << 5)); + AssertPeekRead(iter, 21, ui32((1 << 8) | (1 << 13) | (1 << 20))); + AssertPeekRead(iter, 32, (words[1] >> 1) | (1 << 31)); + iter.Forward(8); + UNIT_ASSERT(!iter.Next()); + UNIT_ASSERT(iter.Next()); + UNIT_ASSERT(!iter.Next()); + } +} diff --git a/library/cpp/containers/bitseq/bitvector.cpp b/library/cpp/containers/bitseq/bitvector.cpp new file mode 100644 index 0000000000..05cb3a881d --- /dev/null +++ b/library/cpp/containers/bitseq/bitvector.cpp @@ -0,0 +1 @@ +#include "bitvector.h" diff --git a/library/cpp/containers/bitseq/bitvector.h b/library/cpp/containers/bitseq/bitvector.h new file mode 100644 index 0000000000..3f8fd81ee5 --- /dev/null +++ b/library/cpp/containers/bitseq/bitvector.h @@ -0,0 +1,158 @@ +#pragma once + +#include "traits.h" + +#include <library/cpp/pop_count/popcount.h> + +#include <util/generic/vector.h> +#include <util/ysaveload.h> + +template <typename T> +class TReadonlyBitVector; + +template <typename T> +class TBitVector { +public: + using TWord = T; + using TTraits = TBitSeqTraits<TWord>; + +private: + friend class TReadonlyBitVector<T>; + ui64 Size_; + TVector<TWord> Data_; + +public: + TBitVector() + : Size_(0) + , Data_(0) + { + } + + TBitVector(ui64 size) + : Size_(size) + , Data_(static_cast<size_t>((Size_ + TTraits::ModMask) >> TTraits::DivShift), 0) + { + } + + virtual ~TBitVector() = default; + + void Clear() { + Size_ = 0; + Data_.clear(); + } + + void Resize(ui64 size) { + Size_ = size; + Data_.resize((Size_ + TTraits::ModMask) >> TTraits::DivShift); + } + + void Swap(TBitVector& other) { + DoSwap(Size_, other.Size_); + DoSwap(Data_, other.Data_); + } + + bool Set(ui64 pos) { + Y_ASSERT(pos < Size_); + TWord& val = Data_[pos >> TTraits::DivShift]; + if (val & TTraits::BitMask(pos & TTraits::ModMask)) + return false; + val |= TTraits::BitMask(pos & TTraits::ModMask); + return true; + } + + bool Test(ui64 pos) const { + return TTraits::Test(Data(), pos, Size_); + } + + void Reset(ui64 pos) { + Y_ASSERT(pos < Size_); + Data_[pos >> TTraits::DivShift] &= ~TTraits::BitMask(pos & TTraits::ModMask); + } + + TWord Get(ui64 pos, ui8 width, TWord mask) const { + return TTraits::Get(Data(), pos, width, mask, Size_); + } + + TWord Get(ui64 pos, ui8 width) const { + return Get(pos, width, TTraits::ElemMask(width)); + } + + void Set(ui64 pos, TWord value, ui8 width, TWord mask) { + if (!width) + return; + Y_ASSERT((pos + width) <= Size_); + size_t word = pos >> TTraits::DivShift; + TWord shift1 = pos & TTraits::ModMask; + TWord shift2 = TTraits::NumBits - shift1; + Data_[word] &= ~(mask << shift1); + Data_[word] |= (value & mask) << shift1; + if (shift2 < width) { + Data_[word + 1] &= ~(mask >> shift2); + Data_[word + 1] |= (value & mask) >> shift2; + } + } + + void Set(ui64 pos, TWord value, ui8 width) { + Set(pos, value, width, TTraits::ElemMask(width)); + } + + void Append(TWord value, ui8 width, TWord mask) { + if (!width) + return; + if (Data_.size() * TTraits::NumBits < Size_ + width) { + Data_.push_back(0); + } + Size_ += width; + Set(Size_ - width, value, width, mask); + } + + void Append(TWord value, ui8 width) { + Append(value, width, TTraits::ElemMask(width)); + } + + size_t Count() const { + size_t count = 0; + for (size_t i = 0; i < Data_.size(); ++i) { + count += (size_t)PopCount(Data_[i]); + } + return count; + } + + ui64 Size() const { + return Size_; + } + + size_t Words() const { + return Data_.size(); + } + + const TWord* Data() const { + return Data_.data(); + } + + void Save(IOutputStream* out) const { + ::Save(out, Size_); + ::Save(out, Data_); + } + + void Load(IInputStream* inp) { + ::Load(inp, Size_); + ::Load(inp, Data_); + } + + ui64 Space() const { + return CHAR_BIT * (sizeof(Size_) + + Data_.size() * sizeof(TWord)); + } + + void Print(IOutputStream& out, size_t truncate = 128) { + for (size_t i = 0; i < Data_.size() && i < truncate; ++i) { + for (int j = TTraits::NumBits - 1; j >= 0; --j) { + size_t pos = TTraits::NumBits * i + j; + out << (pos < Size_ && Test(pos) ? '1' : '0'); + } + out << " "; + } + out << Endl; + } +}; diff --git a/library/cpp/containers/bitseq/bitvector_ut.cpp b/library/cpp/containers/bitseq/bitvector_ut.cpp new file mode 100644 index 0000000000..6137adab1e --- /dev/null +++ b/library/cpp/containers/bitseq/bitvector_ut.cpp @@ -0,0 +1,86 @@ +#include "bitvector.h" +#include "readonly_bitvector.h" + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/memory/blob.h> +#include <util/stream/buffer.h> + +Y_UNIT_TEST_SUITE(TBitVectorTest) { + Y_UNIT_TEST(TestEmpty) { + TBitVector<ui64> v64; + UNIT_ASSERT_EQUAL(v64.Size(), 0); + UNIT_ASSERT_EQUAL(v64.Words(), 0); + + TBitVector<ui32> v32(0); + UNIT_ASSERT_EQUAL(v32.Size(), 0); + UNIT_ASSERT_EQUAL(v32.Words(), 0); + } + + Y_UNIT_TEST(TestOneWord) { + TBitVector<ui32> v; + v.Append(1, 1); + v.Append(0, 1); + v.Append(1, 3); + v.Append(10, 4); + v.Append(100500, 20); + + UNIT_ASSERT_EQUAL(v.Get(0, 1), 1); + UNIT_ASSERT(v.Test(0)); + UNIT_ASSERT_EQUAL(v.Get(1, 1), 0); + UNIT_ASSERT_EQUAL(v.Get(2, 3), 1); + UNIT_ASSERT_EQUAL(v.Get(5, 4), 10); + UNIT_ASSERT_EQUAL(v.Get(9, 20), 100500); + + v.Reset(0); + v.Set(9, 1234, 15); + UNIT_ASSERT_EQUAL(v.Get(0, 1), 0); + UNIT_ASSERT(!v.Test(0)); + UNIT_ASSERT_EQUAL(v.Get(9, 15), 1234); + + UNIT_ASSERT_EQUAL(v.Size(), 29); + UNIT_ASSERT_EQUAL(v.Words(), 1); + } + + Y_UNIT_TEST(TestManyWords) { + static const int BITS = 10; + TBitVector<ui64> v; + + for (int i = 0, end = (1 << BITS); i < end; ++i) + v.Append(i, BITS); + + UNIT_ASSERT_EQUAL(v.Size(), BITS * (1 << BITS)); + UNIT_ASSERT_EQUAL(v.Words(), (v.Size() + 63) / 64); + for (int i = 0, end = (1 << BITS); i < end; ++i) + UNIT_ASSERT_EQUAL(v.Get(i * BITS, BITS), (ui64)i); + } + + Y_UNIT_TEST(TestMaxWordSize) { + TBitVector<ui32> v; + for (int i = 0; i < 100; ++i) + v.Append(i, 32); + + for (int i = 0; i < 100; ++i) + UNIT_ASSERT_EQUAL(v.Get(i * 32, 32), (ui32)i); + + v.Set(10 * 32, 100500, 32); + UNIT_ASSERT_EQUAL(v.Get(10 * 32, 32), 100500); + } + + Y_UNIT_TEST(TestReadonlyVector) { + TBitVector<ui64> v(100); + for (ui64 i = 0; i < v.Size(); ++i) { + if (i % 3 == 0) { + v.Set(i); + } + } + TBufferStream bs; + TReadonlyBitVector<ui64>::SaveForReadonlyAccess(&bs, v); + const auto blob = TBlob::FromBuffer(bs.Buffer()); + TReadonlyBitVector<ui64> rv; + rv.LoadFromBlob(blob); + for (ui64 i = 0; i < rv.Size(); ++i) { + UNIT_ASSERT_VALUES_EQUAL(rv.Test(i), i % 3 == 0); + } + } +} diff --git a/library/cpp/containers/bitseq/readonly_bitvector.cpp b/library/cpp/containers/bitseq/readonly_bitvector.cpp new file mode 100644 index 0000000000..891aa7cde2 --- /dev/null +++ b/library/cpp/containers/bitseq/readonly_bitvector.cpp @@ -0,0 +1 @@ +#include "readonly_bitvector.h" diff --git a/library/cpp/containers/bitseq/readonly_bitvector.h b/library/cpp/containers/bitseq/readonly_bitvector.h new file mode 100644 index 0000000000..8612739c3f --- /dev/null +++ b/library/cpp/containers/bitseq/readonly_bitvector.h @@ -0,0 +1,76 @@ +#pragma once + +#include "bitvector.h" +#include "traits.h" + +#include <util/memory/blob.h> + +#include <cstring> + +template <typename T> +class TReadonlyBitVector { +public: + using TWord = T; + using TTraits = TBitSeqTraits<TWord>; + + TReadonlyBitVector() + : Size_() + , Data_() + { + } + + explicit TReadonlyBitVector(const TBitVector<T>& vector) + : Size_(vector.Size_) + , Data_(vector.Data_.data()) + { + } + + bool Test(ui64 pos) const { + return TTraits::Test(Data_, pos, Size_); + } + + TWord Get(ui64 pos, ui8 width, TWord mask) const { + return TTraits::Get(Data_, pos, width, mask, Size_); + } + + TWord Get(ui64 pos, ui8 width) const { + return Get(pos, width, TTraits::ElemMask(width)); + } + + ui64 Size() const { + return Size_; + } + + const T* Data() const { + return Data_; + } + + static void SaveForReadonlyAccess(IOutputStream* out, const TBitVector<T>& bv) { + ::Save(out, bv.Size_); + ::Save(out, static_cast<ui64>(bv.Data_.size())); + ::SavePodArray(out, bv.Data_.data(), bv.Data_.size()); + } + + virtual TBlob LoadFromBlob(const TBlob& blob) { + size_t read = 0; + auto cursor = [&]() { return blob.AsUnsignedCharPtr() + read; }; + auto readToPtr = [&](auto* ptr) { + memcpy(ptr, cursor(), sizeof(*ptr)); + read += sizeof(*ptr); + }; + + readToPtr(&Size_); + + ui64 wordCount{}; + readToPtr(&wordCount); + + Data_ = reinterpret_cast<const T*>(cursor()); + read += wordCount * sizeof(T); + + return blob.SubBlob(read, blob.Size()); + } + +private: + ui64 Size_; + const T* Data_; +}; diff --git a/library/cpp/containers/bitseq/traits.h b/library/cpp/containers/bitseq/traits.h new file mode 100644 index 0000000000..2330b1b4f2 --- /dev/null +++ b/library/cpp/containers/bitseq/traits.h @@ -0,0 +1,49 @@ +#pragma once + +#include <util/generic/bitops.h> +#include <util/generic/typetraits.h> +#include <util/system/yassert.h> + +template <typename TWord> +struct TBitSeqTraits { + static constexpr ui8 NumBits = CHAR_BIT * sizeof(TWord); + static constexpr TWord ModMask = static_cast<TWord>(NumBits - 1); + static constexpr TWord DivShift = MostSignificantBitCT(NumBits); + + static inline TWord ElemMask(ui8 count) { + // NOTE: Shifting by the type's length is UB, so we need this workaround. + if (Y_LIKELY(count)) + return TWord(-1) >> (NumBits - count); + return 0; + } + + static inline TWord BitMask(ui8 pos) { + return TWord(1) << pos; + } + + static size_t NumOfWords(size_t bits) { + return (bits + NumBits - 1) >> DivShift; + } + + static bool Test(const TWord* data, ui64 pos, ui64 size) { + Y_ASSERT(pos < size); + return data[pos >> DivShift] & BitMask(pos & ModMask); + } + + static TWord Get(const TWord* data, ui64 pos, ui8 width, TWord mask, ui64 size) { + if (!width) + return 0; + Y_ASSERT((pos + width) <= size); + size_t word = pos >> DivShift; + TWord shift1 = pos & ModMask; + TWord shift2 = NumBits - shift1; + TWord res = data[word] >> shift1 & mask; + if (shift2 < width) { + res |= data[word + 1] << shift2 & mask; + } + return res; + } + + static_assert(std::is_unsigned<TWord>::value, "Expected std::is_unsigned<T>::value."); + static_assert((NumBits & (NumBits - 1)) == 0, "NumBits should be a power of 2."); +}; diff --git a/library/cpp/containers/bitseq/ut/ya.make b/library/cpp/containers/bitseq/ut/ya.make new file mode 100644 index 0000000000..7155e82c06 --- /dev/null +++ b/library/cpp/containers/bitseq/ut/ya.make @@ -0,0 +1,10 @@ +UNITTEST_FOR(library/cpp/containers/bitseq) + +OWNER(g:util) + +SRCS( + bititerator_ut.cpp + bitvector_ut.cpp +) + +END() diff --git a/library/cpp/containers/bitseq/ya.make b/library/cpp/containers/bitseq/ya.make new file mode 100644 index 0000000000..7090956c55 --- /dev/null +++ b/library/cpp/containers/bitseq/ya.make @@ -0,0 +1,15 @@ +LIBRARY() + +OWNER(g:util) + +PEERDIR( + util/draft + library/cpp/pop_count +) + +SRCS( + bitvector.cpp + readonly_bitvector.cpp +) + +END() |