diff options
author | Aleksei Borzenkov <snaury@gmail.com> | 2022-06-30 22:25:27 +0300 |
---|---|---|
committer | Aleksei Borzenkov <snaury@gmail.com> | 2022-06-30 22:25:27 +0300 |
commit | 744ac9f6b9248d75b97b289a0c7fa3141742769b (patch) | |
tree | da4fd21c08e17c985deea42c7a158b916978ba5a | |
parent | 35889738d268486c400d0324ba50b570dc2b39f5 (diff) | |
download | ydb-744ac9f6b9248d75b97b289a0c7fa3141742769b.tar.gz |
Support rollback to snapshot in copy-on-write b+tree, KIKIMR-14732
ref:9320078a8b84fde526ba5dd552cb69161b3fba3c
-rw-r--r-- | ydb/core/util/btree_cow.h | 77 | ||||
-rw-r--r-- | ydb/core/util/btree_cow_ut.cpp | 83 |
2 files changed, 159 insertions, 1 deletions
diff --git a/ydb/core/util/btree_cow.h b/ydb/core/util/btree_cow.h index 3882e8d930..6430dc8df8 100644 --- a/ydb/core/util/btree_cow.h +++ b/ydb/core/util/btree_cow.h @@ -208,6 +208,9 @@ namespace NKikimr { // Not thread safe, only accessed by a single writer thread TPage* LastDroppedPage = nullptr; + // True when it is possible to rollback to this snapshot + bool RollbackSupported = true; + // Disable all copy constructors TSnapshotContext(const TSnapshotContext&) = delete; TSnapshotContext& operator=(const TSnapshotContext&) = delete; @@ -890,7 +893,7 @@ namespace NKikimr { * other threads. */ TSnapshot Snapshot() { - const_cast<TCowBTree*>(this)->CollectGarbage(); + CollectGarbage(); Y_VERIFY(CurrentEpoch < PageEpochMax, "Epoch overflow: too many snapshots"); @@ -912,6 +915,61 @@ namespace NKikimr { return TSnapshot(TTreeHandle(this), Root, Size_, InnerHeight_); } + /** + * Rollback to a previously created snapshot + * + * Note that newer snapshots will continue to be valid, but + * rolling forward is undefined. + */ + void RollbackTo(const TSnapshot& snapshot) { + CollectGarbage(); + + TSnapshotContext* context = snapshot.Tree.Context; + Y_VERIFY(context, "Cannot rollback to an invalid or an empty snapshot"); + Y_VERIFY(context->RollbackSupported, "Cannot rollback to a newer snapshot"); + + Y_VERIFY(snapshot.Tree.Tree == this, "Cannot rollback to snapshot from a different tree"); + + TPage* prevRoot = Root; + ui64 epoch = context->Epoch; + + // Restore tree state to the one stored in snapshot + Root = const_cast<TPage*>(snapshot.Root); + Size_ = snapshot.Size_; + InnerHeight_ = snapshot.InnerHeight_; + InsertPath.clear(); + + // Restore all dropped pages not newer than snapshot epoch + for (;;) { + TPage** nextDropped = &context->LastDroppedPage; + while (TPage* page = *nextDropped) { + if (page->Epoch <= epoch) { + // Remove it from the list of dropped pages + Y_VERIFY_DEBUG(DroppedPages_ > 0); + *nextDropped = page->NextDroppedPage; + page->NextDroppedPage = nullptr; + --DroppedPages_; + } else { + // Skip and inspect the next page + nextDropped = &page->NextDroppedPage; + } + } + context = context->Next; + if (!context) { + break; + } + // All newer snapshots cannot be used for rollbacks, since + // some invariants of dropped page lists become invalid at + // future updates. + context->RollbackSupported = false; + } + + // Drop newer pages, they will be added to the latest snapshot + if (prevRoot) { + DropNewerPageRecursive(prevRoot, epoch); + } + } + public: void clear() { Clear(); @@ -1748,6 +1806,23 @@ namespace NKikimr { } private: + void DropNewerPageRecursive(TPage* page, ui64 epoch) { + if (page->Epoch <= epoch) { + return; + } + + if (page->GetTag() == ETag::Inner) { + TInnerPage* inner = TInnerPage::From(page); + size_t count = inner->Count; + TEdge* edges = inner->Edges(); + for (size_t index = 0; index <= count; ++index) { + DropNewerPageRecursive(*edges++, epoch); + } + } + + DropPage(page); + } + void DropPageRecursive(TPage* page) { if (page->GetTag() == ETag::Inner) { TInnerPage* inner = TInnerPage::From(page); diff --git a/ydb/core/util/btree_cow_ut.cpp b/ydb/core/util/btree_cow_ut.cpp index 309ec5b18c..749644bd9b 100644 --- a/ydb/core/util/btree_cow_ut.cpp +++ b/ydb/core/util/btree_cow_ut.cpp @@ -584,6 +584,89 @@ Y_UNIT_TEST_SUITE(TCowBTreeTest) { UNIT_ASSERT_VALUES_EQUAL(tree.DroppedPages(), 0u); } + void DoSnapshotRollback(bool earlyErase) { + using TTree = TCowBTree<ui64, ui64>; + + TTree tree; + std::map<ui64, ui64> keys; + + TVector<std::pair<TTree::TSnapshot, std::map<ui64, ui64>>> snapshots; + snapshots.emplace_back(tree.Snapshot(), keys); + TVector<std::pair<TTree::TSnapshot, std::map<ui64, ui64>>> snapshotsOld; + + size_t count = NSan::PlainOrUnderSanitizer(1000000u, 200000u); + size_t snapshotEvery = count / 20; + size_t rollbackEvery = count / 10; + + auto checkSnapshot = [&](const TTree::TSnapshot& snapshot, const std::map<ui64, ui64>& expected) { + auto itTree = snapshot.Iterator(); + itTree.SeekFirst(); + auto itMap = expected.begin(); + while (itTree.IsValid()) { + ui64 key = itTree.GetKey(); + ui64 value = itTree.GetValue(); + UNIT_ASSERT_C(itMap != expected.end(), "Key " << key << " present in tree, but not in expected map"); + UNIT_ASSERT_C(key == itMap->first && value == itMap->second, "Found " << key << " value " << value + << ", but expected map has " << itMap->first << " value " << itMap->second); + itTree.Next(); + ++itMap; + } + UNIT_ASSERT_C(itMap == expected.end(), "Expected " << itMap->first << " value " << itMap->second + << ", but not found in the tree"); + }; + + for (size_t i = 0; i < count; ++i) { + ui64 key = RandomNumber<ui64>(); + ui64 value = RandomNumber<ui64>(); + if (keys.emplace(key, value).second) { + UNIT_ASSERT_C( + tree.Emplace(key, value), + "Unexpected failure to insert key " << key); + } + if (((i + 1) % snapshotEvery) == 0) { + // Create a new snapshot periodically + snapshots.emplace_back(tree.Snapshot(), keys); + } + if (((i + 1) % rollbackEvery) == 0) { + // Rollback to a random previous snapshot periodically + size_t index = RandomNumber<ui64>() % snapshots.size(); + tree.RollbackTo(snapshots[index].first); + keys = snapshots[index].second; + for (size_t it = index + 1; it < snapshots.size(); ++it) { + snapshotsOld.emplace_back(std::move(snapshots[it])); + } + snapshots.erase(snapshots.begin() + (index + 1), snapshots.end()); + while (earlyErase && snapshotsOld.size() > 0) { + checkSnapshot(snapshotsOld.back().first, snapshotsOld.back().second); + snapshotsOld.pop_back(); + tree.CollectGarbage(); + } + } + } + + while (snapshotsOld.size() > 0) { + checkSnapshot(snapshotsOld.back().first, snapshotsOld.back().second); + snapshotsOld.pop_back(); + tree.CollectGarbage(); + } + + while (snapshots.size() > 0) { + checkSnapshot(snapshots.back().first, snapshots.back().second); + snapshots.pop_back(); + tree.CollectGarbage(); + } + + checkSnapshot(tree.UnsafeSnapshot(), keys); + } + + Y_UNIT_TEST(SnapshotRollback) { + DoSnapshotRollback(false); + } + + Y_UNIT_TEST(SnapshotRollbackEarlyErase) { + DoSnapshotRollback(true); + } + Y_UNIT_TEST(IteratorDestructor) { using TTree = TCowBTree<ui64, ui64>; |