aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAleksei Borzenkov <snaury@gmail.com>2022-06-30 22:25:27 +0300
committerAleksei Borzenkov <snaury@gmail.com>2022-06-30 22:25:27 +0300
commit744ac9f6b9248d75b97b289a0c7fa3141742769b (patch)
treeda4fd21c08e17c985deea42c7a158b916978ba5a
parent35889738d268486c400d0324ba50b570dc2b39f5 (diff)
downloadydb-744ac9f6b9248d75b97b289a0c7fa3141742769b.tar.gz
Support rollback to snapshot in copy-on-write b+tree, KIKIMR-14732
ref:9320078a8b84fde526ba5dd552cb69161b3fba3c
-rw-r--r--ydb/core/util/btree_cow.h77
-rw-r--r--ydb/core/util/btree_cow_ut.cpp83
2 files changed, 159 insertions, 1 deletions
diff --git a/ydb/core/util/btree_cow.h b/ydb/core/util/btree_cow.h
index 3882e8d930..6430dc8df8 100644
--- a/ydb/core/util/btree_cow.h
+++ b/ydb/core/util/btree_cow.h
@@ -208,6 +208,9 @@ namespace NKikimr {
// Not thread safe, only accessed by a single writer thread
TPage* LastDroppedPage = nullptr;
+ // True when it is possible to rollback to this snapshot
+ bool RollbackSupported = true;
+
// Disable all copy constructors
TSnapshotContext(const TSnapshotContext&) = delete;
TSnapshotContext& operator=(const TSnapshotContext&) = delete;
@@ -890,7 +893,7 @@ namespace NKikimr {
* other threads.
*/
TSnapshot Snapshot() {
- const_cast<TCowBTree*>(this)->CollectGarbage();
+ CollectGarbage();
Y_VERIFY(CurrentEpoch < PageEpochMax, "Epoch overflow: too many snapshots");
@@ -912,6 +915,61 @@ namespace NKikimr {
return TSnapshot(TTreeHandle(this), Root, Size_, InnerHeight_);
}
+ /**
+ * Rollback to a previously created snapshot
+ *
+ * Note that newer snapshots will continue to be valid, but
+ * rolling forward is undefined.
+ */
+ void RollbackTo(const TSnapshot& snapshot) {
+ CollectGarbage();
+
+ TSnapshotContext* context = snapshot.Tree.Context;
+ Y_VERIFY(context, "Cannot rollback to an invalid or an empty snapshot");
+ Y_VERIFY(context->RollbackSupported, "Cannot rollback to a newer snapshot");
+
+ Y_VERIFY(snapshot.Tree.Tree == this, "Cannot rollback to snapshot from a different tree");
+
+ TPage* prevRoot = Root;
+ ui64 epoch = context->Epoch;
+
+ // Restore tree state to the one stored in snapshot
+ Root = const_cast<TPage*>(snapshot.Root);
+ Size_ = snapshot.Size_;
+ InnerHeight_ = snapshot.InnerHeight_;
+ InsertPath.clear();
+
+ // Restore all dropped pages not newer than snapshot epoch
+ for (;;) {
+ TPage** nextDropped = &context->LastDroppedPage;
+ while (TPage* page = *nextDropped) {
+ if (page->Epoch <= epoch) {
+ // Remove it from the list of dropped pages
+ Y_VERIFY_DEBUG(DroppedPages_ > 0);
+ *nextDropped = page->NextDroppedPage;
+ page->NextDroppedPage = nullptr;
+ --DroppedPages_;
+ } else {
+ // Skip and inspect the next page
+ nextDropped = &page->NextDroppedPage;
+ }
+ }
+ context = context->Next;
+ if (!context) {
+ break;
+ }
+ // All newer snapshots cannot be used for rollbacks, since
+ // some invariants of dropped page lists become invalid at
+ // future updates.
+ context->RollbackSupported = false;
+ }
+
+ // Drop newer pages, they will be added to the latest snapshot
+ if (prevRoot) {
+ DropNewerPageRecursive(prevRoot, epoch);
+ }
+ }
+
public:
void clear() {
Clear();
@@ -1748,6 +1806,23 @@ namespace NKikimr {
}
private:
+ void DropNewerPageRecursive(TPage* page, ui64 epoch) {
+ if (page->Epoch <= epoch) {
+ return;
+ }
+
+ if (page->GetTag() == ETag::Inner) {
+ TInnerPage* inner = TInnerPage::From(page);
+ size_t count = inner->Count;
+ TEdge* edges = inner->Edges();
+ for (size_t index = 0; index <= count; ++index) {
+ DropNewerPageRecursive(*edges++, epoch);
+ }
+ }
+
+ DropPage(page);
+ }
+
void DropPageRecursive(TPage* page) {
if (page->GetTag() == ETag::Inner) {
TInnerPage* inner = TInnerPage::From(page);
diff --git a/ydb/core/util/btree_cow_ut.cpp b/ydb/core/util/btree_cow_ut.cpp
index 309ec5b18c..749644bd9b 100644
--- a/ydb/core/util/btree_cow_ut.cpp
+++ b/ydb/core/util/btree_cow_ut.cpp
@@ -584,6 +584,89 @@ Y_UNIT_TEST_SUITE(TCowBTreeTest) {
UNIT_ASSERT_VALUES_EQUAL(tree.DroppedPages(), 0u);
}
+ void DoSnapshotRollback(bool earlyErase) {
+ using TTree = TCowBTree<ui64, ui64>;
+
+ TTree tree;
+ std::map<ui64, ui64> keys;
+
+ TVector<std::pair<TTree::TSnapshot, std::map<ui64, ui64>>> snapshots;
+ snapshots.emplace_back(tree.Snapshot(), keys);
+ TVector<std::pair<TTree::TSnapshot, std::map<ui64, ui64>>> snapshotsOld;
+
+ size_t count = NSan::PlainOrUnderSanitizer(1000000u, 200000u);
+ size_t snapshotEvery = count / 20;
+ size_t rollbackEvery = count / 10;
+
+ auto checkSnapshot = [&](const TTree::TSnapshot& snapshot, const std::map<ui64, ui64>& expected) {
+ auto itTree = snapshot.Iterator();
+ itTree.SeekFirst();
+ auto itMap = expected.begin();
+ while (itTree.IsValid()) {
+ ui64 key = itTree.GetKey();
+ ui64 value = itTree.GetValue();
+ UNIT_ASSERT_C(itMap != expected.end(), "Key " << key << " present in tree, but not in expected map");
+ UNIT_ASSERT_C(key == itMap->first && value == itMap->second, "Found " << key << " value " << value
+ << ", but expected map has " << itMap->first << " value " << itMap->second);
+ itTree.Next();
+ ++itMap;
+ }
+ UNIT_ASSERT_C(itMap == expected.end(), "Expected " << itMap->first << " value " << itMap->second
+ << ", but not found in the tree");
+ };
+
+ for (size_t i = 0; i < count; ++i) {
+ ui64 key = RandomNumber<ui64>();
+ ui64 value = RandomNumber<ui64>();
+ if (keys.emplace(key, value).second) {
+ UNIT_ASSERT_C(
+ tree.Emplace(key, value),
+ "Unexpected failure to insert key " << key);
+ }
+ if (((i + 1) % snapshotEvery) == 0) {
+ // Create a new snapshot periodically
+ snapshots.emplace_back(tree.Snapshot(), keys);
+ }
+ if (((i + 1) % rollbackEvery) == 0) {
+ // Rollback to a random previous snapshot periodically
+ size_t index = RandomNumber<ui64>() % snapshots.size();
+ tree.RollbackTo(snapshots[index].first);
+ keys = snapshots[index].second;
+ for (size_t it = index + 1; it < snapshots.size(); ++it) {
+ snapshotsOld.emplace_back(std::move(snapshots[it]));
+ }
+ snapshots.erase(snapshots.begin() + (index + 1), snapshots.end());
+ while (earlyErase && snapshotsOld.size() > 0) {
+ checkSnapshot(snapshotsOld.back().first, snapshotsOld.back().second);
+ snapshotsOld.pop_back();
+ tree.CollectGarbage();
+ }
+ }
+ }
+
+ while (snapshotsOld.size() > 0) {
+ checkSnapshot(snapshotsOld.back().first, snapshotsOld.back().second);
+ snapshotsOld.pop_back();
+ tree.CollectGarbage();
+ }
+
+ while (snapshots.size() > 0) {
+ checkSnapshot(snapshots.back().first, snapshots.back().second);
+ snapshots.pop_back();
+ tree.CollectGarbage();
+ }
+
+ checkSnapshot(tree.UnsafeSnapshot(), keys);
+ }
+
+ Y_UNIT_TEST(SnapshotRollback) {
+ DoSnapshotRollback(false);
+ }
+
+ Y_UNIT_TEST(SnapshotRollbackEarlyErase) {
+ DoSnapshotRollback(true);
+ }
+
Y_UNIT_TEST(IteratorDestructor) {
using TTree = TCowBTree<ui64, ui64>;