diff options
author | vporyadke <zalyalov@ydb.tech> | 2024-10-16 08:18:37 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-10-16 08:18:37 +0200 |
commit | 743e0929eaa9b4d51d5ca0718a3716050f0bf4a5 (patch) | |
tree | 0cb8b64ac40584cc160aef372aa0a93413179d26 | |
parent | b8023f625029c81266fc6b6deda580960a544837 (diff) | |
download | ydb-743e0929eaa9b4d51d5ca0718a3716050f0bf4a5.tar.gz |
delete nodes from local db every time we delete them from memory (#10051)
-rw-r--r-- | ydb/core/mind/hive/hive_impl.cpp | 16 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.h | 1 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_ut.cpp | 99 | ||||
-rw-r--r-- | ydb/core/mind/hive/node_info.cpp | 4 | ||||
-rw-r--r-- | ydb/core/mind/hive/node_info.h | 2 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__delete_node.cpp | 38 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__load_everything.cpp | 3 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__register_node.cpp | 3 | ||||
-rw-r--r-- | ydb/core/mind/hive/ya.make | 1 |
9 files changed, 157 insertions, 10 deletions
diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index f8322e2559..5cdbd5c785 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -96,7 +96,7 @@ void THive::RestartPipeTx(ui64 tabletId) { } bool THive::TryToDeleteNode(TNodeInfo* node) { - if (node->CanBeDeleted()) { + if (node->CanBeDeleted(TActivationContext::Now())) { BLOG_I("TryToDeleteNode(" << node->Id << "): deleting"); DeleteNode(node->Id); return true; @@ -120,12 +120,15 @@ void THive::Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev) { void THive::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev) { if (ev->Get()->TabletId == TabletID()) { BLOG_TRACE("Handle TEvTabletPipe::TEvServerDisconnected(" << ev->Get()->ClientId << ") " << ev->Get()->ServerId); - TNodeInfo* node = FindNode(ev->Get()->ClientId.NodeId()); + auto nodeId = ev->Get()->ClientId.NodeId(); + TNodeInfo* node = FindNode(nodeId); if (node != nullptr) { Erase(node->PipeServers, ev->Get()->ServerId); if (node->PipeServers.empty() && node->IsUnknown()) { ObjectDistributions.RemoveNode(*node); - TryToDeleteNode(node); + if (TryToDeleteNode(node)) { + Execute(CreateDeleteNode(nodeId)); + } } } } @@ -3427,13 +3430,16 @@ void THive::Handle(TEvPrivate::TEvLogTabletMoves::TPtr&) { } void THive::Handle(TEvPrivate::TEvDeleteNode::TPtr& ev) { - auto node = FindNode(ev->Get()->NodeId); + auto nodeId = ev->Get()->NodeId; + auto node = FindNode(nodeId); if (node == nullptr) { return; } node->DeletionScheduled = false; if (!node->IsAlive()) { - TryToDeleteNode(node); + if (TryToDeleteNode(node)) { + Execute(CreateDeleteNode(nodeId)); + } } } diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index 18f270b371..ccb740754e 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -304,6 +304,7 @@ protected: ITransaction* CreateUpdateTabletsObject(TEvHive::TEvUpdateTabletsObject::TPtr event); ITransaction* CreateUpdateDomain(TSubDomainKey subdomainKey, TEvHive::TEvUpdateDomain::TPtr event = {}); ITransaction* CreateUpdateDcFollowers(const TDataCenterId& dc); + ITransaction* CreateDeleteNode(TNodeId nodeId); public: TDomainsView DomainsView; diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index e3c2a6adbe..495c439092 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -1069,6 +1069,105 @@ Y_UNIT_TEST_SUITE(THiveTest) { UNIT_ASSERT(!isNodeEmpty(nodeId)); } + Y_UNIT_TEST(DrainWithHiveRestart) { + // 1. Drain a node + // 2. Kill it & wait for hive to delete it + // 3. Start the node again + // 4. Restart hive + // 5. Ensure node is not down (by creating tablets) + const int NUM_NODES = 3; + const int NUM_TABLETS = 10; + TTestBasicRuntime runtime(NUM_NODES, false); + Setup(runtime, true, 2, [](TAppPrepare& app) { + app.HiveConfig.SetNodeDeletePeriod(1); + }); + const ui64 hiveTablet = MakeDefaultHiveID(); + const ui64 testerTablet = MakeTabletID(false, 1); + const TActorId hiveActor = CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive); + runtime.EnableScheduleForActor(hiveActor); + { + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES); + runtime.DispatchEvents(options); + } + TTabletTypes::EType tabletType = TTabletTypes::Dummy; + std::unordered_set<TTabletId> tablets; + TActorId senderA = runtime.AllocateEdgeActor(0); + auto createTablets = [&] { + for (int i = 0; i < NUM_TABLETS; ++i) { + THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + tablets.size(), tabletType, BINDED_CHANNELS)); + runtime.SendToPipe(hiveTablet, senderA, ev.Release(), 0, GetPipeConfigWithRetries()); + TAutoPtr<IEventHandle> handle; + auto createTabletReply = runtime.GrabEdgeEventRethrow<TEvHive::TEvCreateTabletReply>(handle); + ui64 tabletId = createTabletReply->Record.GetTabletID(); + tablets.insert(tabletId); + } + NTabletPipe::TClientConfig pipeConfig; + pipeConfig.RetryPolicy = NTabletPipe::TClientRetryPolicy::WithRetries(); + for (TTabletId tabletId : tablets) { + MakeSureTabletIsUp(runtime, tabletId, 0, &pipeConfig); + } + }; + + createTablets(); + + ui32 nodeId = runtime.GetNodeId(2); + { + Ctest << "1. Drain a node\n"; + + runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvDrainNode(nodeId)); + + Ctest << "2. Kill it & wait for hive to delete it\n"; + + SendKillLocal(runtime, 0); + { + TDispatchOptions options; + options.FinalEvents.emplace_back(NHive::TEvPrivate::EvDeleteNode); + runtime.DispatchEvents(options, TDuration::Seconds(6)); + } + } + + auto isNodeEmpty = [&](ui32 nodeId) -> bool { + bool empty = true; + TAutoPtr<IEventHandle> handle; + TActorId whiteboard = NNodeWhiteboard::MakeNodeWhiteboardServiceId(nodeId); + runtime.Send(new IEventHandle(whiteboard, senderA, new NNodeWhiteboard::TEvWhiteboard::TEvTabletStateRequest())); + NNodeWhiteboard::TEvWhiteboard::TEvTabletStateResponse* wbResponse = runtime.GrabEdgeEventRethrow<NNodeWhiteboard::TEvWhiteboard::TEvTabletStateResponse>(handle); + for (const NKikimrWhiteboard::TTabletStateInfo& tabletInfo : wbResponse->Record.GetTabletStateInfo()) { + if (tablets.contains(tabletInfo.GetTabletId()) && tabletInfo.GetState() != NKikimrWhiteboard::TTabletStateInfo::Dead) { + Ctest << "Tablet " << tabletInfo.GetTabletId() << "." << tabletInfo.GetFollowerId() + << " is not dead yet (" << NKikimrWhiteboard::TTabletStateInfo::ETabletState_Name(tabletInfo.GetState()) << ")" << Endl; + empty = false; + } + } + return empty; + }; + + Ctest << "3. Start the node again\n"; + CreateLocal(runtime, 0); + + { + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvLocal::EvStatus); + runtime.DispatchEvents(options); + } + + Ctest << "4. Restart hive\n"; + + runtime.Register(CreateTabletKiller(hiveTablet)); + { + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES); + runtime.DispatchEvents(options); + } + + Ctest << "5. Ensure node is not down (by creating tablets)\n"; + + createTablets(); + + UNIT_ASSERT(!isNodeEmpty(nodeId)); + } + Y_UNIT_TEST(TestCreateSubHiveCreateTablet) { TTestBasicRuntime runtime(1, false); Setup(runtime, true); diff --git a/ydb/core/mind/hive/node_info.cpp b/ydb/core/mind/hive/node_info.cpp index 688cf2955e..ae5d1c4c31 100644 --- a/ydb/core/mind/hive/node_info.cpp +++ b/ydb/core/mind/hive/node_info.cpp @@ -453,7 +453,7 @@ TResourceRawValues TNodeInfo::GetStDevResourceValues() { return GetStDev(values); } -bool TNodeInfo::CanBeDeleted() const { +bool TNodeInfo::CanBeDeleted(TInstant now) const { TInstant lastAlive(TInstant::MilliSeconds(Statistics.GetLastAliveTimestamp())); if (lastAlive) { return (IsDisconnected() || IsUnknown()) @@ -461,7 +461,7 @@ bool TNodeInfo::CanBeDeleted() const { && GetTabletsTotal() == 0 && LockedTablets.empty() && !Freeze - && (lastAlive + Hive.GetNodeDeletePeriod() < TInstant::Now()); + && (lastAlive + Hive.GetNodeDeletePeriod() < now); } else { return (IsDisconnected() || IsUnknown()) && !Local && GetTabletsTotal() == 0 && LockedTablets.empty() && !Freeze; } diff --git a/ydb/core/mind/hive/node_info.h b/ydb/core/mind/hive/node_info.h index d975921bde..4c4483317b 100644 --- a/ydb/core/mind/hive/node_info.h +++ b/ydb/core/mind/hive/node_info.h @@ -232,7 +232,7 @@ public: } } - bool CanBeDeleted() const; + bool CanBeDeleted(TInstant now) const; void RegisterInDomains(); void DeregisterInDomains(); void Ping(); diff --git a/ydb/core/mind/hive/tx__delete_node.cpp b/ydb/core/mind/hive/tx__delete_node.cpp new file mode 100644 index 0000000000..fa3b039826 --- /dev/null +++ b/ydb/core/mind/hive/tx__delete_node.cpp @@ -0,0 +1,38 @@ +#include "hive_impl.h" +#include "hive_log.h" + +namespace NKikimr { +namespace NHive { + +class TTxDeleteNode : public TTransactionBase<THive> { +protected: + TNodeId NodeId; +public: + TTxDeleteNode(TNodeId nodeId, THive *hive) + : TBase(hive) + , NodeId(nodeId) + {} + + bool Execute(TTransactionContext &txc, const TActorContext&) override { + NIceDb::TNiceDb db(txc.DB); + db.Table<Schema::Node>().Key(NodeId).Delete(); + auto restrictionsRowset = db.Table<Schema::TabletAvailabilityRestrictions>().Range(NodeId).Select(); + while (!restrictionsRowset.EndOfSet()) { + db.Table<Schema::TabletAvailabilityRestrictions>().Key(restrictionsRowset.GetKey()).Delete(); + if (!restrictionsRowset.Next()) { + return false; + } + } + return true; + } + + void Complete(const TActorContext&) override { + } + }; + + ITransaction* THive::CreateDeleteNode(TNodeId nodeId) { + return new TTxDeleteNode(nodeId, this); + } + +} // NHive +} // NKikimr diff --git a/ydb/core/mind/hive/tx__load_everything.cpp b/ydb/core/mind/hive/tx__load_everything.cpp index e8988e283b..a067b39b02 100644 --- a/ydb/core/mind/hive/tx__load_everything.cpp +++ b/ydb/core/mind/hive/tx__load_everything.cpp @@ -750,8 +750,9 @@ public: size_t numDeletedNodes = 0; size_t numDeletedRestrictions = 0; + TInstant now = TActivationContext::Now(); for (auto itNode = Self->Nodes.begin(); itNode != Self->Nodes.end();) { - if (itNode->second.CanBeDeleted()) { + if (itNode->second.CanBeDeleted(now)) { ++numDeletedNodes; auto restrictionsRowset = db.Table<Schema::TabletAvailabilityRestrictions>().Range(itNode->first).Select(); while (!restrictionsRowset.EndOfSet()) { diff --git a/ydb/core/mind/hive/tx__register_node.cpp b/ydb/core/mind/hive/tx__register_node.cpp index 7e8fb49b1f..7c40397313 100644 --- a/ydb/core/mind/hive/tx__register_node.cpp +++ b/ydb/core/mind/hive/tx__register_node.cpp @@ -23,7 +23,7 @@ public: TNodeId nodeId = Local.NodeId(); TNodeInfo& node = Self->GetNode(nodeId); if (node.Local != Local) { - TInstant now = TInstant::Now(); + TInstant now = TActivationContext::Now(); node.Statistics.AddRestartTimestamp(now.MilliSeconds()); node.ActualizeNodeStatistics(now); for (const auto& t : node.Tablets) { @@ -57,6 +57,7 @@ public: db.Table<Schema::Node>().Key(nodeId).Update<Schema::Node::Down, Schema::Node::Freeze>(false, false); } if (node.BecomeUpOnRestart) { + BLOG_TRACE("THive::TTxRegisterNode(" << Local.NodeId() << ")::Execute - node became up on restart"); node.SetDown(false); node.BecomeUpOnRestart = false; db.Table<Schema::Node>().Key(nodeId).Update<Schema::Node::Down, Schema::Node::BecomeUpOnRestart>(false, false); diff --git a/ydb/core/mind/hive/ya.make b/ydb/core/mind/hive/ya.make index ec05c7fd9a..3fad547f0b 100644 --- a/ydb/core/mind/hive/ya.make +++ b/ydb/core/mind/hive/ya.make @@ -48,6 +48,7 @@ SRCS( tx__configure_subdomain.cpp tx__create_tablet.cpp tx__cut_tablet_history.cpp + tx__delete_node.cpp tx__delete_tablet.cpp tx__delete_tablet_result.cpp tx__disconnect_node.cpp |