diff options
author | Andrey Molotkov <[email protected]> | 2025-06-03 19:25:30 +0300 |
---|---|---|
committer | GitHub <[email protected]> | 2025-06-03 19:25:30 +0300 |
commit | 6079d14e8b4e5b0d709193eb278d890372ea2aa2 (patch) | |
tree | a7879573b083d5eba76b740b37dfb29c7f54c012 | |
parent | 9f92ebb034cea30aa04d9bcce9a98184a60af1a0 (diff) |
Fix not consistent generation counters for data erasure in SchemeShard tablet and BSC tablet (#19019)
5 files changed, 106 insertions, 16 deletions
diff --git a/ydb/core/tx/schemeshard/schemeshard__data_erasure_manager.h b/ydb/core/tx/schemeshard/schemeshard__data_erasure_manager.h index 3d49a05152a..0e5cd3a0498 100644 --- a/ydb/core/tx/schemeshard/schemeshard__data_erasure_manager.h +++ b/ydb/core/tx/schemeshard/schemeshard__data_erasure_manager.h @@ -54,6 +54,7 @@ public: virtual bool Remove(const TPathId& pathId) = 0; virtual bool Remove(const TShardIdx& shardIdx) = 0; virtual void HandleNewPartitioning(const std::vector<TShardIdx>& dataErasureShards, NIceDb::TNiceDb& db) = 0; + virtual void SyncBscGeneration(NIceDb::TNiceDb& db, ui64 currentBscGeneration) = 0; void Clear(); @@ -128,6 +129,7 @@ public: bool Remove(const TPathId& pathId) override; bool Remove(const TShardIdx& shardIdx) override; void HandleNewPartitioning(const std::vector<TShardIdx>& dataErasureShards, NIceDb::TNiceDb& db) override; + void SyncBscGeneration(NIceDb::TNiceDb& db, ui64 currentBscGeneration) override; private: static TQueue::TConfig ConvertConfig(const NKikimrConfig::TDataErasureConfig& config); @@ -189,6 +191,7 @@ public: bool Remove(const TPathId& pathId) override; bool Remove(const TShardIdx& shardIdx) override; void HandleNewPartitioning(const std::vector<TShardIdx>& dataErasureShards, NIceDb::TNiceDb& db) override; + void SyncBscGeneration(NIceDb::TNiceDb& db, ui64 currentBscGeneration) override; private: static TQueue::TConfig ConvertConfig(const NKikimrConfig::TDataErasureConfig& config); diff --git a/ydb/core/tx/schemeshard/schemeshard__root_data_erasure_manager.cpp b/ydb/core/tx/schemeshard/schemeshard__root_data_erasure_manager.cpp index 6db5af28621..258bf928164 100644 --- a/ydb/core/tx/schemeshard/schemeshard__root_data_erasure_manager.cpp +++ b/ydb/core/tx/schemeshard/schemeshard__root_data_erasure_manager.cpp @@ -131,7 +131,7 @@ void TRootDataErasureManager::Run(NIceDb::TNiceDb& db) { Status = EDataErasureStatus::IN_PROGRESS_BSC; } db.Table<Schema::DataErasureGenerations>().Key(Generation).Update<Schema::DataErasureGenerations::Status, - Schema::DataErasureGenerations::StartTime>(Status, StartTime.MicroSeconds()); + Schema::DataErasureGenerations::StartTime>(Status, StartTime.MicroSeconds()); const auto ctx = SchemeShard->ActorContext(); LOG_NOTICE_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, @@ -481,6 +481,13 @@ void TRootDataErasureManager::HandleNewPartitioning(const std::vector<TShardIdx> LOG_WARN_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, "[RootDataErasureManager] [HandleNewPartitioning] Cannot execute in root schemeshard: " << SchemeShard->TabletID()); } +void TRootDataErasureManager::SyncBscGeneration(NIceDb::TNiceDb& db, ui64 currentBscGeneration) { + db.Table<Schema::DataErasureGenerations>().Key(GetGeneration()).Delete(); + SetGeneration(currentBscGeneration + 1); + db.Table<Schema::DataErasureGenerations>().Key(GetGeneration()).Update<Schema::DataErasureGenerations::Status, + Schema::DataErasureGenerations::StartTime>(GetStatus(), StartTime.MicroSeconds()); +} + void TRootDataErasureManager::UpdateMetrics() { SchemeShard->TabletCounters->Simple()[COUNTER_DATA_ERASURE_QUEUE_SIZE].Set(Queue->Size()); SchemeShard->TabletCounters->Simple()[COUNTER_DATA_ERASURE_QUEUE_RUNNING].Set(Queue->RunningSize()); @@ -633,13 +640,15 @@ struct TSchemeShard::TTxCompleteDataErasureBSC : public TSchemeShard::TRwTxBase const auto& record = Ev->Get()->Record; auto& manager = Self->DataErasureManager; - if (record.GetCurrentGeneration() != manager->GetGeneration()) { + NIceDb::TNiceDb db(txc.DB); + if (ui64 currentBscGeneration = record.GetCurrentGeneration(); currentBscGeneration > manager->GetGeneration()) { LOG_DEBUG_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, - "TTxCompleteDataErasureBSC Unknown generation#" << record.GetCurrentGeneration() << ", Expected gen# " << manager->GetGeneration() << " at schemestard: " << Self->TabletID()); + "TTxCompleteDataErasureBSC Unknown generation#" << currentBscGeneration << ", Expected gen# " << manager->GetGeneration() << " at schemestard: " << Self->TabletID()); + manager->SyncBscGeneration(db, currentBscGeneration); + manager->SendRequestToBSC(); return; } - NIceDb::TNiceDb db(txc.DB); if (record.GetCompleted()) { LOG_NOTICE_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, "TTxCompleteDataErasureBSC: Data shred in BSC is completed"); manager->Complete(); diff --git a/ydb/core/tx/schemeshard/schemeshard__tenant_data_erasure_manager.cpp b/ydb/core/tx/schemeshard/schemeshard__tenant_data_erasure_manager.cpp index 7df521ee5f1..b9404aa293a 100644 --- a/ydb/core/tx/schemeshard/schemeshard__tenant_data_erasure_manager.cpp +++ b/ydb/core/tx/schemeshard/schemeshard__tenant_data_erasure_manager.cpp @@ -445,6 +445,12 @@ void TTenantDataErasureManager::HandleNewPartitioning(const std::vector<TShardId << ", Status# " << static_cast<ui32>(Status)); } +void TTenantDataErasureManager::SyncBscGeneration(NIceDb::TNiceDb&, ui64) { + auto ctx = SchemeShard->ActorContext(); + LOG_WARN_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, + "[TenantDataErasureManager] [SyncBscGeneration] Cannot execute in tenant schemeshard: " << SchemeShard->TabletID()); +} + void TTenantDataErasureManager::UpdateMetrics() { SchemeShard->TabletCounters->Simple()[COUNTER_TENANT_DATA_ERASURE_QUEUE_SIZE].Set(Queue->Size()); SchemeShard->TabletCounters->Simple()[COUNTER_TENANT_DATA_ERASURE_QUEUE_RUNNING].Set(Queue->RunningSize()); diff --git a/ydb/core/tx/schemeshard/ut_data_erasure/ut_data_erasure.cpp b/ydb/core/tx/schemeshard/ut_data_erasure/ut_data_erasure.cpp index ce8bc7ee0d2..e9f989e4b1b 100644 --- a/ydb/core/tx/schemeshard/ut_data_erasure/ut_data_erasure.cpp +++ b/ydb/core/tx/schemeshard/ut_data_erasure/ut_data_erasure.cpp @@ -74,7 +74,7 @@ namespace { } Y_UNIT_TEST_SUITE(TestDataErasure) { - void SimpleDataErasureTest(const TSchemeObject& createSchemeObject) { + void SimpleDataErasureTest(const TSchemeObject& createSchemeObject, ui64 currentBscGeneration = 0) { TTestBasicRuntime runtime; TTestEnv env(runtime); @@ -92,6 +92,11 @@ Y_UNIT_TEST_SUITE(TestDataErasure) { dataErasureConfig.SetBlobStorageControllerRequestIntervalSeconds(1); auto sender = runtime.AllocateEdgeActor(); + // Change BSC counter value between data erasure iterations + if (currentBscGeneration > 1) { + auto request = MakeHolder<TEvBlobStorage::TEvControllerShredRequest>(currentBscGeneration); + runtime.SendToPipe(MakeBSControllerID(), sender, request.Release(), 0, GetPipeConfigWithRetries()); + } RebootTablet(runtime, TTestTxConfig::SchemeShard, sender); ui64 txId = 100; @@ -100,7 +105,7 @@ Y_UNIT_TEST_SUITE(TestDataErasure) { CreateTestExtSubdomain(runtime, env, &txId, "Database2", createSchemeObject); TDispatchOptions options; - options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerShredResponse, 3)); + options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerShredResponse, (currentBscGeneration > 1 ? 4 : 3))); runtime.DispatchEvents(options); auto request = MakeHolder<TEvSchemeShard::TEvDataErasureInfoRequest>(); @@ -109,22 +114,30 @@ Y_UNIT_TEST_SUITE(TestDataErasure) { TAutoPtr<IEventHandle> handle; auto response = runtime.GrabEdgeEventRethrow<TEvSchemeShard::TEvDataErasureInfoResponse>(handle); - UNIT_ASSERT_EQUAL_C(response->Record.GetGeneration(), 1, response->Record.GetGeneration()); + if (currentBscGeneration > 1) { + UNIT_ASSERT_EQUAL_C(response->Record.GetGeneration(), currentBscGeneration + 1, response->Record.GetGeneration()); + } else { + UNIT_ASSERT_EQUAL_C(response->Record.GetGeneration(), 1, response->Record.GetGeneration()); + } UNIT_ASSERT_EQUAL(response->Record.GetStatus(), NKikimrScheme::TEvDataErasureInfoResponse::COMPLETED); } - Y_UNIT_TEST(SimpleDataErasureTestForTables) { + Y_UNIT_TEST(SimpleTestForTables) { SimpleDataErasureTest({.Table = true, .Topic = false}); } - Y_UNIT_TEST(SimpleDataErasureTestForTopic) { + Y_UNIT_TEST(SimpleTestForTopic) { SimpleDataErasureTest({.Table = false, .Topic = true}); } - Y_UNIT_TEST(SimpleDataErasureTestForAllSupportedObjects) { + Y_UNIT_TEST(SimpleTestForAllSupportedObjects) { SimpleDataErasureTest({.Table = true, .Topic = true}); } + Y_UNIT_TEST(SchemeShardCounterDoesNotConsistWithBscCounter) { + SimpleDataErasureTest({.Table = true, .Topic = false}, /*currentBscGeneration*/ 47); + } + void DataErasureRun3Cycles(const TSchemeObject& createSchemeObject) { TTestBasicRuntime runtime; TTestEnv env(runtime); @@ -164,15 +177,15 @@ Y_UNIT_TEST_SUITE(TestDataErasure) { UNIT_ASSERT_EQUAL(response->Record.GetStatus(), NKikimrScheme::TEvDataErasureInfoResponse::COMPLETED); } - Y_UNIT_TEST(DataErasureRun3CyclesForTables) { + Y_UNIT_TEST(Run3CyclesForTables) { DataErasureRun3Cycles({.Table = true, .Topic = false}); } - Y_UNIT_TEST(DataErasureRun3CyclesForTopics) { + Y_UNIT_TEST(Run3CyclesForTopics) { DataErasureRun3Cycles({.Table = false, .Topic = true}); } - Y_UNIT_TEST(DataErasureRun3CyclesForAllSupportedObjects) { + Y_UNIT_TEST(Run3CyclesForAllSupportedObjects) { DataErasureRun3Cycles({.Table = true, .Topic = true}); } @@ -220,7 +233,7 @@ Y_UNIT_TEST_SUITE(TestDataErasure) { UNIT_ASSERT_EQUAL(response->Record.GetStatus(), NKikimrScheme::TEvDataErasureInfoResponse::COMPLETED); } - Y_UNIT_TEST(DataErasureManualLaunch3Cycles) { + Y_UNIT_TEST(ManualLaunch3Cycles) { TTestBasicRuntime runtime; TTestEnv env(runtime); @@ -271,6 +284,67 @@ Y_UNIT_TEST_SUITE(TestDataErasure) { RunDataErasure(3); } + Y_UNIT_TEST(ManualLaunch3CyclesWithNotConsistentCountersInSchemeShardAndBSC) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + + runtime.SetLogPriority(NKikimrServices::TX_PROXY, NLog::PRI_DEBUG); + runtime.SetLogPriority(NKikimrServices::FLAT_TX_SCHEMESHARD, NActors::NLog::PRI_TRACE); + + auto info = CreateTestTabletInfo(MakeBSControllerID(), TTabletTypes::BSController); + CreateTestBootstrapper(runtime, info, [](const TActorId &tablet, TTabletStorageInfo *info) -> IActor* { + return new TFakeBSController(tablet, info); + }); + + runtime.GetAppData().FeatureFlags.SetEnableDataErasure(true); + auto& dataErasureConfig = runtime.GetAppData().DataErasureConfig; + dataErasureConfig.SetDataErasureIntervalSeconds(0); // do not schedule + dataErasureConfig.SetBlobStorageControllerRequestIntervalSeconds(1); + + auto sender = runtime.AllocateEdgeActor(); + RebootTablet(runtime, TTestTxConfig::SchemeShard, sender); + + ui64 txId = 100; + + CreateTestExtSubdomain(runtime, env, &txId, "Database1"); + CreateTestExtSubdomain(runtime, env, &txId, "Database2"); + + auto RunDataErasure = [&runtime] (ui32 expectedGeneration, ui32 requiredCountShredResponses) { + auto sender = runtime.AllocateEdgeActor(); + { + auto request = MakeHolder<TEvSchemeShard::TEvDataErasureManualStartupRequest>(); + runtime.SendToPipe(TTestTxConfig::SchemeShard, sender, request.Release(), 0, GetPipeConfigWithRetries()); + } + + TDispatchOptions options; + options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerShredResponse, requiredCountShredResponses)); + runtime.DispatchEvents(options); + + auto request = MakeHolder<TEvSchemeShard::TEvDataErasureInfoRequest>(); + runtime.SendToPipe(TTestTxConfig::SchemeShard, sender, request.Release(), 0, GetPipeConfigWithRetries()); + + TAutoPtr<IEventHandle> handle; + auto response = runtime.GrabEdgeEventRethrow<TEvSchemeShard::TEvDataErasureInfoResponse>(handle); + + UNIT_ASSERT_EQUAL_C(response->Record.GetGeneration(), expectedGeneration, response->Record.GetGeneration()); + UNIT_ASSERT_EQUAL(response->Record.GetStatus(), NKikimrScheme::TEvDataErasureInfoResponse::COMPLETED); + }; + + RunDataErasure(1, 3); + // Change BSC counter value between data erasure iterations + { + auto request = MakeHolder<TEvBlobStorage::TEvControllerShredRequest>(50); + runtime.SendToPipe(MakeBSControllerID(), sender, request.Release(), 0, GetPipeConfigWithRetries()); + } + RunDataErasure(51, 4); + // Change BSC counter value between data erasure iterations + { + auto request = MakeHolder<TEvBlobStorage::TEvControllerShredRequest>(100); + runtime.SendToPipe(MakeBSControllerID(), sender, request.Release(), 0, GetPipeConfigWithRetries()); + } + RunDataErasure(101, 4); + } + Y_UNIT_TEST(DataErasureWithCopyTable) { TTestBasicRuntime runtime; TVector<TIntrusivePtr<NFake::TProxyDS>> dsProxies { diff --git a/ydb/core/tx/schemeshard/ut_data_erasure/ya.make b/ydb/core/tx/schemeshard/ut_data_erasure/ya.make index 1eee45270fb..26656e5d129 100644 --- a/ydb/core/tx/schemeshard/ut_data_erasure/ya.make +++ b/ydb/core/tx/schemeshard/ut_data_erasure/ya.make @@ -4,8 +4,6 @@ FORK_SUBTESTS() SPLIT_FACTOR(10) -TIMEOUT(20) - IF (SANITIZER_TYPE == "thread" OR WITH_VALGRIND) SIZE(LARGE) TAG(ya:fat) |