summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrey Molotkov <[email protected]>2025-06-03 19:25:30 +0300
committerGitHub <[email protected]>2025-06-03 19:25:30 +0300
commit6079d14e8b4e5b0d709193eb278d890372ea2aa2 (patch)
treea7879573b083d5eba76b740b37dfb29c7f54c012
parent9f92ebb034cea30aa04d9bcce9a98184a60af1a0 (diff)
Fix not consistent generation counters for data erasure in SchemeShard tablet and BSC tablet (#19019)
-rw-r--r--ydb/core/tx/schemeshard/schemeshard__data_erasure_manager.h3
-rw-r--r--ydb/core/tx/schemeshard/schemeshard__root_data_erasure_manager.cpp17
-rw-r--r--ydb/core/tx/schemeshard/schemeshard__tenant_data_erasure_manager.cpp6
-rw-r--r--ydb/core/tx/schemeshard/ut_data_erasure/ut_data_erasure.cpp94
-rw-r--r--ydb/core/tx/schemeshard/ut_data_erasure/ya.make2
5 files changed, 106 insertions, 16 deletions
diff --git a/ydb/core/tx/schemeshard/schemeshard__data_erasure_manager.h b/ydb/core/tx/schemeshard/schemeshard__data_erasure_manager.h
index 3d49a05152a..0e5cd3a0498 100644
--- a/ydb/core/tx/schemeshard/schemeshard__data_erasure_manager.h
+++ b/ydb/core/tx/schemeshard/schemeshard__data_erasure_manager.h
@@ -54,6 +54,7 @@ public:
virtual bool Remove(const TPathId& pathId) = 0;
virtual bool Remove(const TShardIdx& shardIdx) = 0;
virtual void HandleNewPartitioning(const std::vector<TShardIdx>& dataErasureShards, NIceDb::TNiceDb& db) = 0;
+ virtual void SyncBscGeneration(NIceDb::TNiceDb& db, ui64 currentBscGeneration) = 0;
void Clear();
@@ -128,6 +129,7 @@ public:
bool Remove(const TPathId& pathId) override;
bool Remove(const TShardIdx& shardIdx) override;
void HandleNewPartitioning(const std::vector<TShardIdx>& dataErasureShards, NIceDb::TNiceDb& db) override;
+ void SyncBscGeneration(NIceDb::TNiceDb& db, ui64 currentBscGeneration) override;
private:
static TQueue::TConfig ConvertConfig(const NKikimrConfig::TDataErasureConfig& config);
@@ -189,6 +191,7 @@ public:
bool Remove(const TPathId& pathId) override;
bool Remove(const TShardIdx& shardIdx) override;
void HandleNewPartitioning(const std::vector<TShardIdx>& dataErasureShards, NIceDb::TNiceDb& db) override;
+ void SyncBscGeneration(NIceDb::TNiceDb& db, ui64 currentBscGeneration) override;
private:
static TQueue::TConfig ConvertConfig(const NKikimrConfig::TDataErasureConfig& config);
diff --git a/ydb/core/tx/schemeshard/schemeshard__root_data_erasure_manager.cpp b/ydb/core/tx/schemeshard/schemeshard__root_data_erasure_manager.cpp
index 6db5af28621..258bf928164 100644
--- a/ydb/core/tx/schemeshard/schemeshard__root_data_erasure_manager.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard__root_data_erasure_manager.cpp
@@ -131,7 +131,7 @@ void TRootDataErasureManager::Run(NIceDb::TNiceDb& db) {
Status = EDataErasureStatus::IN_PROGRESS_BSC;
}
db.Table<Schema::DataErasureGenerations>().Key(Generation).Update<Schema::DataErasureGenerations::Status,
- Schema::DataErasureGenerations::StartTime>(Status, StartTime.MicroSeconds());
+ Schema::DataErasureGenerations::StartTime>(Status, StartTime.MicroSeconds());
const auto ctx = SchemeShard->ActorContext();
LOG_NOTICE_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD,
@@ -481,6 +481,13 @@ void TRootDataErasureManager::HandleNewPartitioning(const std::vector<TShardIdx>
LOG_WARN_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, "[RootDataErasureManager] [HandleNewPartitioning] Cannot execute in root schemeshard: " << SchemeShard->TabletID());
}
+void TRootDataErasureManager::SyncBscGeneration(NIceDb::TNiceDb& db, ui64 currentBscGeneration) {
+ db.Table<Schema::DataErasureGenerations>().Key(GetGeneration()).Delete();
+ SetGeneration(currentBscGeneration + 1);
+ db.Table<Schema::DataErasureGenerations>().Key(GetGeneration()).Update<Schema::DataErasureGenerations::Status,
+ Schema::DataErasureGenerations::StartTime>(GetStatus(), StartTime.MicroSeconds());
+}
+
void TRootDataErasureManager::UpdateMetrics() {
SchemeShard->TabletCounters->Simple()[COUNTER_DATA_ERASURE_QUEUE_SIZE].Set(Queue->Size());
SchemeShard->TabletCounters->Simple()[COUNTER_DATA_ERASURE_QUEUE_RUNNING].Set(Queue->RunningSize());
@@ -633,13 +640,15 @@ struct TSchemeShard::TTxCompleteDataErasureBSC : public TSchemeShard::TRwTxBase
const auto& record = Ev->Get()->Record;
auto& manager = Self->DataErasureManager;
- if (record.GetCurrentGeneration() != manager->GetGeneration()) {
+ NIceDb::TNiceDb db(txc.DB);
+ if (ui64 currentBscGeneration = record.GetCurrentGeneration(); currentBscGeneration > manager->GetGeneration()) {
LOG_DEBUG_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD,
- "TTxCompleteDataErasureBSC Unknown generation#" << record.GetCurrentGeneration() << ", Expected gen# " << manager->GetGeneration() << " at schemestard: " << Self->TabletID());
+ "TTxCompleteDataErasureBSC Unknown generation#" << currentBscGeneration << ", Expected gen# " << manager->GetGeneration() << " at schemestard: " << Self->TabletID());
+ manager->SyncBscGeneration(db, currentBscGeneration);
+ manager->SendRequestToBSC();
return;
}
- NIceDb::TNiceDb db(txc.DB);
if (record.GetCompleted()) {
LOG_NOTICE_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, "TTxCompleteDataErasureBSC: Data shred in BSC is completed");
manager->Complete();
diff --git a/ydb/core/tx/schemeshard/schemeshard__tenant_data_erasure_manager.cpp b/ydb/core/tx/schemeshard/schemeshard__tenant_data_erasure_manager.cpp
index 7df521ee5f1..b9404aa293a 100644
--- a/ydb/core/tx/schemeshard/schemeshard__tenant_data_erasure_manager.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard__tenant_data_erasure_manager.cpp
@@ -445,6 +445,12 @@ void TTenantDataErasureManager::HandleNewPartitioning(const std::vector<TShardId
<< ", Status# " << static_cast<ui32>(Status));
}
+void TTenantDataErasureManager::SyncBscGeneration(NIceDb::TNiceDb&, ui64) {
+ auto ctx = SchemeShard->ActorContext();
+ LOG_WARN_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD,
+ "[TenantDataErasureManager] [SyncBscGeneration] Cannot execute in tenant schemeshard: " << SchemeShard->TabletID());
+}
+
void TTenantDataErasureManager::UpdateMetrics() {
SchemeShard->TabletCounters->Simple()[COUNTER_TENANT_DATA_ERASURE_QUEUE_SIZE].Set(Queue->Size());
SchemeShard->TabletCounters->Simple()[COUNTER_TENANT_DATA_ERASURE_QUEUE_RUNNING].Set(Queue->RunningSize());
diff --git a/ydb/core/tx/schemeshard/ut_data_erasure/ut_data_erasure.cpp b/ydb/core/tx/schemeshard/ut_data_erasure/ut_data_erasure.cpp
index ce8bc7ee0d2..e9f989e4b1b 100644
--- a/ydb/core/tx/schemeshard/ut_data_erasure/ut_data_erasure.cpp
+++ b/ydb/core/tx/schemeshard/ut_data_erasure/ut_data_erasure.cpp
@@ -74,7 +74,7 @@ namespace {
}
Y_UNIT_TEST_SUITE(TestDataErasure) {
- void SimpleDataErasureTest(const TSchemeObject& createSchemeObject) {
+ void SimpleDataErasureTest(const TSchemeObject& createSchemeObject, ui64 currentBscGeneration = 0) {
TTestBasicRuntime runtime;
TTestEnv env(runtime);
@@ -92,6 +92,11 @@ Y_UNIT_TEST_SUITE(TestDataErasure) {
dataErasureConfig.SetBlobStorageControllerRequestIntervalSeconds(1);
auto sender = runtime.AllocateEdgeActor();
+ // Change BSC counter value between data erasure iterations
+ if (currentBscGeneration > 1) {
+ auto request = MakeHolder<TEvBlobStorage::TEvControllerShredRequest>(currentBscGeneration);
+ runtime.SendToPipe(MakeBSControllerID(), sender, request.Release(), 0, GetPipeConfigWithRetries());
+ }
RebootTablet(runtime, TTestTxConfig::SchemeShard, sender);
ui64 txId = 100;
@@ -100,7 +105,7 @@ Y_UNIT_TEST_SUITE(TestDataErasure) {
CreateTestExtSubdomain(runtime, env, &txId, "Database2", createSchemeObject);
TDispatchOptions options;
- options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerShredResponse, 3));
+ options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerShredResponse, (currentBscGeneration > 1 ? 4 : 3)));
runtime.DispatchEvents(options);
auto request = MakeHolder<TEvSchemeShard::TEvDataErasureInfoRequest>();
@@ -109,22 +114,30 @@ Y_UNIT_TEST_SUITE(TestDataErasure) {
TAutoPtr<IEventHandle> handle;
auto response = runtime.GrabEdgeEventRethrow<TEvSchemeShard::TEvDataErasureInfoResponse>(handle);
- UNIT_ASSERT_EQUAL_C(response->Record.GetGeneration(), 1, response->Record.GetGeneration());
+ if (currentBscGeneration > 1) {
+ UNIT_ASSERT_EQUAL_C(response->Record.GetGeneration(), currentBscGeneration + 1, response->Record.GetGeneration());
+ } else {
+ UNIT_ASSERT_EQUAL_C(response->Record.GetGeneration(), 1, response->Record.GetGeneration());
+ }
UNIT_ASSERT_EQUAL(response->Record.GetStatus(), NKikimrScheme::TEvDataErasureInfoResponse::COMPLETED);
}
- Y_UNIT_TEST(SimpleDataErasureTestForTables) {
+ Y_UNIT_TEST(SimpleTestForTables) {
SimpleDataErasureTest({.Table = true, .Topic = false});
}
- Y_UNIT_TEST(SimpleDataErasureTestForTopic) {
+ Y_UNIT_TEST(SimpleTestForTopic) {
SimpleDataErasureTest({.Table = false, .Topic = true});
}
- Y_UNIT_TEST(SimpleDataErasureTestForAllSupportedObjects) {
+ Y_UNIT_TEST(SimpleTestForAllSupportedObjects) {
SimpleDataErasureTest({.Table = true, .Topic = true});
}
+ Y_UNIT_TEST(SchemeShardCounterDoesNotConsistWithBscCounter) {
+ SimpleDataErasureTest({.Table = true, .Topic = false}, /*currentBscGeneration*/ 47);
+ }
+
void DataErasureRun3Cycles(const TSchemeObject& createSchemeObject) {
TTestBasicRuntime runtime;
TTestEnv env(runtime);
@@ -164,15 +177,15 @@ Y_UNIT_TEST_SUITE(TestDataErasure) {
UNIT_ASSERT_EQUAL(response->Record.GetStatus(), NKikimrScheme::TEvDataErasureInfoResponse::COMPLETED);
}
- Y_UNIT_TEST(DataErasureRun3CyclesForTables) {
+ Y_UNIT_TEST(Run3CyclesForTables) {
DataErasureRun3Cycles({.Table = true, .Topic = false});
}
- Y_UNIT_TEST(DataErasureRun3CyclesForTopics) {
+ Y_UNIT_TEST(Run3CyclesForTopics) {
DataErasureRun3Cycles({.Table = false, .Topic = true});
}
- Y_UNIT_TEST(DataErasureRun3CyclesForAllSupportedObjects) {
+ Y_UNIT_TEST(Run3CyclesForAllSupportedObjects) {
DataErasureRun3Cycles({.Table = true, .Topic = true});
}
@@ -220,7 +233,7 @@ Y_UNIT_TEST_SUITE(TestDataErasure) {
UNIT_ASSERT_EQUAL(response->Record.GetStatus(), NKikimrScheme::TEvDataErasureInfoResponse::COMPLETED);
}
- Y_UNIT_TEST(DataErasureManualLaunch3Cycles) {
+ Y_UNIT_TEST(ManualLaunch3Cycles) {
TTestBasicRuntime runtime;
TTestEnv env(runtime);
@@ -271,6 +284,67 @@ Y_UNIT_TEST_SUITE(TestDataErasure) {
RunDataErasure(3);
}
+ Y_UNIT_TEST(ManualLaunch3CyclesWithNotConsistentCountersInSchemeShardAndBSC) {
+ TTestBasicRuntime runtime;
+ TTestEnv env(runtime);
+
+ runtime.SetLogPriority(NKikimrServices::TX_PROXY, NLog::PRI_DEBUG);
+ runtime.SetLogPriority(NKikimrServices::FLAT_TX_SCHEMESHARD, NActors::NLog::PRI_TRACE);
+
+ auto info = CreateTestTabletInfo(MakeBSControllerID(), TTabletTypes::BSController);
+ CreateTestBootstrapper(runtime, info, [](const TActorId &tablet, TTabletStorageInfo *info) -> IActor* {
+ return new TFakeBSController(tablet, info);
+ });
+
+ runtime.GetAppData().FeatureFlags.SetEnableDataErasure(true);
+ auto& dataErasureConfig = runtime.GetAppData().DataErasureConfig;
+ dataErasureConfig.SetDataErasureIntervalSeconds(0); // do not schedule
+ dataErasureConfig.SetBlobStorageControllerRequestIntervalSeconds(1);
+
+ auto sender = runtime.AllocateEdgeActor();
+ RebootTablet(runtime, TTestTxConfig::SchemeShard, sender);
+
+ ui64 txId = 100;
+
+ CreateTestExtSubdomain(runtime, env, &txId, "Database1");
+ CreateTestExtSubdomain(runtime, env, &txId, "Database2");
+
+ auto RunDataErasure = [&runtime] (ui32 expectedGeneration, ui32 requiredCountShredResponses) {
+ auto sender = runtime.AllocateEdgeActor();
+ {
+ auto request = MakeHolder<TEvSchemeShard::TEvDataErasureManualStartupRequest>();
+ runtime.SendToPipe(TTestTxConfig::SchemeShard, sender, request.Release(), 0, GetPipeConfigWithRetries());
+ }
+
+ TDispatchOptions options;
+ options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerShredResponse, requiredCountShredResponses));
+ runtime.DispatchEvents(options);
+
+ auto request = MakeHolder<TEvSchemeShard::TEvDataErasureInfoRequest>();
+ runtime.SendToPipe(TTestTxConfig::SchemeShard, sender, request.Release(), 0, GetPipeConfigWithRetries());
+
+ TAutoPtr<IEventHandle> handle;
+ auto response = runtime.GrabEdgeEventRethrow<TEvSchemeShard::TEvDataErasureInfoResponse>(handle);
+
+ UNIT_ASSERT_EQUAL_C(response->Record.GetGeneration(), expectedGeneration, response->Record.GetGeneration());
+ UNIT_ASSERT_EQUAL(response->Record.GetStatus(), NKikimrScheme::TEvDataErasureInfoResponse::COMPLETED);
+ };
+
+ RunDataErasure(1, 3);
+ // Change BSC counter value between data erasure iterations
+ {
+ auto request = MakeHolder<TEvBlobStorage::TEvControllerShredRequest>(50);
+ runtime.SendToPipe(MakeBSControllerID(), sender, request.Release(), 0, GetPipeConfigWithRetries());
+ }
+ RunDataErasure(51, 4);
+ // Change BSC counter value between data erasure iterations
+ {
+ auto request = MakeHolder<TEvBlobStorage::TEvControllerShredRequest>(100);
+ runtime.SendToPipe(MakeBSControllerID(), sender, request.Release(), 0, GetPipeConfigWithRetries());
+ }
+ RunDataErasure(101, 4);
+ }
+
Y_UNIT_TEST(DataErasureWithCopyTable) {
TTestBasicRuntime runtime;
TVector<TIntrusivePtr<NFake::TProxyDS>> dsProxies {
diff --git a/ydb/core/tx/schemeshard/ut_data_erasure/ya.make b/ydb/core/tx/schemeshard/ut_data_erasure/ya.make
index 1eee45270fb..26656e5d129 100644
--- a/ydb/core/tx/schemeshard/ut_data_erasure/ya.make
+++ b/ydb/core/tx/schemeshard/ut_data_erasure/ya.make
@@ -4,8 +4,6 @@ FORK_SUBTESTS()
SPLIT_FACTOR(10)
-TIMEOUT(20)
-
IF (SANITIZER_TYPE == "thread" OR WITH_VALGRIND)
SIZE(LARGE)
TAG(ya:fat)