about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Vadim Averin <avevad@ydb.tech> 2025-02-17 16:01:22 +0300
committer GitHub <noreply@github.com> 2025-02-17 16:01:22 +0300
commit 177935828b045fa6cfd678f79445e42f45b964cf (patch)
tree 7ac39d04966caef319fcc91e5283844a64438367
parent bf60151e8b40e4fd2b76789b4faaa9beaa6f034d (diff)
download ydb-177935828b045fa6cfd678f79445e42f45b964cf.tar.gz
Kill tablet on BS failures (#13766)
-rw-r--r-- ydb/core/tx/columnshard/blobs_action/bs/gc.cpp 14
-rw-r--r-- ydb/core/tx/columnshard/blobs_action/bs/gc.h 7
-rw-r--r-- ydb/core/tx/columnshard/blobs_action/bs/gc_actor.cpp 16
-rw-r--r-- ydb/core/tx/columnshard/blobs_action/bs/gc_actor.h 3
-rw-r--r-- ydb/core/tx/columnshard/hooks/abstract/abstract.h 6
-rw-r--r-- ydb/core/tx/columnshard/test_helper/controllers.h 19
-rw-r--r-- ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp 9
7 files changed, 66 insertions, 8 deletions
diff --git a/ydb/core/tx/columnshard/blobs_action/bs/gc.cpp b/ydb/core/tx/columnshard/blobs_action/bs/gc.cpp
index a72c6fb413d..0dd30b5be84 100644
--- a/ydb/core/tx/columnshard/blobs_action/bs/gc.cpp
+++ b/ydb/core/tx/columnshard/blobs_action/bs/gc.cpp
@@ -45,7 +45,9 @@ TGCTask::TGCTask(const TString& storageId, TGCListsByGroup&& listsByGroupId, con
}
void TGCTask::OnGCResult(TEvBlobStorage::TEvCollectGarbageResult::TPtr ev) {
- AFL_VERIFY(ev->Get()->Status == NKikimrProto::OK)("status", ev->Get()->Status)("details", ev->Get()->ToString())("action_id", GetActionGuid());
+ if (ev->Get()->Status != NKikimrProto::OK) {
+ Failures++;
+ }
TBlobAddress bAddress(ev->Cookie, ev->Get()->Channel);
auto itGroup = ListsByGroupId.find(bAddress);
AFL_VERIFY(itGroup != ListsByGroupId.end())("address", bAddress.DebugString());
@@ -59,8 +61,14 @@ static TAtomicCounter PerGenerationCounter = 1;
std::unique_ptr<TEvBlobStorage::TEvCollectGarbage> TGCTask::BuildRequest(const TBlobAddress& address) const {
auto it = ListsByGroupId.find(address);
AFL_VERIFY(it != ListsByGroupId.end());
- AFL_VERIFY(++it->second.RequestsCount < 10)("event", "build_gc_request")("address", address.DebugString())("current_gen", CurrentGen)("gen", CollectGenStepInFlight)
- ("count", it->second.RequestsCount);
+ if (++it->second.RequestsCount >= TGCLists::RequestsLimit) {
+ AFL_CRIT(NKikimrServices::TX_COLUMNSHARD_BLOBS_BS)
+ ("event", "build_gc_request")
+ ("address", address.DebugString())("current_gen", CurrentGen)
+ ("gen", CollectGenStepInFlight)
+ ("count", it->second.RequestsCount);
+ return nullptr;
+ }
AFL_DEBUG(NKikimrServices::TX_COLUMNSHARD_BLOBS_BS)("event", "build_gc_request")("address", address.DebugString())("current_gen", CurrentGen)("gen", CollectGenStepInFlight)
("count", it->second.RequestsCount);
auto result = std::make_unique<TEvBlobStorage::TEvCollectGarbage>(
diff --git a/ydb/core/tx/columnshard/blobs_action/bs/gc.h b/ydb/core/tx/columnshard/blobs_action/bs/gc.h
index 5471fc04c0d..70b872ddb63 100644
--- a/ydb/core/tx/columnshard/blobs_action/bs/gc.h
+++ b/ydb/core/tx/columnshard/blobs_action/bs/gc.h
@@ -17,6 +17,8 @@ public:
THashSet<TLogoBlobID> KeepList;
THashSet<TLogoBlobID> DontKeepList;
mutable ui32 RequestsCount = 0;
+
+ constexpr static ui32 RequestsLimit = 10;
};
using TGCListsByGroup = THashMap<TBlobAddress, TGCLists>;
private:
@@ -26,6 +28,7 @@ private:
const ui64 CurrentGen;
std::deque<TUnifiedBlobId> KeepsToErase;
std::shared_ptr<TBlobManager> Manager;
+ size_t Failures = 0;
protected:
virtual void RemoveBlobIdFromDB(const TTabletId tabletId, const TUnifiedBlobId& blobId, TBlobManagerDb& dbBlobs) override;
virtual void DoOnExecuteTxAfterCleaning(NColumnShard::TColumnShard& self, TBlobManagerDb& dbBlobs) override;
@@ -54,6 +57,10 @@ public:
return ListsByGroupId.empty();
}
+ bool HasFailures() const { // True once OnGCResult has counted at least one CollectGarbage result whose status was not OK.
+ return Failures != 0;
+ }
+
void OnGCResult(TEvBlobStorage::TEvCollectGarbageResult::TPtr ev);
std::unique_ptr<TEvBlobStorage::TEvCollectGarbage> BuildRequest(const TBlobAddress& address) const;
diff --git a/ydb/core/tx/columnshard/blobs_action/bs/gc_actor.cpp b/ydb/core/tx/columnshard/blobs_action/bs/gc_actor.cpp
index 2d3a60a439f..e779e95f27a 100644
--- a/ydb/core/tx/columnshard/blobs_action/bs/gc_actor.cpp
+++ b/ydb/core/tx/columnshard/blobs_action/bs/gc_actor.cpp
@@ -1,9 +1,11 @@
#include "gc_actor.h"
#include <ydb/core/tx/columnshard/columnshard_private_events.h>
+#include <ydb/core/tx/columnshard/hooks/abstract/abstract.h>
namespace NKikimr::NOlap::NBlobOperations::NBlobStorage {
void TGarbageCollectionActor::Handle(TEvBlobStorage::TEvCollectGarbageResult::TPtr& ev) {
+ NYDBTest::TControllers::GetColumnShardController()->OnCollectGarbageResult(ev);
ACFL_DEBUG("actor", "TEvCollectGarbageResult");
if (ev->Get()->Status == NKikimrProto::BLOCKED) {
auto g = PassAwayGuard();
@@ -14,7 +16,13 @@ void TGarbageCollectionActor::Handle(TEvBlobStorage::TEvCollectGarbageResult::TP
CheckFinished();
} else {
ACFL_ERROR()("event", "GC_ERROR")("details", ev->Get()->Print(true));
- SendToBSProxy(NActors::TActivationContext::AsActorContext(), ev->Cookie, GCTask->BuildRequest(TBlobAddress(ev->Cookie, ev->Get()->Channel)).release(), ev->Cookie);
+ auto request = GCTask->BuildRequest(TBlobAddress(ev->Cookie, ev->Get()->Channel));
+ if (request) {
+ SendToBSProxy(NActors::TActivationContext::AsActorContext(), ev->Cookie, request.release(), ev->Cookie);
+ } else {
+ GCTask->OnGCResult(ev);
+ CheckFinished();
+ }
}
}
@@ -22,7 +30,11 @@ void TGarbageCollectionActor::CheckFinished() {
if (SharedRemovingFinished && GCTask->IsFinished()) {
auto g = PassAwayGuard();
ACFL_DEBUG("actor", "TGarbageCollectionActor")("event", "finished");
- TActorContext::AsActorContext().Send(TabletActorId, std::make_unique<NColumnShard::TEvPrivate::TEvGarbageCollectionFinished>(GCTask));
+ if (GCTask->HasFailures()) {
+ Send(TabletActorId, new TEvents::TEvPoison);
+ } else {
+ TActorContext::AsActorContext().Send(TabletActorId, std::make_unique<NColumnShard::TEvPrivate::TEvGarbageCollectionFinished>(GCTask));
+ }
}
}
diff --git a/ydb/core/tx/columnshard/blobs_action/bs/gc_actor.h b/ydb/core/tx/columnshard/blobs_action/bs/gc_actor.h
index 006c4ceb4bf..4dee1e4f4e7 100644
--- a/ydb/core/tx/columnshard/blobs_action/bs/gc_actor.h
+++ b/ydb/core/tx/columnshard/blobs_action/bs/gc_actor.h
@@ -12,6 +12,7 @@ private:
using TBase = TSharedBlobsCollectionActor<TGarbageCollectionActor>;
const NActors::TActorId TabletActorId;
std::shared_ptr<TGCTask> GCTask;
+
void Handle(TEvBlobStorage::TEvCollectGarbageResult::TPtr& ev);
void CheckFinished();
@@ -41,7 +42,7 @@ public:
AFL_DEBUG(NKikimrServices::TX_COLUMNSHARD_BLOBS_BS)("actor", "TGarbageCollectionActor")("event", "starting")("action_id", GCTask->GetActionGuid());
for (auto&& i : GCTask->GetListsByGroupId()) {
auto request = GCTask->BuildRequest(i.first);
- AFL_VERIFY(request);
+ AFL_VERIFY(request); // Cannot fail on the first time
SendToBSProxy(ctx, i.first.GetGroupId(), request.release(), i.first.GetGroupId());
}
TBase::Bootstrap(ctx);
diff --git a/ydb/core/tx/columnshard/hooks/abstract/abstract.h b/ydb/core/tx/columnshard/hooks/abstract/abstract.h
index f3bab29413d..3c7d6077896 100644
--- a/ydb/core/tx/columnshard/hooks/abstract/abstract.h
+++ b/ydb/core/tx/columnshard/hooks/abstract/abstract.h
@@ -99,6 +99,8 @@ protected:
}
virtual void DoOnDataSharingStarted(const ui64 /*tabletId*/, const TString& /*sessionId*/) {
}
+ virtual void DoOnCollectGarbageResult(TEvBlobStorage::TEvCollectGarbageResult::TPtr& /*result*/) { // Overridable hook for CollectGarbage result events; the default implementation does nothing.
+ }
virtual TDuration DoGetUsedSnapshotLivetime(const TDuration defaultValue) const {
return defaultValue;
@@ -282,6 +284,10 @@ public:
DoOnAfterGCAction(shard, action);
}
+ void OnCollectGarbageResult(TEvBlobStorage::TEvCollectGarbageResult::TPtr& result) { // Forwards the result event to the DoOnCollectGarbageResult hook; it is passed by non-const reference, so an override may modify it.
+ DoOnCollectGarbageResult(result);
+ }
+
bool OnAfterFilterAssembling(const std::shared_ptr<arrow::RecordBatch>& batch) {
return DoOnAfterFilterAssembling(batch);
}
diff --git a/ydb/core/tx/columnshard/test_helper/controllers.h b/ydb/core/tx/columnshard/test_helper/controllers.h
index 281058322ac..97f0eb57174 100644
--- a/ydb/core/tx/columnshard/test_helper/controllers.h
+++ b/ydb/core/tx/columnshard/test_helper/controllers.h
@@ -2,6 +2,7 @@
#include <ydb/core/testlib/basics/runtime.h>
#include <ydb/core/tx/columnshard/hooks/testing/controller.h>
#include <ydb/core/tx/tiering/manager.h>
+#include <ydb/core/tx/columnshard/blobs_action/bs/address.h>
namespace NKikimr::NOlap {
@@ -72,4 +73,22 @@ public:
}
};
+class TFailingBSController: public NKikimr::NYDBTest::NColumnShard::TController { // Test controller that rewrites CollectGarbage results for a single blob address to ERROR.
+ void DoOnCollectGarbageResult(TEvBlobStorage::TEvCollectGarbageResult::TPtr& result) override {
+ NBlobOperations::NBlobStorage::TBlobAddress group(result->Cookie, result->Get()->Channel); // Address is built from the event cookie and channel.
+ if (!FailingGroup.has_value()) {
+ FailingGroup = group; // The first address observed becomes the one whose results are failed.
+ }
+ if (group == FailingGroup.value() && FailsCount < 15) { // NOTE(review): 15 exceeds TGCLists::RequestsLimit (10), presumably so the retry limit is reached — confirm.
+ Cerr << "Dropped EvCollectGarbageResult" << Endl; // NOTE(review): the event is not dropped; only its status is overwritten on the next line.
+ result->Get()->Status = NKikimrProto::ERROR;
+ FailsCount++;
+ }
+ }
+
+private:
+ std::optional<NBlobOperations::NBlobStorage::TBlobAddress> FailingGroup = std::nullopt; // Address whose results are forced to ERROR; unset until the first result arrives.
+ size_t FailsCount = 0; // Number of results rewritten to ERROR so far.
+};
+
}
diff --git a/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp b/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp
index 08a33d4901f..89f3210a66f 100644
--- a/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp
+++ b/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp
@@ -2421,9 +2421,10 @@ Y_UNIT_TEST_SUITE(TColumnShardTestReadWrite) {
}
}
+ template<typename Controller>
void TestCompactionGC() {
TTestBasicRuntime runtime;
- auto csDefaultControllerGuard = NKikimr::NYDBTest::TControllers::RegisterCSControllerGuard<TDefaultTestsController>();
+ auto csDefaultControllerGuard = NKikimr::NYDBTest::TControllers::RegisterCSControllerGuard<Controller>();
csDefaultControllerGuard->DisableBackground(NKikimr::NYDBTest::ICSController::EBackground::Indexation);
csDefaultControllerGuard->SetOverridePeriodicWakeupActivationPeriod(TDuration::Seconds(1));
csDefaultControllerGuard->SetOverrideBlobSplitSettings(NOlap::NSplitter::TSplitSettings());
@@ -2687,7 +2688,11 @@ Y_UNIT_TEST_SUITE(TColumnShardTestReadWrite) {
}
Y_UNIT_TEST(CompactionGC) {
- TestCompactionGC();
+ TestCompactionGC<TDefaultTestsController>();
+ }
+
+ Y_UNIT_TEST(CompactionGCFailingBs) { // Runs the shared TestCompactionGC scenario with TFailingBSController registered as the column shard controller.
+ TestCompactionGC<NOlap::TFailingBSController>();
}
Y_UNIT_TEST(PortionInfoSize) {