diff options
author | Sergey Belyakov <serg-belyakov@ydb.tech> | 2025-07-30 11:31:19 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-07-30 11:31:19 +0300 |
commit | 48b4a0295db7d06dd98d284d3fdc1fce50938512 (patch) | |
tree | ee35607d17e42617b3ee3cbcc696d8a1ba439bdf | |
parent | 53e0418cb3697140dceb9c0d0b1d7aa570988785 (diff) | |
download | ydb-48b4a0295db7d06dd98d284d3fdc1fce50938512.tar.gz |
More detailed metrics for DeepScrubbing and print reports in a single line (#21888)
-rw-r--r-- | ydb/core/blobstorage/ut_blobstorage/lib/env.h | 25 | ||||
-rw-r--r-- | ydb/core/blobstorage/ut_blobstorage/scrub.cpp | 34 | ||||
-rw-r--r-- | ydb/core/blobstorage/vdisk/common/vdisk_mongroups.h | 61 | ||||
-rw-r--r-- | ydb/core/blobstorage/vdisk/scrub/scrub_actor.cpp | 65 | ||||
-rw-r--r-- | ydb/core/blobstorage/vdisk/scrub/scrub_actor_impl.h | 2 |
5 files changed, 140 insertions, 47 deletions
diff --git a/ydb/core/blobstorage/ut_blobstorage/lib/env.h b/ydb/core/blobstorage/ut_blobstorage/lib/env.h index 04bf43c7890..6847c643e82 100644 --- a/ydb/core/blobstorage/ut_blobstorage/lib/env.h +++ b/ydb/core/blobstorage/ut_blobstorage/lib/env.h @@ -1051,6 +1051,31 @@ struct TEnvironmentSetup { return ctr; } + template <class TCallback> + ui64 AggregateVDiskCountersWithCallback(TString storagePool, ui32 nodesCount, ui32 groupSize, ui32 groupId, + const std::vector<ui32>& pdiskLayout, TCallback callback) { + ui64 ctr = 0; + + for (ui32 nodeId = 1; nodeId <= nodesCount; ++nodeId) { + auto* appData = Runtime->GetNode(nodeId)->AppData.get(); + for (ui32 i = 0; i < groupSize; ++i) { + TStringStream ss; + ss << LeftPad(i, 2, '0'); + TString orderNumber = ss.Str(); + ss.Clear(); + ss << LeftPad(pdiskLayout[i], 9, '0'); + TString pdisk = ss.Str(); + ctr += callback(GetServiceCounters(appData->Counters, "vdisks")-> + GetSubgroup("storagePool", storagePool)-> + GetSubgroup("group", std::to_string(groupId))-> + GetSubgroup("orderNumber", orderNumber)-> + GetSubgroup("pdisk", pdisk)-> + GetSubgroup("media", "rot")); + } + } + return ctr; + } + ui64 AggregateVDiskCounters(TString storagePool, ui32 nodesCount, ui32 groupSize, ui32 groupId, const std::vector<ui32>& pdiskLayout, TString subsystem, TString counter, bool derivative = false) { return AggregateVDiskCountersBase(storagePool, nodesCount, groupSize, groupId, pdiskLayout, diff --git a/ydb/core/blobstorage/ut_blobstorage/scrub.cpp b/ydb/core/blobstorage/ut_blobstorage/scrub.cpp index dff6342a047..ea08f6a45a4 100644 --- a/ydb/core/blobstorage/ut_blobstorage/scrub.cpp +++ b/ydb/core/blobstorage/ut_blobstorage/scrub.cpp @@ -433,6 +433,25 @@ Y_UNIT_TEST_SUITE(DeepScrubbing) { , PartCorruptionMask(partCorruptionMask) {} + struct TAggregateScrubMetrics { + TAggregateScrubMetrics(TString counterName, bool isHuge, TErasureType::EErasureSpecies erasure) + : CounterName(counterName) + , IsHuge(isHuge) + , Erasure(erasure) + {} + + ui64 operator()(TIntrusivePtr<NMonitoring::TDynamicCounters> counters) const { + return counters->GetSubgroup("subsystem", "deepScrubbing") + ->GetSubgroup("blobSize", IsHuge ? "huge" : "small") + ->GetSubgroup("erasure", TErasureType::ErasureSpeciesName(Erasure)) + ->GetCounter(CounterName, true)->Val(); + } + + TString CounterName; + bool IsHuge; + TErasureType::EErasureSpecies Erasure; + }; + void RunTest() { Initialize(); AllocateEdgeActor(true); @@ -506,19 +525,16 @@ Y_UNIT_TEST_SUITE(DeepScrubbing) { } } + bool isHuge = (BlobSize == EBlobSize::Val_HugeBlob); + std::vector<ui32> pdiskLayout = MakePDiskLayout(BaseConfig, groupInfo->GetTopology(), GroupId); ui64 blobsScrubbed = - Env->AggregateVDiskCounters(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(), - GroupId, pdiskLayout, "deepScrubbing", "SmallBlobsChecked", false) + - Env->AggregateVDiskCounters(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(), - GroupId, pdiskLayout, "deepScrubbing", "HugeBlobsChecked", false); - + Env->AggregateVDiskCountersWithCallback(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(), + GroupId, pdiskLayout, TAggregateScrubMetrics("BlobsChecked", isHuge, Erasure.GetErasure())); ui64 dataIssues = - Env->AggregateVDiskCounters(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(), - GroupId, pdiskLayout, "deepScrubbing", "DataIssuesSmallBlobs", false) + - Env->AggregateVDiskCounters(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(), - GroupId, pdiskLayout, "deepScrubbing", "DataIssuesHugeBlobs", false); + Env->AggregateVDiskCountersWithCallback(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(), + GroupId, pdiskLayout, TAggregateScrubMetrics("DataIssues", isHuge, Erasure.GetErasure())); UNIT_ASSERT_VALUES_UNEQUAL_C(blobsScrubbed, 0, makePrefix()); UNIT_ASSERT_VALUES_UNEQUAL_C(dataIssues, 0, makePrefix() diff --git a/ydb/core/blobstorage/vdisk/common/vdisk_mongroups.h b/ydb/core/blobstorage/vdisk/common/vdisk_mongroups.h index 01f80990785..72b2495789c 100644 --- a/ydb/core/blobstorage/vdisk/common/vdisk_mongroups.h +++ b/ydb/core/blobstorage/vdisk/common/vdisk_mongroups.h @@ -933,33 +933,60 @@ public: COUNTER_DEF(BarriersBalance); }; + /////////////////////////////////////////////////////////////////////////////////// + // TDeepScrubbingGroup + /////////////////////////////////////////////////////////////////////////////////// class TDeepScrubbingGroup : public TBase { public: GROUP_CONSTRUCTOR(TDeepScrubbingGroup) { - COUNTER_INIT(SmallBlobsChecked, true); - COUNTER_INIT(HugeBlobsChecked, true); - COUNTER_INIT(CheckIntegritySuccesses, false); - COUNTER_INIT(CheckIntegrityErrors, false); - - COUNTER_INIT(PlacementIssuesSmallBlobs, false); - COUNTER_INIT(DataIssuesSmallBlobs, false); - - COUNTER_INIT(PlacementIssuesHugeBlobs, false); - COUNTER_INIT(DataIssuesHugeBlobs, false); + COUNTER_INIT(BlobsChecked, true); + COUNTER_INIT(CheckIntegritySuccesses, true); + COUNTER_INIT(CheckIntegrityErrors, true); + COUNTER_INIT(UnknownDataStatus, true); + COUNTER_INIT(UnknownPlacementStatus, true); + COUNTER_INIT(DataIssues, true); + COUNTER_INIT(PlacementIssues, true); } - COUNTER_DEF(SmallBlobsChecked); - COUNTER_DEF(HugeBlobsChecked); - + COUNTER_DEF(BlobsChecked); COUNTER_DEF(CheckIntegritySuccesses); COUNTER_DEF(CheckIntegrityErrors); + COUNTER_DEF(UnknownDataStatus); + COUNTER_DEF(UnknownPlacementStatus); + COUNTER_DEF(DataIssues); + COUNTER_DEF(PlacementIssues); + }; - COUNTER_DEF(PlacementIssuesSmallBlobs); - COUNTER_DEF(DataIssuesSmallBlobs); + class TDeepScrubbingSubgroups { + public: + TDeepScrubbingSubgroups(TIntrusivePtr<NMonitoring::TDynamicCounters> counters) { + for (bool isHuge : {true, false}) { + for (TErasureType::EErasureSpecies erasure : + {TErasureType::ErasureNone, TErasureType::Erasure4Plus2Block, + TErasureType::ErasureMirror3of4, TErasureType::ErasureMirror3dc}) { + ::NMonitoring::TDynamicCounterPtr subgroup = counters + ->GetSubgroup("blobSize", isHuge ? "huge" : "small") + ->GetSubgroup("erasure", TErasureType::ErasureSpeciesName(erasure)); + Subgroups.insert({GetKey(isHuge, erasure), TDeepScrubbingGroup(subgroup)}); + } + } + } + + TDeepScrubbingGroup* GetCounters(bool isHuge, TErasureType::EErasureSpecies erasure) { + auto it = Subgroups.find(GetKey(isHuge, erasure)); + if (it == Subgroups.end()) { + return nullptr; + } + return &it->second; + } - COUNTER_DEF(PlacementIssuesHugeBlobs); - COUNTER_DEF(DataIssuesHugeBlobs); + private: + std::unordered_map<ui64, TDeepScrubbingGroup> Subgroups; + + ui64 GetKey(bool isHuge, TErasureType::EErasureSpecies erasure) { + return ((ui64)isHuge << 32) + (ui64)erasure; + } }; /////////////////////////////////////////////////////////////////////////////////// diff --git a/ydb/core/blobstorage/vdisk/scrub/scrub_actor.cpp b/ydb/core/blobstorage/vdisk/scrub/scrub_actor.cpp index a0500be9aa2..b437e823847 100644 --- a/ydb/core/blobstorage/vdisk/scrub/scrub_actor.cpp +++ b/ydb/core/blobstorage/vdisk/scrub/scrub_actor.cpp @@ -14,7 +14,7 @@ namespace NKikimr { , LogPrefix(VCtx->VDiskLogPrefix) , Counters(VCtx->VDiskCounters->GetSubgroup("subsystem", "scrub")) , MonGroup(Counters) - , DeepScrubbingGroup(VCtx->VDiskCounters->GetSubgroup("subsystem", "deepScrubbing")) + , DeepScrubbingSubgroups(VCtx->VDiskCounters->GetSubgroup("subsystem", "deepScrubbing")) , Arena(&TScrubCoroImpl::AllocateRopeArenaChunk) , ScrubEntrypoint(std::move(scrubEntrypoint)) , ScrubEntrypointLsn(scrubEntrypointLsn) @@ -239,38 +239,63 @@ namespace NKikimr { void TScrubCoroImpl::CheckIntegrity(const TLogoBlobID& blobId, bool isHuge) { SendToBSProxy(SelfActorId, Info->GroupID, new TEvBlobStorage::TEvCheckIntegrity(blobId, TInstant::Max(), - NKikimrBlobStorage::EGetHandleClass::LowRead)); + NKikimrBlobStorage::EGetHandleClass::LowRead, true)); auto res = WaitForPDiskEvent<TEvBlobStorage::TEvCheckIntegrityResult>(); - if (isHuge) { - ++DeepScrubbingGroup.HugeBlobsChecked(); - } else { - ++DeepScrubbingGroup.SmallBlobsChecked(); + TErasureType::EErasureSpecies erasure = Info->Type.GetErasure(); + + NMonGroup::TDeepScrubbingGroup* counters = DeepScrubbingSubgroups.GetCounters(isHuge, erasure); + if (counters) { + ++counters->BlobsChecked(); } if (res->Get()->Status != NKikimrProto::OK) { STLOGX(GetActorContext(), PRI_WARN, BS_VDISK_SCRUB, VDS97, VDISKP(LogPrefix, "TEvCheckIntegrity request failed"), (BlobId, blobId), (ErrorReason, res->Get()->ErrorReason)); - ++DeepScrubbingGroup.CheckIntegrityErrors(); + if (counters) { + ++counters->CheckIntegrityErrors(); + } } else { - ++DeepScrubbingGroup.CheckIntegritySuccesses(); - if (res->Get()->PlacementStatus != TEvBlobStorage::TEvCheckIntegrityResult::PS_OK) { + if (counters) { + ++counters->CheckIntegritySuccesses(); + } + + switch (res->Get()->PlacementStatus) { + case TEvBlobStorage::TEvCheckIntegrityResult::PS_UNKNOWN: + case TEvBlobStorage::TEvCheckIntegrityResult::PS_REPLICATION_IN_PROGRESS: + if (counters) { + ++counters->UnknownPlacementStatus(); + } + break; + case TEvBlobStorage::TEvCheckIntegrityResult::PS_BLOB_IS_LOST: + case TEvBlobStorage::TEvCheckIntegrityResult::PS_BLOB_IS_RECOVERABLE: STLOGX(GetActorContext(), PRI_CRIT, BS_VDISK_SCRUB, VDS98, VDISKP(LogPrefix, "TEvCheckIntegrity discovered placement issue"), - (BlobId, blobId), (CheckIntegrityResult, res->Get()->ToString())); - if (isHuge) { - ++DeepScrubbingGroup.PlacementIssuesHugeBlobs(); - } else { - ++DeepScrubbingGroup.PlacementIssuesSmallBlobs(); + (BlobId, blobId), (Erasure, TErasureType::ErasureSpeciesName(erasure)), (CheckIntegrityResult, res->Get()->ToString())); + if (counters) { + ++counters->PlacementIssues(); } + break; + case TEvBlobStorage::TEvCheckIntegrityResult::PS_OK: + default: + break; // nothing to do } - if (res->Get()->DataStatus != TEvBlobStorage::TEvCheckIntegrityResult::DS_OK) { + + switch (res->Get()->DataStatus) { + case TEvBlobStorage::TEvCheckIntegrityResult::DS_UNKNOWN: + if (counters) { + ++counters->UnknownDataStatus(); + } + break; + case TEvBlobStorage::TEvCheckIntegrityResult::DS_ERROR: STLOGX(GetActorContext(), PRI_CRIT, BS_VDISK_SCRUB, VDS99, VDISKP(LogPrefix, "TEvCheckIntegrity discovered data issue"), - (BlobId, blobId), (CheckIntegrityResult, res->Get()->ToString())); - if (isHuge) { - ++DeepScrubbingGroup.DataIssuesHugeBlobs(); - } else { - ++DeepScrubbingGroup.DataIssuesSmallBlobs(); + (BlobId, blobId), (Erasure, TErasureType::ErasureSpeciesName(erasure)), (CheckIntegrityResult, res->Get()->ToString())); + if (counters) { + ++counters->DataIssues(); } + break; + case TEvBlobStorage::TEvCheckIntegrityResult::DS_OK: + default: + break; // nothing to do } } } diff --git a/ydb/core/blobstorage/vdisk/scrub/scrub_actor_impl.h b/ydb/core/blobstorage/vdisk/scrub/scrub_actor_impl.h index 5a74a2505c1..cbc9baeeadd 100644 --- a/ydb/core/blobstorage/vdisk/scrub/scrub_actor_impl.h +++ b/ydb/core/blobstorage/vdisk/scrub/scrub_actor_impl.h @@ -25,7 +25,7 @@ namespace NKikimr { ::NMonitoring::TDynamicCounterPtr Counters; NMonGroup::TScrubGroup MonGroup; - NMonGroup::TDeepScrubbingGroup DeepScrubbingGroup; + NMonGroup::TDeepScrubbingSubgroups DeepScrubbingSubgroups; TRopeArena Arena; |