diff options
author | avpershin <[email protected]> | 2025-10-13 14:23:50 +0300 |
---|---|---|
committer | GitHub <[email protected]> | 2025-10-13 14:23:50 +0300 |
commit | c1a50c99ee4f0c0327f3381feba5730c7b81df05 (patch) | |
tree | eacd9b176e71da77f63f6706013be406a8cd225f | |
parent | b050143e845cbde7257bfc320c805fa8772b758f (diff) |
Refactor the TopUsage container (#26747)
Changelog entry
* Reduced the CPU usage when handling the periodic partition stats updates
Changelog category
* Improvement
Description for reviewers
* Refactored TTopUsage to handle bucket boundaries in a declarative way
* Added unit tests for the TTopCpuUsage class
* Fixed TTableAggregatedStats::UpdateShardStats() to copy TopCpuUsage 2 times instead of 3
-rw-r--r-- | ydb/core/tx/schemeshard/schemeshard_info_types.cpp | 8 | ||||
-rw-r--r-- | ydb/core/tx/schemeshard/schemeshard_info_types.h | 144 | ||||
-rw-r--r-- | ydb/core/tx/schemeshard/ut_partition_stats/ut_top_cpu_usage.cpp | 184 | ||||
-rw-r--r-- | ydb/core/tx/schemeshard/ut_partition_stats/ya.make | 21 | ||||
-rw-r--r-- | ydb/core/tx/schemeshard/ya.make | 1 |
5 files changed, 314 insertions, 44 deletions
diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.cpp b/ydb/core/tx/schemeshard/schemeshard_info_types.cpp index 7b29ee059fa..acdc4924587 100644 --- a/ydb/core/tx/schemeshard/schemeshard_info_types.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_info_types.cpp @@ -1838,9 +1838,13 @@ void TTableAggregatedStats::UpdateShardStats(TShardIdx datashardIdx, const TPart Aggregated.LocksWholeShard += (newStats.LocksWholeShard - oldStats.LocksWholeShard); Aggregated.LocksBroken += (newStats.LocksBroken - oldStats.LocksBroken); - auto topUsage = oldStats.TopUsage.Update(newStats.TopUsage); + // NOTE: Updating the CPU usage buckets is essentially taking the maximum + // of the latest update time for each bucket. Thus, updating new -> old + // and old -> new are equivalent: the result is the same. + const auto oldTopCpuUsage = oldStats.TopCpuUsage; oldStats = newStats; - oldStats.TopUsage = std::move(topUsage); + oldStats.TopCpuUsage.Update(oldTopCpuUsage); // The left is new stats now! + PartitionStatsUpdated++; // Rescan stats for aggregations only once in a while diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.h b/ydb/core/tx/schemeshard/schemeshard_info_types.h index 3a4bb911d44..8cc1cd3bf08 100644 --- a/ydb/core/tx/schemeshard/schemeshard_info_types.h +++ b/ydb/core/tx/schemeshard/schemeshard_info_types.h @@ -215,21 +215,104 @@ struct TPartitionConfigMerger { }; struct TPartitionStats { - // Latest timestamps when CPU usage exceeded 2%, 5%, 10%, 20%, 30% - struct TTopUsage { - TInstant Last2PercentLoad; - TInstant Last5PercentLoad; - TInstant Last10PercentLoad; - TInstant Last20PercentLoad; - TInstant Last30PercentLoad; - - const TTopUsage& Update(const TTopUsage& usage) { - Last2PercentLoad = std::max(Last2PercentLoad, usage.Last2PercentLoad); - Last5PercentLoad = std::max(Last5PercentLoad, usage.Last5PercentLoad); - Last10PercentLoad = std::max(Last10PercentLoad, usage.Last10PercentLoad); - Last20PercentLoad = std::max(Last20PercentLoad, 
usage.Last20PercentLoad); - Last30PercentLoad = std::max(Last30PercentLoad, usage.Last30PercentLoad); - return *this; + /** + * The container for the latest time stamps when the CPU usage exceeded + * specific thresholds: 2%, 5%, 10%, 20%, 30%. + */ + struct TTopCpuUsage { + /** + * Describes the boundaries for a CPU usage bucket. + */ + struct TBucket { + /** + * The low boundary for this bucket. + * + * @note If the current CPU usage exceeds this value, this bucket is updated. + */ + const ui32 LowBoundary; + + /** + * The effective CPU usage value for this bucket. + * + * @note If this bucket falls within the given time period, + * this value is used as the assumed CPU usage percentage. + */ + const ui32 EffectiveValue; + }; + + /** + * The boundaries for all CPU usage buckets tracked by this class. + * + * @warning This list must be sorted by the threshold value (in the ascending order). + */ + static constexpr std::array<TBucket, 5> Buckets = {{ + {2, 5}, // >= 2% --> 5% CPU usage + {5, 10}, // >= 5% --> 10% CPU usage + {10, 20}, // >= 10% --> 20% CPU usage + {20, 30}, // >= 20% --> 30% CPU usage + {30, 40}, // >= 30% --> 40% CPU usage + }}; + + /** + * The time when each usage bucket was updated. + */ + std::array<TInstant, Buckets.size()> BucketUpdateTimes; + + /** + * Update the CPU usage data using values from another container. + * + * @param[in] usage The container to update the usage data from + */ + void Update(const TTopCpuUsage& usage) { + // Keep only the latest time for each bucket + for (ui64 i = 0; i < Buckets.size(); ++i) { + BucketUpdateTimes[i] = std::max(BucketUpdateTimes[i], usage.BucketUpdateTimes[i]); + } + } + + /** + * Update the historical CPU usage. 
+ * + * @param[in] rawCpuUsage The current CPU usage + * @param[in] now The current time + */ + void UpdateCpuUsage(ui64 rawCpuUsage, TInstant now) { + ui32 percent = static_cast<ui32>(rawCpuUsage * 0.000001 * 100); + + // Update all buckets, which have low boundaries below the given CPU usage + for (ui64 i = 0; i < Buckets.size(); ++i) { + if (percent < Buckets[i].LowBoundary) { + return; + } + + BucketUpdateTimes[i] = now; + } + } + + /** + * Get the peak CPU usage percentage that has been observed since the given time. + * + * @note This function does not return the actual peak CPU usage value. + * The return value is one of the preset thresholds, which this class + * tracks (2%, 5%, 10%, 20%, 30% and 40%). + * + * @todo Fix the case when stats were not collected yet + * + * @param[in] since The time from which to calculate the peak CPU usage + * + * @return The peak CPU usage (as a percentage) since the given time + */ + ui32 GetLatestMaxCpuUsagePercent(TInstant since) const { + // Find the highest bucket (from the end of the list), + // which was updated after the given time + for (i64 i = Buckets.size() - 1; i >= 0; --i) { + if (BucketUpdateTimes[i] > since) { + return Buckets[i].EffectiveValue; + } + } + + // No bucket was found, return at least some minimum CPU usage percentage + return 2; } }; @@ -292,42 +375,19 @@ struct TPartitionStats { // Tablet actor started at TInstant StartTime; - TTopUsage TopUsage; + TTopCpuUsage TopCpuUsage; void SetCurrentRawCpuUsage(ui64 rawCpuUsage, TInstant now) { CPU = rawCpuUsage; - float percent = rawCpuUsage * 0.000001 * 100; - if (percent >= 2) - TopUsage.Last2PercentLoad = now; - if (percent >= 5) - TopUsage.Last5PercentLoad = now; - if (percent >= 10) - TopUsage.Last10PercentLoad = now; - if (percent >= 20) - TopUsage.Last20PercentLoad = now; - if (percent >= 30) - TopUsage.Last30PercentLoad = now; + TopCpuUsage.UpdateCpuUsage(rawCpuUsage, now); } ui64 GetCurrentRawCpuUsage() const { return CPU; } - float 
GetLatestMaxCpuUsagePercent(TInstant since) const { - // TODO: fix the case when stats were not collected yet - - if (TopUsage.Last30PercentLoad > since) - return 40; - if (TopUsage.Last20PercentLoad > since) - return 30; - if (TopUsage.Last10PercentLoad > since) - return 20; - if (TopUsage.Last5PercentLoad > since) - return 10; - if (TopUsage.Last2PercentLoad > since) - return 5; - - return 2; + ui32 GetLatestMaxCpuUsagePercent(TInstant since) const { + return TopCpuUsage.GetLatestMaxCpuUsagePercent(since); } private: diff --git a/ydb/core/tx/schemeshard/ut_partition_stats/ut_top_cpu_usage.cpp b/ydb/core/tx/schemeshard/ut_partition_stats/ut_top_cpu_usage.cpp new file mode 100644 index 00000000000..2928f01f9d3 --- /dev/null +++ b/ydb/core/tx/schemeshard/ut_partition_stats/ut_top_cpu_usage.cpp @@ -0,0 +1,184 @@ +#include <ydb/core/tx/schemeshard/schemeshard_info_types.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NKikimr; +using namespace NSchemeShard; + +namespace { + +/** + * Create the TPartitionStats::TTopCpuUsage container with all times set to unique values. + * + * @return The CPU usage container with all times set to unique values. + */ +TPartitionStats::TTopCpuUsage MakeTopCpuUsageWithUniqueTimes() { + TPartitionStats::TTopCpuUsage top_cpu_usage; + + for (ui64 i = 0; i < top_cpu_usage.BucketUpdateTimes.size(); ++i) { + // Set older time stamps to higher CPU usage values to make it easier + // to verify the time-based lookup requests + top_cpu_usage.BucketUpdateTimes[i] = TInstant::MicroSeconds( + 1000 + top_cpu_usage.BucketUpdateTimes.size() - i + ); + } + + return top_cpu_usage; +} + +/** + * Convert the TPartitionStats::TTopCpuUsage container to a pretty-printed string. 
+ * + * @param[in] top_cpu_usage The container to convert to a string + * + * @return The corresponding pretty-printed string + */ +TString PrintTopCpuUsage(const TPartitionStats::TTopCpuUsage& top_cpu_usage) { + auto builder = TStringBuilder() << "\nTopCpuUsage = [\n"; + + for (ui64 i = 0; i < top_cpu_usage.Buckets.size(); ++i) { + builder << Sprintf( + " %2u%% -> %u\n", + top_cpu_usage.Buckets[i].LowBoundary, + top_cpu_usage.BucketUpdateTimes[i] + ); + } + + return builder << "]\n"; +} + +} // namespace <anonymous> + +/** + * Unit tests for the TPartitionStats::TTopCpuUsage class. + */ +Y_UNIT_TEST_SUITE(TSchemeShardPartitionStatsTopCpuUsageTest) { + /** + * Verify that TTopCpuUsage::Update() works correctly. + */ + Y_UNIT_TEST(Update) { + TPartitionStats::TTopCpuUsage top_cpu_usage1 = MakeTopCpuUsageWithUniqueTimes(); + + // Unset (== 0) and smaller values should be ignored, larger values should be kept + TPartitionStats::TTopCpuUsage top_cpu_usage2; + + top_cpu_usage2.BucketUpdateTimes[1] = TInstant::MicroSeconds(2001); + top_cpu_usage2.BucketUpdateTimes[2] = TInstant::MicroSeconds(202); + top_cpu_usage2.BucketUpdateTimes[3] = TInstant::MicroSeconds(2003); + + top_cpu_usage1.Update(top_cpu_usage2); + + std::array<TInstant, 5> expectedTimes = {{ + TInstant::MicroSeconds(1005), + TInstant::MicroSeconds(2001), + TInstant::MicroSeconds(1003), + TInstant::MicroSeconds(2003), + TInstant::MicroSeconds(1001), + }}; + + UNIT_ASSERT_EQUAL_C( + top_cpu_usage1.BucketUpdateTimes, + expectedTimes, + PrintTopCpuUsage(top_cpu_usage1) + ); + } + + /** + * Verify that TTopCpuUsage::UpdateCpuUsage() does not update any buckets, + * if the given CPU usage percentage does not satisfy any CPU usage threshold. 
+ */ + Y_UNIT_TEST(UpdateCpuUsage_NoBuckets) { + TPartitionStats::TTopCpuUsage top_cpu_usage = MakeTopCpuUsageWithUniqueTimes(); + top_cpu_usage.UpdateCpuUsage(10000 /* 1% */, TInstant::MicroSeconds(123456)); + + std::array<TInstant, 5> expectedTimes = {{ + TInstant::MicroSeconds(1005), + TInstant::MicroSeconds(1004), + TInstant::MicroSeconds(1003), + TInstant::MicroSeconds(1002), + TInstant::MicroSeconds(1001), + }}; + + UNIT_ASSERT_EQUAL_C( + top_cpu_usage.BucketUpdateTimes, + expectedTimes, + PrintTopCpuUsage(top_cpu_usage) + ); + } + + /** + * Verify that TTopCpuUsage::UpdateCpuUsage() updates the correct buckets, + * if the given CPU usage percentage satisfies only some of the CPU usage thresholds. + */ + Y_UNIT_TEST(UpdateCpuUsage_SomeBuckets) { + TPartitionStats::TTopCpuUsage top_cpu_usage(MakeTopCpuUsageWithUniqueTimes()); + top_cpu_usage.UpdateCpuUsage(150000 /* 15% */, TInstant::MicroSeconds(123456)); + + std::array<TInstant, 5> expectedTimes = {{ + TInstant::MicroSeconds(123456), + TInstant::MicroSeconds(123456), + TInstant::MicroSeconds(123456), + TInstant::MicroSeconds(1002), + TInstant::MicroSeconds(1001), + }}; + + UNIT_ASSERT_EQUAL_C( + top_cpu_usage.BucketUpdateTimes, + expectedTimes, + PrintTopCpuUsage(top_cpu_usage) + ); + } + + /** + * Verify that TTopCpuUsage::UpdateCpuUsage() updates all buckets, + * if the given CPU usage percentage satisfies all CPU usage thresholds. 
+ */ + Y_UNIT_TEST(UpdateCpuUsage_AllBuckets) { + TPartitionStats::TTopCpuUsage top_cpu_usage = MakeTopCpuUsageWithUniqueTimes(); + top_cpu_usage.UpdateCpuUsage(310000 /* 31% */, TInstant::MicroSeconds(123456)); + + std::array<TInstant, 5> expectedTimes = {{ + TInstant::MicroSeconds(123456), + TInstant::MicroSeconds(123456), + TInstant::MicroSeconds(123456), + TInstant::MicroSeconds(123456), + TInstant::MicroSeconds(123456), + }}; + + UNIT_ASSERT_EQUAL_C( + top_cpu_usage.BucketUpdateTimes, + expectedTimes, + PrintTopCpuUsage(top_cpu_usage) + ); + } + + /** + * Verify that TTopCpuUsage::GetLatestMaxCpuUsagePercent() works correctly + * for all threshold values. + */ + Y_UNIT_TEST(GetLatestMaxCpuUsagePercent) { + TPartitionStats::TTopCpuUsage top_cpu_usage = MakeTopCpuUsageWithUniqueTimes(); + + for (const auto [since, expectedCpuUsage] : std::array<std::pair<ui64, ui32>, 6>{ + std::make_pair(1005, 2), // The default value - no bucket for this time stamp + std::make_pair(1000, 40), + std::make_pair(1001, 30), + std::make_pair(1002, 20), + std::make_pair(1003, 10), + std::make_pair(1004, 5), + }) { + auto cpuUsage = top_cpu_usage.GetLatestMaxCpuUsagePercent(TInstant::MicroSeconds(since)); + + UNIT_ASSERT_EQUAL_C( + cpuUsage, + expectedCpuUsage, + Sprintf( + "Received %u%% for the time stamp %u, expected %u%%", + cpuUsage, + since, + expectedCpuUsage + ) + ); + } + } +} diff --git a/ydb/core/tx/schemeshard/ut_partition_stats/ya.make b/ydb/core/tx/schemeshard/ut_partition_stats/ya.make new file mode 100644 index 00000000000..b8451019536 --- /dev/null +++ b/ydb/core/tx/schemeshard/ut_partition_stats/ya.make @@ -0,0 +1,21 @@ +UNITTEST_FOR(ydb/core/tx/schemeshard) + +FORK_SUBTESTS() + +SPLIT_FACTOR(60) + +SIZE(SMALL) + +SRCS( + ut_top_cpu_usage.cpp +) + +PEERDIR( + library/cpp/testing/unittest + ydb/core/testlib/pg + yql/essentials/public/udf/service/exception_policy +) + +YQL_LAST_ABI_VERSION() + +END() diff --git a/ydb/core/tx/schemeshard/ya.make 
b/ydb/core/tx/schemeshard/ya.make index 5fad8289552..000b5d311a0 100644 --- a/ydb/core/tx/schemeshard/ya.make +++ b/ydb/core/tx/schemeshard/ya.make @@ -37,6 +37,7 @@ RECURSE_FOR_TESTS( ut_move_reboots ut_olap ut_olap_reboots + ut_partition_stats ut_pq_reboots ut_reboots ut_replication |