summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoravpershin <[email protected]>2025-10-13 14:23:50 +0300
committerGitHub <[email protected]>2025-10-13 14:23:50 +0300
commitc1a50c99ee4f0c0327f3381feba5730c7b81df05 (patch)
treeeacd9b176e71da77f63f6706013be406a8cd225f
parentb050143e845cbde7257bfc320c805fa8772b758f (diff)
Refactor the TopUsage container (#26747)
Changelog entry * Reduced the CPU usage when handling the periodic partition stats updates Changelog category * Improvement Description for reviewers * Refactored TTopUsage to handle bucket boundaries in a declarative way * Added unit tests for the TTopCpuUsage class * Fixed TTableAggregatedStats::UpdateShardStats() to copy TopCpuUsage 2 times instead of 3
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_info_types.cpp8
-rw-r--r--ydb/core/tx/schemeshard/schemeshard_info_types.h144
-rw-r--r--ydb/core/tx/schemeshard/ut_partition_stats/ut_top_cpu_usage.cpp184
-rw-r--r--ydb/core/tx/schemeshard/ut_partition_stats/ya.make21
-rw-r--r--ydb/core/tx/schemeshard/ya.make1
5 files changed, 314 insertions, 44 deletions
diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.cpp b/ydb/core/tx/schemeshard/schemeshard_info_types.cpp
index 7b29ee059fa..acdc4924587 100644
--- a/ydb/core/tx/schemeshard/schemeshard_info_types.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard_info_types.cpp
@@ -1838,9 +1838,13 @@ void TTableAggregatedStats::UpdateShardStats(TShardIdx datashardIdx, const TPart
Aggregated.LocksWholeShard += (newStats.LocksWholeShard - oldStats.LocksWholeShard);
Aggregated.LocksBroken += (newStats.LocksBroken - oldStats.LocksBroken);
- auto topUsage = oldStats.TopUsage.Update(newStats.TopUsage);
+ // NOTE: Updating the CPU usage buckets is essentially taking the maximum
+ // of the latest update time for each bucket. Thus, updating new -> old
+ // and old -> new are equivalent: the result is the same.
+ const auto oldTopCpuUsage = oldStats.TopCpuUsage;
oldStats = newStats;
- oldStats.TopUsage = std::move(topUsage);
+ oldStats.TopCpuUsage.Update(oldTopCpuUsage); // The left is new stats now!
+
PartitionStatsUpdated++;
// Rescan stats for aggregations only once in a while
diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.h b/ydb/core/tx/schemeshard/schemeshard_info_types.h
index 3a4bb911d44..8cc1cd3bf08 100644
--- a/ydb/core/tx/schemeshard/schemeshard_info_types.h
+++ b/ydb/core/tx/schemeshard/schemeshard_info_types.h
@@ -215,21 +215,104 @@ struct TPartitionConfigMerger {
};
struct TPartitionStats {
- // Latest timestamps when CPU usage exceeded 2%, 5%, 10%, 20%, 30%
- struct TTopUsage {
- TInstant Last2PercentLoad;
- TInstant Last5PercentLoad;
- TInstant Last10PercentLoad;
- TInstant Last20PercentLoad;
- TInstant Last30PercentLoad;
-
- const TTopUsage& Update(const TTopUsage& usage) {
- Last2PercentLoad = std::max(Last2PercentLoad, usage.Last2PercentLoad);
- Last5PercentLoad = std::max(Last5PercentLoad, usage.Last5PercentLoad);
- Last10PercentLoad = std::max(Last10PercentLoad, usage.Last10PercentLoad);
- Last20PercentLoad = std::max(Last20PercentLoad, usage.Last20PercentLoad);
- Last30PercentLoad = std::max(Last30PercentLoad, usage.Last30PercentLoad);
- return *this;
+ /**
+ * The container for the latest time stamps when the CPU usage exceeded
+ * specific thresholds: 2%, 5%, 10%, 20%, 30%.
+ */
+ struct TTopCpuUsage {
+ /**
+ * Describes a single CPU usage bucket: the threshold that triggers it
+ * and the CPU usage percentage it stands for when reported.
+ */
+ struct TBucket {
+ /**
+ * The low boundary for this bucket (a CPU usage percentage).
+ *
+ * @note UpdateCpuUsage() refreshes this bucket when the current CPU usage
+ * is greater than or equal to this value.
+ */
+ const ui32 LowBoundary;
+
+ /**
+ * The effective CPU usage value for this bucket.
+ *
+ * @note GetLatestMaxCpuUsagePercent() returns this value when this is
+ * the highest bucket updated after the requested time.
+ */
+ const ui32 EffectiveValue;
+ };
+
+ /**
+ * The boundaries for all CPU usage buckets tracked by this class.
+ *
+ * @warning This list must be sorted by the threshold value (in the ascending order).
+ */
+ static constexpr std::array<TBucket, 5> Buckets = {{
+ {2, 5}, // >= 2% --> 5% CPU usage
+ {5, 10}, // >= 5% --> 10% CPU usage
+ {10, 20}, // >= 10% --> 20% CPU usage
+ {20, 30}, // >= 20% --> 30% CPU usage
+ {30, 40}, // >= 30% --> 40% CPU usage
+ }};
+
+ /**
+ * The time when each usage bucket was updated.
+ */
+ std::array<TInstant, Buckets.size()> BucketUpdateTimes;
+
+        /**
+         * Merge the bucket update times from another container into this one.
+         *
+         * For every bucket the later of the two time stamps is kept.
+         *
+         * @param[in] usage The container to merge the usage data from
+         */
+        void Update(const TTopCpuUsage& usage) {
+            for (ui64 bucketIdx = 0; bucketIdx < Buckets.size(); ++bucketIdx) {
+                TInstant& ownTime = BucketUpdateTimes[bucketIdx];
+                const TInstant& otherTime = usage.BucketUpdateTimes[bucketIdx];
+
+                // Overwrite only when the other container holds a strictly later time
+                if (ownTime < otherTime) {
+                    ownTime = otherTime;
+                }
+            }
+        }
+
+        /**
+         * Record the current CPU usage in the historical buckets.
+         *
+         * Every bucket whose low boundary is less than or equal to the given
+         * CPU usage (in whole percents) gets its update time set to @p now.
+         *
+         * @param[in] rawCpuUsage The current CPU usage (1'000'000 corresponds to 100%)
+         * @param[in] now The current time
+         */
+        void UpdateCpuUsage(ui64 rawCpuUsage, TInstant now) {
+            // Whole percents via integer arithmetic: raw * 0.000001 * 100 == raw / 10000.
+            // This is exact at the bucket thresholds and avoids the floating-point
+            // round trip, whose narrowing cast is undefined for out-of-range values.
+            const ui64 percent = rawCpuUsage / 10000;
+
+            // Buckets are sorted by the low boundary (ascending),
+            // so stop at the first bucket that has not been reached
+            for (ui64 i = 0; i < Buckets.size(); ++i) {
+                if (percent < Buckets[i].LowBoundary) {
+                    return;
+                }
+
+                BucketUpdateTimes[i] = now;
+            }
+        }
+
+        /**
+         * Report the highest CPU usage level that has been seen after the given time.
+         *
+         * @note The result is not the actual peak CPU usage value.
+         *       It is one of the preset levels tracked by this class
+         *       (2%, 5%, 10%, 20%, 30% and 40%).
+         *
+         * @todo Fix the case when stats were not collected yet
+         *
+         * @param[in] since The time from which to calculate the peak CPU usage
+         *
+         * @return The peak CPU usage (as a percentage) since the given time
+         */
+        ui32 GetLatestMaxCpuUsagePercent(TInstant since) const {
+            // Walk the buckets from the highest threshold down and stop
+            // at the first one refreshed strictly after the given time
+            for (ui64 remaining = Buckets.size(); remaining > 0; --remaining) {
+                const ui64 idx = remaining - 1;
+
+                if (BucketUpdateTimes[idx] > since) {
+                    return Buckets[idx].EffectiveValue;
+                }
+            }
+
+            // Nothing was refreshed after the given time: fall back to the minimum level
+            return 2;
}
};
@@ -292,42 +375,19 @@ struct TPartitionStats {
// Tablet actor started at
TInstant StartTime;
- TTopUsage TopUsage;
+ TTopCpuUsage TopCpuUsage;
void SetCurrentRawCpuUsage(ui64 rawCpuUsage, TInstant now) {
CPU = rawCpuUsage;
- float percent = rawCpuUsage * 0.000001 * 100;
- if (percent >= 2)
- TopUsage.Last2PercentLoad = now;
- if (percent >= 5)
- TopUsage.Last5PercentLoad = now;
- if (percent >= 10)
- TopUsage.Last10PercentLoad = now;
- if (percent >= 20)
- TopUsage.Last20PercentLoad = now;
- if (percent >= 30)
- TopUsage.Last30PercentLoad = now;
+ TopCpuUsage.UpdateCpuUsage(rawCpuUsage, now);
}
ui64 GetCurrentRawCpuUsage() const {
return CPU;
}
- float GetLatestMaxCpuUsagePercent(TInstant since) const {
- // TODO: fix the case when stats were not collected yet
-
- if (TopUsage.Last30PercentLoad > since)
- return 40;
- if (TopUsage.Last20PercentLoad > since)
- return 30;
- if (TopUsage.Last10PercentLoad > since)
- return 20;
- if (TopUsage.Last5PercentLoad > since)
- return 10;
- if (TopUsage.Last2PercentLoad > since)
- return 5;
-
- return 2;
+ ui32 GetLatestMaxCpuUsagePercent(TInstant since) const {
+ return TopCpuUsage.GetLatestMaxCpuUsagePercent(since);
}
private:
diff --git a/ydb/core/tx/schemeshard/ut_partition_stats/ut_top_cpu_usage.cpp b/ydb/core/tx/schemeshard/ut_partition_stats/ut_top_cpu_usage.cpp
new file mode 100644
index 00000000000..2928f01f9d3
--- /dev/null
+++ b/ydb/core/tx/schemeshard/ut_partition_stats/ut_top_cpu_usage.cpp
@@ -0,0 +1,184 @@
+#include <ydb/core/tx/schemeshard/schemeshard_info_types.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NKikimr;
+using namespace NSchemeShard;
+
+namespace {
+
+/**
+ * Build a TPartitionStats::TTopCpuUsage container in which every bucket
+ * has its own distinct update time.
+ *
+ * Bucket i gets the time stamp (1000 + N - i) microseconds, where N is
+ * the number of buckets.
+ *
+ * @return The CPU usage container with all times set to unique values
+ */
+TPartitionStats::TTopCpuUsage MakeTopCpuUsageWithUniqueTimes() {
+    TPartitionStats::TTopCpuUsage result;
+    const ui64 bucketCount = result.BucketUpdateTimes.size();
+
+    // Buckets for higher CPU usage get older time stamps, which makes
+    // the time-based lookup requests easier to verify
+    for (ui64 idx = 0; idx < bucketCount; ++idx) {
+        result.BucketUpdateTimes[idx] = TInstant::MicroSeconds(1000 + bucketCount - idx);
+    }
+
+    return result;
+}
+
+/**
+ * Convert the TPartitionStats::TTopCpuUsage container to a pretty-printed string.
+ *
+ * Each line shows the low boundary of a bucket (in percents) and the time
+ * when the bucket was last updated (in microseconds).
+ *
+ * @param[in] top_cpu_usage The container to convert to a string
+ *
+ * @return The corresponding pretty-printed string
+ */
+TString PrintTopCpuUsage(const TPartitionStats::TTopCpuUsage& top_cpu_usage) {
+    auto builder = TStringBuilder() << "\nTopCpuUsage = [\n";
+
+    for (ui64 i = 0; i < top_cpu_usage.Buckets.size(); ++i) {
+        // A TInstant object must not go through varargs: print its numeric
+        // value with a conversion that matches the argument type exactly
+        builder << Sprintf(
+            " %2u%% -> %llu\n",
+            top_cpu_usage.Buckets[i].LowBoundary,
+            static_cast<unsigned long long>(top_cpu_usage.BucketUpdateTimes[i].MicroSeconds())
+        );
+    }
+
+    return builder << "]\n";
+}
+
+} // namespace <anonymous>
+
+/**
+ * Unit tests for the TPartitionStats::TTopCpuUsage class.
+ */
+Y_UNIT_TEST_SUITE(TSchemeShardPartitionStatsTopCpuUsageTest) {
+    /**
+     * Verify that TTopCpuUsage::Update() keeps the latest time for each bucket.
+     */
+    Y_UNIT_TEST(Update) {
+        TPartitionStats::TTopCpuUsage target = MakeTopCpuUsageWithUniqueTimes();
+
+        // Unset (== 0) and smaller values should be ignored, larger values should be kept
+        TPartitionStats::TTopCpuUsage source;
+        source.BucketUpdateTimes[1] = TInstant::MicroSeconds(2001);
+        source.BucketUpdateTimes[2] = TInstant::MicroSeconds(202);
+        source.BucketUpdateTimes[3] = TInstant::MicroSeconds(2003);
+
+        target.Update(source);
+
+        const std::array<TInstant, 5> expectedTimes = {{
+            TInstant::MicroSeconds(1005),
+            TInstant::MicroSeconds(2001),
+            TInstant::MicroSeconds(1003),
+            TInstant::MicroSeconds(2003),
+            TInstant::MicroSeconds(1001),
+        }};
+
+        UNIT_ASSERT_EQUAL_C(target.BucketUpdateTimes, expectedTimes, PrintTopCpuUsage(target));
+    }
+
+    /**
+     * Verify that TTopCpuUsage::UpdateCpuUsage() leaves every bucket untouched
+     * when the given CPU usage is below the lowest threshold.
+     */
+    Y_UNIT_TEST(UpdateCpuUsage_NoBuckets) {
+        const TInstant now = TInstant::MicroSeconds(123456);
+
+        TPartitionStats::TTopCpuUsage usage = MakeTopCpuUsageWithUniqueTimes();
+        usage.UpdateCpuUsage(10000 /* 1% */, now);
+
+        // All buckets must still hold their initial unique times
+        const std::array<TInstant, 5> expectedTimes = {{
+            TInstant::MicroSeconds(1005),
+            TInstant::MicroSeconds(1004),
+            TInstant::MicroSeconds(1003),
+            TInstant::MicroSeconds(1002),
+            TInstant::MicroSeconds(1001),
+        }};
+
+        UNIT_ASSERT_EQUAL_C(usage.BucketUpdateTimes, expectedTimes, PrintTopCpuUsage(usage));
+    }
+
+    /**
+     * Verify that TTopCpuUsage::UpdateCpuUsage() refreshes exactly the buckets
+     * whose thresholds are reached when only some thresholds are satisfied.
+     */
+    Y_UNIT_TEST(UpdateCpuUsage_SomeBuckets) {
+        const TInstant now = TInstant::MicroSeconds(123456);
+
+        TPartitionStats::TTopCpuUsage usage = MakeTopCpuUsageWithUniqueTimes();
+        usage.UpdateCpuUsage(150000 /* 15% */, now);
+
+        // The 2%, 5% and 10% buckets are refreshed, the 20% and 30% ones are not
+        const std::array<TInstant, 5> expectedTimes = {{
+            TInstant::MicroSeconds(123456),
+            TInstant::MicroSeconds(123456),
+            TInstant::MicroSeconds(123456),
+            TInstant::MicroSeconds(1002),
+            TInstant::MicroSeconds(1001),
+        }};
+
+        UNIT_ASSERT_EQUAL_C(usage.BucketUpdateTimes, expectedTimes, PrintTopCpuUsage(usage));
+    }
+
+    /**
+     * Verify that TTopCpuUsage::UpdateCpuUsage() refreshes every bucket
+     * when the given CPU usage reaches the highest threshold.
+     */
+    Y_UNIT_TEST(UpdateCpuUsage_AllBuckets) {
+        const TInstant now = TInstant::MicroSeconds(123456);
+
+        TPartitionStats::TTopCpuUsage usage = MakeTopCpuUsageWithUniqueTimes();
+        usage.UpdateCpuUsage(310000 /* 31% */, now);
+
+        // Every bucket must carry the new time stamp
+        const std::array<TInstant, 5> expectedTimes = {{
+            TInstant::MicroSeconds(123456),
+            TInstant::MicroSeconds(123456),
+            TInstant::MicroSeconds(123456),
+            TInstant::MicroSeconds(123456),
+            TInstant::MicroSeconds(123456),
+        }};
+
+        UNIT_ASSERT_EQUAL_C(usage.BucketUpdateTimes, expectedTimes, PrintTopCpuUsage(usage));
+    }
+
+    /**
+     * Verify that TTopCpuUsage::GetLatestMaxCpuUsagePercent() works correctly
+     * for all threshold values.
+     *
+     * The container holds the times 1005..1001 (microseconds) for the buckets
+     * from the lowest to the highest threshold, so each "since" value below
+     * selects a different highest bucket.
+     */
+    Y_UNIT_TEST(GetLatestMaxCpuUsagePercent) {
+        TPartitionStats::TTopCpuUsage top_cpu_usage = MakeTopCpuUsageWithUniqueTimes();
+
+        for (const auto [since, expectedCpuUsage] : std::array<std::pair<ui64, ui32>, 6>{
+            std::make_pair(1005, 2), // The default value - no bucket for this time stamp
+            std::make_pair(1000, 40),
+            std::make_pair(1001, 30),
+            std::make_pair(1002, 20),
+            std::make_pair(1003, 10),
+            std::make_pair(1004, 5),
+        }) {
+            const auto cpuUsage = top_cpu_usage.GetLatestMaxCpuUsagePercent(TInstant::MicroSeconds(since));
+
+            // The message is built with a type-safe stream: "since" is a 64-bit
+            // value and must not be passed to a printf-style "%u" conversion
+            UNIT_ASSERT_EQUAL_C(
+                cpuUsage,
+                expectedCpuUsage,
+                TStringBuilder()
+                    << "Received " << cpuUsage << "% for the time stamp " << since
+                    << ", expected " << expectedCpuUsage << "%"
+            );
+        }
+    }
+}
diff --git a/ydb/core/tx/schemeshard/ut_partition_stats/ya.make b/ydb/core/tx/schemeshard/ut_partition_stats/ya.make
new file mode 100644
index 00000000000..b8451019536
--- /dev/null
+++ b/ydb/core/tx/schemeshard/ut_partition_stats/ya.make
@@ -0,0 +1,21 @@
+UNITTEST_FOR(ydb/core/tx/schemeshard)
+
+FORK_SUBTESTS()
+
+SPLIT_FACTOR(60)
+
+SIZE(SMALL)
+
+SRCS(
+ ut_top_cpu_usage.cpp
+)
+
+PEERDIR(
+ library/cpp/testing/unittest
+ ydb/core/testlib/pg
+ yql/essentials/public/udf/service/exception_policy
+)
+
+YQL_LAST_ABI_VERSION()
+
+END()
diff --git a/ydb/core/tx/schemeshard/ya.make b/ydb/core/tx/schemeshard/ya.make
index 5fad8289552..000b5d311a0 100644
--- a/ydb/core/tx/schemeshard/ya.make
+++ b/ydb/core/tx/schemeshard/ya.make
@@ -37,6 +37,7 @@ RECURSE_FOR_TESTS(
ut_move_reboots
ut_olap
ut_olap_reboots
+ ut_partition_stats
ut_pq_reboots
ut_reboots
ut_replication