aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Alexander Gololobov <davenger@yandex-team.com> 2022-02-10 15:16:30 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 15:58:17 +0300
commit: dade832999b60a3b226b79a1a9099fad1404d433 (patch)
tree: b1ac7664dfc21809be64142fc0f3c7c9b5925730
parent: 27d9e7b019b403243c6e8f9a119e577447b6c729 (diff)
download: ydb-dade832999b60a3b226b79a1a9099fad1404d433.tar.gz
Log datashards with huge data (KIKIMR-14282)
ref:200e361ee9f09e9bf6d72617648bed1a7c6fa2e2
-rw-r--r-- ydb/core/tx/datashard/datashard.cpp | 4
-rw-r--r-- ydb/core/tx/datashard/datashard__stats.cpp | 40
-rw-r--r-- ydb/core/tx/datashard/datashard_impl.h | 3
3 files changed, 40 insertions, 7 deletions
diff --git a/ydb/core/tx/datashard/datashard.cpp b/ydb/core/tx/datashard/datashard.cpp
index 297c966a54..d86568b4a1 100644
--- a/ydb/core/tx/datashard/datashard.cpp
+++ b/ydb/core/tx/datashard/datashard.cpp
@@ -131,6 +131,8 @@ TDataShard::TDataShard(const TActorId &tablet, TTabletStorageInfo *info)
, PerShardReadSizeLimit(5368709120, 0, 107374182400)
, CpuUsageReportThreshlodPercent(60, -1, 146)
, CpuUsageReportIntervalSeconds(60, 0, 365*86400)
+ , HighDataSizeReportThreshlodBytes(10ull<<30, -1, Max<i64>())
+ , HighDataSizeReportIntervalSeconds(60, 0, 365*86400)
, DataTxProfileLogThresholdMs(0, 0, 86400000)
, DataTxProfileBufferThresholdMs(0, 0, 86400000)
, DataTxProfileBufferSize(0, 1000, 100)
@@ -290,6 +292,8 @@ void TDataShard::OnActivateExecutor(const TActorContext& ctx) {
AppData(ctx)->Icb->RegisterSharedControl(PerShardReadSizeLimit, "TxLimitControls.PerShardReadSizeLimit");
AppData(ctx)->Icb->RegisterSharedControl(CpuUsageReportThreshlodPercent, "DataShardControls.CpuUsageReportThreshlodPercent");
AppData(ctx)->Icb->RegisterSharedControl(CpuUsageReportIntervalSeconds, "DataShardControls.CpuUsageReportIntervalSeconds");
+ AppData(ctx)->Icb->RegisterSharedControl(HighDataSizeReportThreshlodBytes, "DataShardControls.HighDataSizeReportThreshlodBytes");
+ AppData(ctx)->Icb->RegisterSharedControl(HighDataSizeReportIntervalSeconds, "DataShardControls.HighDataSizeReportIntervalSeconds");
AppData(ctx)->Icb->RegisterSharedControl(ReadColumnsScanEnabled, "DataShardControls.ReadColumnsScanEnabled");
AppData(ctx)->Icb->RegisterSharedControl(ReadColumnsScanInUserPool, "DataShardControls.ReadColumnsScanInUserPool");
diff --git a/ydb/core/tx/datashard/datashard__stats.cpp b/ydb/core/tx/datashard/datashard__stats.cpp
index 436c4d886c..129a14dca5 100644
--- a/ydb/core/tx/datashard/datashard__stats.cpp
+++ b/ydb/core/tx/datashard/datashard__stats.cpp
@@ -165,10 +165,21 @@ void TDataShard::Handle(TEvDataShard::TEvGetTableStats::TPtr& ev, const TActorCo
Executor()->Execute(new TTxGetTableStats(this, ev), ctx);
}
+// Appends the path of every entry in `tables` to `names`, each wrapped in
+// square brackets. A ", " separator is written before an entry whenever
+// `names` already holds some text, so the items form a comma-separated list.
+template <class TTables>
+void ListTableNames(const TTables& tables, TStringBuilder& names) {
+    for (auto& entry : tables) {
+        const bool needsSeparator = !names.Empty();
+        if (needsSeparator) {
+            names << ", ";
+        }
+        names << "[" << entry.second->Path << "]";
+    }
+}
+
void TDataShard::Handle(TEvPrivate::TEvAsyncTableStats::TPtr& ev, const TActorContext& ctx) {
ui64 tableId = ev->Get()->TableId;
LOG_DEBUG(ctx, NKikimrServices::TX_DATASHARD, "Stats rebuilt at datashard %" PRIu64, TabletID());
+ i64 dataSize = 0;
if (TableInfos.contains(tableId)) {
const TUserTable& tableInfo = *TableInfos[tableId];
@@ -183,12 +194,32 @@ void TDataShard::Handle(TEvPrivate::TEvAsyncTableStats::TPtr& ev, const TActorCo
tableInfo.Stats.MemRowCount = ev->Get()->MemRowCount;
tableInfo.Stats.MemDataSize = ev->Get()->MemDataSize;
+ dataSize += tableInfo.Stats.DataStats.DataSize;
+
UpdateSearchHeightStats(tableInfo.Stats, ev->Get()->SearchHeight);
tableInfo.StatsUpdateInProgress = false;
SendPeriodicTableStats(ctx);
}
+
+ if (dataSize > HighDataSizeReportThreshlodBytes) {
+ TInstant now = AppData(ctx)->TimeProvider->Now();
+
+ if (LastDataSizeWarnTime + TDuration::Seconds(HighDataSizeReportIntervalSeconds) > now)
+ return;
+
+ LastDataSizeWarnTime = now;
+
+ TStringBuilder names;
+ ListTableNames(GetUserTables(), names);
+
+ LOG_ERROR_S(ctx, NKikimrServices::TX_DATASHARD, "Data size " << dataSize
+ << " is higher than threshold of " << (i64)HighDataSizeReportThreshlodBytes
+ << " at datashard: " << TabletID()
+ << " table: " << names
+ << " consider reconfiguring table partitioning settings");
+ }
}
@@ -344,7 +375,7 @@ void TDataShard::UpdateSearchHeightStats(TUserTable::TStats& stats, ui64 newSear
void TDataShard::UpdateFullCompactionTsMetric(TUserTable::TStats& stats) {
if (!TabletCounters)
return;
-
+
auto now = AppData()->TimeProvider->Now();
if (now < stats.LastFullCompaction) {
// extra sanity check
@@ -384,12 +415,7 @@ void TDataShard::CollectCpuUsage(const TActorContext &ctx) {
LastCpuWarnTime = now;
TStringBuilder names;
- for (auto &pr : GetUserTables()) {
- if (!names.Empty()) {
- names << ", ";
- }
- names << "[" << pr.second->Path << "]";
- }
+ ListTableNames(GetUserTables(), names);
LOG_ERROR_S(ctx, NKikimrServices::TX_DATASHARD, "CPU usage " << cpuPercent
<< "% is higher than threshold of " << (i64)CpuUsageReportThreshlodPercent
diff --git a/ydb/core/tx/datashard/datashard_impl.h b/ydb/core/tx/datashard/datashard_impl.h
index 3b76663036..11313474f8 100644
--- a/ydb/core/tx/datashard/datashard_impl.h
+++ b/ydb/core/tx/datashard/datashard_impl.h
@@ -1901,6 +1901,7 @@ private:
TInstant LastDbStatsUpdateTime;
TInstant LastDbStatsReportTime;
TInstant LastCpuWarnTime;
+ TInstant LastDataSizeWarnTime;
TActorId DbStatsReportPipe;
TActorId TableResolvePipe;
ui64 StatsReportRound = 0;
@@ -1978,6 +1979,8 @@ private:
TControlWrapper PerShardReadSizeLimit;
TControlWrapper CpuUsageReportThreshlodPercent;
TControlWrapper CpuUsageReportIntervalSeconds;
+ TControlWrapper HighDataSizeReportThreshlodBytes;
+ TControlWrapper HighDataSizeReportIntervalSeconds;
TControlWrapper DataTxProfileLogThresholdMs;
TControlWrapper DataTxProfileBufferThresholdMs;