diff options
author | Alexander Gololobov <davenger@yandex-team.com> | 2022-02-10 15:16:30 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 15:58:17 +0300 |
commit | dade832999b60a3b226b79a1a9099fad1404d433 (patch) | |
tree | b1ac7664dfc21809be64142fc0f3c7c9b5925730 | |
parent | 27d9e7b019b403243c6e8f9a119e577447b6c729 (diff) | |
download | ydb-dade832999b60a3b226b79a1a9099fad1404d433.tar.gz |
Log datashards with huge data (KIKIMR-14282)
ref:200e361ee9f09e9bf6d72617648bed1a7c6fa2e2
-rw-r--r-- | ydb/core/tx/datashard/datashard.cpp | 4 | ||||
-rw-r--r-- | ydb/core/tx/datashard/datashard__stats.cpp | 40 | ||||
-rw-r--r-- | ydb/core/tx/datashard/datashard_impl.h | 3 |
3 files changed, 40 insertions, 7 deletions
```diff
diff --git a/ydb/core/tx/datashard/datashard.cpp b/ydb/core/tx/datashard/datashard.cpp
index 297c966a54..d86568b4a1 100644
--- a/ydb/core/tx/datashard/datashard.cpp
+++ b/ydb/core/tx/datashard/datashard.cpp
@@ -131,6 +131,8 @@ TDataShard::TDataShard(const TActorId &tablet, TTabletStorageInfo *info)
     , PerShardReadSizeLimit(5368709120, 0, 107374182400)
     , CpuUsageReportThreshlodPercent(60, -1, 146)
     , CpuUsageReportIntervalSeconds(60, 0, 365*86400)
+    , HighDataSizeReportThreshlodBytes(10ull<<30, -1, Max<i64>())
+    , HighDataSizeReportIntervalSeconds(60, 0, 365*86400)
     , DataTxProfileLogThresholdMs(0, 0, 86400000)
     , DataTxProfileBufferThresholdMs(0, 0, 86400000)
     , DataTxProfileBufferSize(0, 1000, 100)
@@ -290,6 +292,8 @@ void TDataShard::OnActivateExecutor(const TActorContext& ctx) {
     AppData(ctx)->Icb->RegisterSharedControl(PerShardReadSizeLimit, "TxLimitControls.PerShardReadSizeLimit");
     AppData(ctx)->Icb->RegisterSharedControl(CpuUsageReportThreshlodPercent, "DataShardControls.CpuUsageReportThreshlodPercent");
     AppData(ctx)->Icb->RegisterSharedControl(CpuUsageReportIntervalSeconds, "DataShardControls.CpuUsageReportIntervalSeconds");
+    AppData(ctx)->Icb->RegisterSharedControl(HighDataSizeReportThreshlodBytes, "DataShardControls.HighDataSizeReportThreshlodBytes");
+    AppData(ctx)->Icb->RegisterSharedControl(HighDataSizeReportIntervalSeconds, "DataShardControls.HighDataSizeReportIntervalSeconds");
     AppData(ctx)->Icb->RegisterSharedControl(ReadColumnsScanEnabled, "DataShardControls.ReadColumnsScanEnabled");
     AppData(ctx)->Icb->RegisterSharedControl(ReadColumnsScanInUserPool, "DataShardControls.ReadColumnsScanInUserPool");
diff --git a/ydb/core/tx/datashard/datashard__stats.cpp b/ydb/core/tx/datashard/datashard__stats.cpp
index 436c4d886c..129a14dca5 100644
--- a/ydb/core/tx/datashard/datashard__stats.cpp
+++ b/ydb/core/tx/datashard/datashard__stats.cpp
@@ -165,10 +165,21 @@ void TDataShard::Handle(TEvDataShard::TEvGetTableStats::TPtr& ev, const TActorCo
     Executor()->Execute(new TTxGetTableStats(this, ev), ctx);
 }
+template <class TTables>
+void ListTableNames(const TTables& tables, TStringBuilder& names) {
+    for (auto& t : tables) {
+        if (!names.Empty()) {
+            names << ", ";
+        }
+        names << "[" << t.second->Path << "]";
+    }
+}
+
 void TDataShard::Handle(TEvPrivate::TEvAsyncTableStats::TPtr& ev, const TActorContext& ctx) {
     ui64 tableId = ev->Get()->TableId;
     LOG_DEBUG(ctx, NKikimrServices::TX_DATASHARD, "Stats rebuilt at datashard %" PRIu64, TabletID());
+    i64 dataSize = 0;
     if (TableInfos.contains(tableId)) {
         const TUserTable& tableInfo = *TableInfos[tableId];
@@ -183,12 +194,32 @@ void TDataShard::Handle(TEvPrivate::TEvAsyncTableStats::TPtr& ev, const TActorCo
         tableInfo.Stats.MemRowCount = ev->Get()->MemRowCount;
         tableInfo.Stats.MemDataSize = ev->Get()->MemDataSize;
+        dataSize += tableInfo.Stats.DataStats.DataSize;
+
         UpdateSearchHeightStats(tableInfo.Stats, ev->Get()->SearchHeight);
         tableInfo.StatsUpdateInProgress = false;
         SendPeriodicTableStats(ctx);
     }
+
+    if (dataSize > HighDataSizeReportThreshlodBytes) {
+        TInstant now = AppData(ctx)->TimeProvider->Now();
+
+        if (LastDataSizeWarnTime + TDuration::Seconds(HighDataSizeReportIntervalSeconds) > now)
+            return;
+
+        LastDataSizeWarnTime = now;
+
+        TStringBuilder names;
+        ListTableNames(GetUserTables(), names);
+
+        LOG_ERROR_S(ctx, NKikimrServices::TX_DATASHARD, "Data size " << dataSize
+                    << " is higher than threshold of " << (i64)HighDataSizeReportThreshlodBytes
+                    << " at datashard: " << TabletID()
+                    << " table: " << names
+                    << " consider reconfiguring table partitioning settings");
+    }
 }
@@ -344,7 +375,7 @@ void TDataShard::UpdateSearchHeightStats(TUserTable::TStats& stats, ui64 newSear
 void TDataShard::UpdateFullCompactionTsMetric(TUserTable::TStats& stats) {
     if (!TabletCounters)
         return;
-
+
     auto now = AppData()->TimeProvider->Now();
     if (now < stats.LastFullCompaction) {
         // extra sanity check
@@ -384,12 +415,7 @@ void TDataShard::CollectCpuUsage(const TActorContext &ctx) {
         LastCpuWarnTime = now;
         TStringBuilder names;
-        for (auto &pr : GetUserTables()) {
-            if (!names.Empty()) {
-                names << ", ";
-            }
-            names << "[" << pr.second->Path << "]";
-        }
+        ListTableNames(GetUserTables(), names);
         LOG_ERROR_S(ctx, NKikimrServices::TX_DATASHARD, "CPU usage " << cpuPercent
                     << "% is higher than threshold of " << (i64)CpuUsageReportThreshlodPercent
diff --git a/ydb/core/tx/datashard/datashard_impl.h b/ydb/core/tx/datashard/datashard_impl.h
index 3b76663036..11313474f8 100644
--- a/ydb/core/tx/datashard/datashard_impl.h
+++ b/ydb/core/tx/datashard/datashard_impl.h
@@ -1901,6 +1901,7 @@ private:
     TInstant LastDbStatsUpdateTime;
     TInstant LastDbStatsReportTime;
     TInstant LastCpuWarnTime;
+    TInstant LastDataSizeWarnTime;
     TActorId DbStatsReportPipe;
     TActorId TableResolvePipe;
     ui64 StatsReportRound = 0;
@@ -1978,6 +1979,8 @@ private:
     TControlWrapper PerShardReadSizeLimit;
     TControlWrapper CpuUsageReportThreshlodPercent;
     TControlWrapper CpuUsageReportIntervalSeconds;
+    TControlWrapper HighDataSizeReportThreshlodBytes;
+    TControlWrapper HighDataSizeReportIntervalSeconds;
     TControlWrapper DataTxProfileLogThresholdMs;
     TControlWrapper DataTxProfileBufferThresholdMs;
```