aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Aleksei Borzenkov <snaury@ydb.tech> 2024-12-25 15:25:29 +0300
committer GitHub <noreply@github.com> 2024-12-25 12:25:29 +0000
commit 57f6f6331a567e252cb00980c880e1e32ac25af1 (patch)
tree 02f560d01a6ef685b39bd1d792cc51cb556a87a8
parent 1049a71ceec5aa03d8fe542b2247a3c75ba76a2b (diff)
download ydb-57f6f6331a567e252cb00980c880e1e32ac25af1.tar.gz
Add db counters for uncommitted changes and suspicious commits (#12966)
-rw-r--r-- ydb/core/protos/counters_datashard.proto 1
-rw-r--r-- ydb/core/tablet_flat/flat_database.cpp 5
-rw-r--r-- ydb/core/tablet_flat/flat_database.h 5
-rw-r--r-- ydb/core/tablet_flat/flat_dbase_misc.h 2
-rw-r--r-- ydb/core/tablet_flat/flat_dbase_naked.h 10
-rw-r--r-- ydb/core/tablet_flat/flat_executor.cpp 10
-rw-r--r-- ydb/core/tablet_flat/flat_executor_counters.h 5
-rw-r--r-- ydb/core/tablet_flat/flat_table.cpp 46
-rw-r--r-- ydb/core/tablet_flat/flat_table.h 7
-rw-r--r-- ydb/core/tablet_flat/flat_table_committed.h 30
-rw-r--r-- ydb/core/tablet_flat/flat_table_stats.h 26
-rw-r--r-- ydb/core/tx/datashard/datashard_user_db.cpp 5
-rw-r--r-- ydb/core/tx/datashard/volatile_tx.cpp 4
13 files changed, 148 insertions, 8 deletions
diff --git a/ydb/core/protos/counters_datashard.proto b/ydb/core/protos/counters_datashard.proto
index bb3df89a3f..97cc48631a 100644
--- a/ydb/core/protos/counters_datashard.proto
+++ b/ydb/core/protos/counters_datashard.proto
@@ -153,6 +153,7 @@ enum ECumulativeCounters {
COUNTER_WRITE_BYTES = 111 [(CounterOpts) = {Name: "WriteBytes"}];
COUNTER_WRITE_DISK_SPACE_EXHAUSTED = 112 [(CounterOpts) = {Name: "WriteDiskSpaceExhausted"}];
COUNTER_PREPARE_DISK_SPACE_EXHAUSTED = 113 [(CounterOpts) = {Name: "PrepareSpaceExhausted"}];
+ COUNTER_REMOVED_COMMITTED_TXS = 114 [(CounterOpts) = {Name: "RemovedCommittedTxs"}];
}
enum EPercentileCounters {
diff --git a/ydb/core/tablet_flat/flat_database.cpp b/ydb/core/tablet_flat/flat_database.cpp
index 6f19e4b56b..acc5bd968a 100644
--- a/ydb/core/tablet_flat/flat_database.cpp
+++ b/ydb/core/tablet_flat/flat_database.cpp
@@ -518,6 +518,11 @@ const TDbStats& TDatabase::Counters() const noexcept
return DatabaseImpl->Stats;
}
+TDbRuntimeStats TDatabase::RuntimeCounters() const noexcept // Returns, by value, the stats produced by DatabaseImpl->GetRuntimeStats().
+{
+ return DatabaseImpl->GetRuntimeStats(); // Computed on every call; nothing is cached at this level.
+}
+
void TDatabase::UpdateApproximateFreeSharesByChannel(const THashMap<ui32, float>& approximateFreeSpaceShareByChannel)
{
for (auto& [channel, value] : approximateFreeSpaceShareByChannel) {
diff --git a/ydb/core/tablet_flat/flat_database.h b/ydb/core/tablet_flat/flat_database.h
index f8f80cb06d..373e50f371 100644
--- a/ydb/core/tablet_flat/flat_database.h
+++ b/ydb/core/tablet_flat/flat_database.h
@@ -42,7 +42,6 @@ class TDatabase {
public:
using TMemGlobs = TVector<NPageCollection::TMemGlob>;
using TCookieAllocator = NPageCollection::TCookieAllocator;
- using TCounters = TDbStats;
struct TProd {
THolder<TChange> Change;
@@ -221,7 +220,9 @@ public:
ui64 GetTableIndexSize(ui32 table) const;
ui64 GetTableSearchHeight(ui32 table) const;
ui64 EstimateRowSize(ui32 table) const;
- const TCounters& Counters() const noexcept;
+ const TDbStats& Counters() const noexcept;
+ TDbRuntimeStats RuntimeCounters() const noexcept;
+
void UpdateApproximateFreeSharesByChannel(const THashMap<ui32, float>& approximateFreeSpaceShareByChannel);
TString SnapshotToLog(ui32 table, TTxStamp);
diff --git a/ydb/core/tablet_flat/flat_dbase_misc.h b/ydb/core/tablet_flat/flat_dbase_misc.h
index 25b035ebda..81fab73400 100644
--- a/ydb/core/tablet_flat/flat_dbase_misc.h
+++ b/ydb/core/tablet_flat/flat_dbase_misc.h
@@ -32,5 +32,7 @@ namespace NTable {
THashMap<ui32, float> NormalizedFreeSpaceShareByChannel;
};
+ using TDbRuntimeStats = TTableRuntimeStats; // Database-level runtime stats reuse the per-table stats type.
+
}
}
diff --git a/ydb/core/tablet_flat/flat_dbase_naked.h b/ydb/core/tablet_flat/flat_dbase_naked.h
index 5323ca91d3..043fd7a386 100644
--- a/ydb/core/tablet_flat/flat_dbase_naked.h
+++ b/ydb/core/tablet_flat/flat_dbase_naked.h
@@ -781,6 +781,16 @@ namespace NTable {
}
}
+ public:
+ TDbRuntimeStats GetRuntimeStats() const { // Sums RuntimeStats() of every entry in Tables into one aggregate and returns it by value.
+ TDbRuntimeStats stats; // Accumulator for the per-table sums.
+ for (auto& pr : Tables) { // Visits every table on each call.
+ // TODO: use a lazy aggregate to balance many idle tables vs frequent updates
+ stats += pr.second->RuntimeStats(); // pr.second is the table object; its stats are added into the total.
+ }
+ return stats;
+ }
+
private:
const TIntrusivePtr<TKeyRangeCacheNeedGCList> GCList;
const TTxStamp Weak; /* db bootstrap upper stamp */
diff --git a/ydb/core/tablet_flat/flat_executor.cpp b/ydb/core/tablet_flat/flat_executor.cpp
index 268609fdc7..8d6f3c3360 100644
--- a/ydb/core/tablet_flat/flat_executor.cpp
+++ b/ydb/core/tablet_flat/flat_executor.cpp
@@ -3565,6 +3565,16 @@ void TExecutor::UpdateCounters(const TActorContext &ctx) {
Counters->Simple()[TExecutorCounters::USED_TABLET_MEMORY].Set(UsedTabletMemory);
}
+ // Runtime stats related to uncommitted changes
+ auto runtimeCounters = Database->RuntimeCounters();
+ {
+ Counters->Simple()[TExecutorCounters::DB_OPEN_TX_COUNT].Set(runtimeCounters.OpenTxCount);
+ Counters->Simple()[TExecutorCounters::DB_TXS_WITH_DATA_COUNT].Set(runtimeCounters.TxsWithDataCount);
+ Counters->Simple()[TExecutorCounters::DB_COMMITTED_TX_COUNT].Set(runtimeCounters.CommittedTxCount);
+ Counters->Simple()[TExecutorCounters::DB_REMOVED_TX_COUNT].Set(runtimeCounters.RemovedTxCount);
+ Counters->Simple()[TExecutorCounters::DB_REMOVED_COMMITTED_TXS].Set(runtimeCounters.RemovedCommittedTxs);
+ }
+
if (CommitManager) /* exists only on leader, mostly storage usage data */ {
auto redo = LogicRedo->LogStats();
Counters->Simple()[TExecutorCounters::LOG_REDO_COUNT].Set(redo.Items);
diff --git a/ydb/core/tablet_flat/flat_executor_counters.h b/ydb/core/tablet_flat/flat_executor_counters.h
index 2f4b87c73b..379af6b829 100644
--- a/ydb/core/tablet_flat/flat_executor_counters.h
+++ b/ydb/core/tablet_flat/flat_executor_counters.h
@@ -65,6 +65,11 @@ namespace NTabletFlatExecutor {
XX(DB_FLAT_INDEX_BYTES, "DbFlatIndexBytes") \
XX(DB_B_TREE_INDEX_BYTES, "DbBTreeIndexBytes") \
XX(CACHE_TOTAL_USED, "CacheTotalUsed") \
+ XX(DB_OPEN_TX_COUNT, "DbOpenTxCount") \
+ XX(DB_TXS_WITH_DATA_COUNT, "DbTxsWithDataCount") \
+ XX(DB_COMMITTED_TX_COUNT, "DbCommittedTxCount") \
+ XX(DB_REMOVED_TX_COUNT, "DbRemovedTxCount") \
+ XX(DB_REMOVED_COMMITTED_TXS, "DbRemovedCommittedTxs") \
// don't change order!
#define FLAT_EXECUTOR_CUMULATIVE_COUNTERS_MAP(XX) \
diff --git a/ydb/core/tablet_flat/flat_table.cpp b/ydb/core/tablet_flat/flat_table.cpp
index 3e9adf1dff..16209def28 100644
--- a/ydb/core/tablet_flat/flat_table.cpp
+++ b/ydb/core/tablet_flat/flat_table.cpp
@@ -632,7 +632,11 @@ void TTable::Merge(TIntrusiveConstPtr<TTxStatusPart> txStatus) noexcept
if (const auto* prev = CommittedTransactions.Find(txId); Y_LIKELY(!prev) || *prev > rowVersion) {
CommittedTransactions.Add(txId, rowVersion);
if (!prev) {
- RemovedTransactions.Remove(txId);
+ if (RemovedTransactions.Remove(txId)) {
+ // Transaction was in a removed set and now it's committed
+ // This is not an error in some cases, but may be suspicious
+ RemovedCommittedTxs++;
+ }
}
}
if (!TxRefs.contains(txId)) {
@@ -645,6 +649,10 @@ void TTable::Merge(TIntrusiveConstPtr<TTxStatusPart> txStatus) noexcept
const ui64 txId = item.GetTxId();
if (const auto* prev = CommittedTransactions.Find(txId); Y_LIKELY(!prev)) {
RemovedTransactions.Add(txId);
+ } else {
+ // Transaction is in a committed set but also removed
+ // This is not an error in some cases, but may be suspicious
+ RemovedCommittedTxs++;
}
if (!TxRefs.contains(txId)) {
CheckTransactions.insert(txId);
@@ -944,7 +952,11 @@ void TTable::CommitTx(ui64 txId, TRowVersion rowVersion)
if (RollbackState && RemovedTransactions.Contains(txId)) {
RollbackOps.emplace_back(TRollbackAddRemovedTx{ txId });
}
- RemovedTransactions.Remove(txId);
+ if (RemovedTransactions.Remove(txId)) {
+ // Transaction was in a removed set and now it's committed
+ // This is not an error in some cases, but may be suspicious
+ RemovedCommittedTxs++;
+ }
}
if (auto it = OpenTxs.find(txId); it != OpenTxs.end()) {
if (RollbackState) {
@@ -982,6 +994,10 @@ void TTable::RemoveTx(ui64 txId)
}
OpenTxs.erase(it);
}
+ } else {
+ // Transaction is in a committed set but also removed
+ // This is not an error in some cases, but may be suspicious
+ RemovedCommittedTxs++;
}
}
@@ -1015,6 +1031,32 @@ size_t TTable::GetOpenTxCount() const
return OpenTxs.size();
}
+size_t TTable::GetTxsWithDataCount() const // Returns the number of entries currently held in TxRefs.
+{
+ return TxRefs.size();
+}
+
+size_t TTable::GetCommittedTxCount() const // Returns the number of entries currently held in CommittedTransactions.
+{
+ return CommittedTransactions.Size();
+}
+
+size_t TTable::GetRemovedTxCount() const // Returns the number of entries currently held in RemovedTransactions.
+{
+ return RemovedTransactions.Size();
+}
+
+TTableRuntimeStats TTable::RuntimeStats() const noexcept // Builds a by-value snapshot of this table's transaction bookkeeping sizes.
+{
+ return TTableRuntimeStats{
+ .OpenTxCount = OpenTxs.size(), // current size of OpenTxs
+ .TxsWithDataCount = TxRefs.size(), // current size of TxRefs
+ .CommittedTxCount = CommittedTransactions.Size(), // current size of CommittedTransactions
+ .RemovedTxCount = RemovedTransactions.Size(), // current size of RemovedTransactions
+ .RemovedCommittedTxs = RemovedCommittedTxs, // running count of removed/committed overlaps, copied as-is
+ };
+}
+
TMemTable& TTable::MemTable()
{
if (!Mutable) {
diff --git a/ydb/core/tablet_flat/flat_table.h b/ydb/core/tablet_flat/flat_table.h
index 24af7c23d3..493a5ed40d 100644
--- a/ydb/core/tablet_flat/flat_table.h
+++ b/ydb/core/tablet_flat/flat_table.h
@@ -184,6 +184,9 @@ public:
const absl::flat_hash_set<ui64>& GetOpenTxs() const;
size_t GetOpenTxCount() const;
+ size_t GetTxsWithDataCount() const;
+ size_t GetCommittedTxCount() const;
+ size_t GetRemovedTxCount() const;
TPartView GetPartView(const TLogoBlobID &bundle) const
{
@@ -240,6 +243,8 @@ public:
return Stat_;
}
+ TTableRuntimeStats RuntimeStats() const noexcept;
+
ui64 GetMemSize(TEpoch epoch = TEpoch::Max()) const noexcept
{
if (Y_LIKELY(epoch == TEpoch::Max())) {
@@ -364,6 +369,8 @@ private:
TTransactionSet DecidedTransactions;
TIntrusivePtr<ITableObserver> TableObserver;
+ ui64 RemovedCommittedTxs = 0;
+
private:
struct TRollbackRemoveTxRef {
ui64 TxId;
diff --git a/ydb/core/tablet_flat/flat_table_committed.h b/ydb/core/tablet_flat/flat_table_committed.h
index 8df020d59f..9080d80cd0 100644
--- a/ydb/core/tablet_flat/flat_table_committed.h
+++ b/ydb/core/tablet_flat/flat_table_committed.h
@@ -226,9 +226,20 @@ namespace NTable {
Unshare()[txId] = value;
}
- void Remove(ui64 txId) {
+ bool Remove(ui64 txId) { // Erases txId if present; returns true only when an entry was actually removed.
if (State_ && State_->contains(txId)) { // Unshare() is reached only when State_ is set and already holds txId.
Unshare().erase(txId);
+ return true;
+ } else {
+ return false; // State_ is unset or txId is absent: nothing is modified.
+ }
+ }
+
+ size_t Size() const { // Returns the number of entries in State_, or 0 when State_ is not set.
+ if (State_) {
+ return State_->size();
+ } else {
+ return 0; // No state object, so the container is treated as empty.
}
}
@@ -345,13 +356,24 @@ namespace NTable {
State_.Reset();
}
- void Add(ui64 txId) {
- Unshare().insert(txId);
+ bool Add(ui64 txId) { // Inserts txId via Unshare() and returns the insert result's .second flag.
+ return Unshare().insert(txId).second; // NOTE(review): presumably true only when txId was newly inserted - confirm against the container type.
}
- void Remove(ui64 txId) {
+ bool Remove(ui64 txId) { // Erases txId if present; returns true only when an entry was actually removed.
if (State_ && State_->contains(txId)) { // Unshare() is reached only when State_ is set and already holds txId.
Unshare().erase(txId);
+ return true;
+ } else {
+ return false; // State_ is unset or txId is absent: nothing is modified.
+ }
+ }
+
+ size_t Size() const { // Returns the number of entries in State_, or 0 when State_ is not set.
+ if (State_) {
+ return State_->size();
+ } else {
+ return 0; // No state object, so the container is treated as empty.
}
}
diff --git a/ydb/core/tablet_flat/flat_table_stats.h b/ydb/core/tablet_flat/flat_table_stats.h
index 4d735d3fcc..16e02ccc00 100644
--- a/ydb/core/tablet_flat/flat_table_stats.h
+++ b/ydb/core/tablet_flat/flat_table_stats.h
@@ -55,5 +55,31 @@ namespace NTable {
ui64 MemDataWaste = 0;
};
+ struct TTableRuntimeStats { // Plain aggregate of per-table transaction counters; all fields start at zero.
+ ui64 OpenTxCount = 0; // filled from OpenTxs.size() by TTable::RuntimeStats()
+ ui64 TxsWithDataCount = 0; // filled from TxRefs.size()
+ ui64 CommittedTxCount = 0; // filled from CommittedTransactions.Size()
+ ui64 RemovedTxCount = 0; // filled from RemovedTransactions.Size()
+ ui64 RemovedCommittedTxs = 0; // times a tx was seen as both removed and committed (incremented in Merge/CommitTx/RemoveTx)
+
+ TTableRuntimeStats& operator+=(const TTableRuntimeStats& s) noexcept { // Field-wise addition; returns *this for chaining.
+ OpenTxCount += s.OpenTxCount;
+ TxsWithDataCount += s.TxsWithDataCount;
+ CommittedTxCount += s.CommittedTxCount;
+ RemovedTxCount += s.RemovedTxCount;
+ RemovedCommittedTxs += s.RemovedCommittedTxs;
+ return *this;
+ }
+
+ TTableRuntimeStats& operator-=(const TTableRuntimeStats& s) noexcept { // Field-wise subtraction; no underflow check is performed.
+ OpenTxCount -= s.OpenTxCount;
+ TxsWithDataCount -= s.TxsWithDataCount;
+ CommittedTxCount -= s.CommittedTxCount;
+ RemovedTxCount -= s.RemovedTxCount;
+ RemovedCommittedTxs -= s.RemovedCommittedTxs;
+ return *this;
+ }
+ };
+
}
}
diff --git a/ydb/core/tx/datashard/datashard_user_db.cpp b/ydb/core/tx/datashard/datashard_user_db.cpp
index a82e0ef9db..0098c16588 100644
--- a/ydb/core/tx/datashard/datashard_user_db.cpp
+++ b/ydb/core/tx/datashard/datashard_user_db.cpp
@@ -321,6 +321,11 @@ void TDataShardUserDb::CommitChanges(const TTableId& tableId, ui64 lockId, const
Y_VERIFY_S(localTid, "Unexpected failure to find table " << tableId << " in datashard " << Self.TabletID());
if (!Db.HasOpenTx(localTid, lockId)) {
+ if (Db.HasRemovedTx(localTid, lockId)) {
+ LOG_CRIT_S(*TlsActivationContext, NKikimrServices::TX_DATASHARD,
+ "Committing removed changes lockId# " << lockId << " tid# " << localTid << " shard# " << Self.TabletID());
+ Self.IncCounter(COUNTER_REMOVED_COMMITTED_TXS);
+ }
return;
}
diff --git a/ydb/core/tx/datashard/volatile_tx.cpp b/ydb/core/tx/datashard/volatile_tx.cpp
index 7712a3a71b..9e0271c258 100644
--- a/ydb/core/tx/datashard/volatile_tx.cpp
+++ b/ydb/core/tx/datashard/volatile_tx.cpp
@@ -64,6 +64,10 @@ namespace NKikimr::NDataShard {
if (txc.DB.HasOpenTx(tid, commitTxId)) {
txc.DB.CommitTx(tid, commitTxId, info->Version);
Self->GetConflictsCache().GetTableCache(tid).RemoveUncommittedWrites(commitTxId, txc.DB);
+ } else if (txc.DB.HasRemovedTx(tid, commitTxId)) {
+ LOG_CRIT_S(*TlsActivationContext, NKikimrServices::TX_DATASHARD,
+ "Committing removed changes txId# " << commitTxId << " tid# " << tid << " shard# " << Self->TabletID());
+ Self->IncCounter(COUNTER_REMOVED_COMMITTED_TXS);
}
}
}