diff options
author | Alexey Efimov <xeno@ydb.tech> | 2024-10-04 11:45:56 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-10-04 09:45:56 +0000 |
commit | f2805c67d45546a33b9605ea33bb2fde285498c9 (patch) | |
tree | 4798dafa24e087b4c234b2822eaadc35663b4adc | |
parent | a4dca6dd1ca3f5c8f68a8c86d833d586df9c3893 (diff) | |
download | ydb-f2805c67d45546a33b9605ea33bb2fde285498c9.tar.gz |
fix nodes grouping and uptime calculations (#10075)
-rw-r--r-- | ydb/core/viewer/json_handlers_viewer.cpp | 2 | ||||
-rw-r--r-- | ydb/core/viewer/protos/viewer.proto | 5 | ||||
-rw-r--r-- | ydb/core/viewer/viewer_nodes.h | 167 |
3 files changed, 116 insertions, 58 deletions
diff --git a/ydb/core/viewer/json_handlers_viewer.cpp b/ydb/core/viewer/json_handlers_viewer.cpp index 7b0acb1948..56180de9da 100644 --- a/ydb/core/viewer/json_handlers_viewer.cpp +++ b/ydb/core/viewer/json_handlers_viewer.cpp @@ -243,7 +243,7 @@ void InitViewerHealthCheckJsonHandler(TJsonHandlers& handlers) { } void InitViewerNodesJsonHandler(TJsonHandlers& handlers) { - handlers.AddHandler("/viewer/nodes", new TJsonHandler<TJsonNodes>(TJsonNodes::GetSwagger()), 8); + handlers.AddHandler("/viewer/nodes", new TJsonHandler<TJsonNodes>(TJsonNodes::GetSwagger()), 9); } void InitViewerACLJsonHandler(TJsonHandlers &jsonHandlers) { diff --git a/ydb/core/viewer/protos/viewer.proto b/ydb/core/viewer/protos/viewer.proto index de758460fe..a88f666630 100644 --- a/ydb/core/viewer/protos/viewer.proto +++ b/ydb/core/viewer/protos/viewer.proto @@ -525,6 +525,11 @@ message TNodeGroup { message TNodeInfo { uint32 NodeId = 1; + string Database = 2; + int32 UptimeSeconds = 3; // negative for disconnect time + bool Disconnected = 4; + float CpuUsage = 5; + float DiskSpaceUsage = 6; NKikimrWhiteboard.TSystemStateInfo SystemState = 10; repeated NKikimrWhiteboard.TPDiskStateInfo PDisks = 20; repeated NKikimrWhiteboard.TVDiskStateInfo VDisks = 30; diff --git a/ydb/core/viewer/viewer_nodes.h b/ydb/core/viewer/viewer_nodes.h index acbf7bfa26..ee7f42ceae 100644 --- a/ydb/core/viewer/viewer_nodes.h +++ b/ydb/core/viewer/viewer_nodes.h @@ -109,7 +109,7 @@ class TJsonNodes : public TViewerPipeClient { std::unordered_set<ui32> FilterGroupIds; std::optional<std::size_t> Offset; std::optional<std::size_t> Limit; - ui32 UptimeSeconds = 0; + int UptimeSeconds = 0; bool ProblemNodesOnly = false; TString Filter; bool AllWhiteboardFields = false; @@ -145,7 +145,7 @@ class TJsonNodes : public TViewerPipeClient { bool OffloadMerge = true; size_t OffloadMergeAttempts = 2; - using TGroupSortKey = std::variant<TString, ui64, float>; + using TGroupSortKey = std::variant<TString, ui64, float, int>; struct TNode { TEvInterconnect::TNodeInfo NodeInfo; @@ -167,6 +167,7 @@ class TJsonNodes : public TViewerPipeClient { bool HasDisks = false; bool GotDatabaseFromDatabaseBoardInfo = false; bool GotDatabaseFromResourceBoardInfo = false; + int UptimeSeconds = 0; TNodeId GetNodeId() const { return NodeInfo.NodeId; @@ -284,7 +285,7 @@ class TJsonNodes : public TViewerPipeClient { disconnectTime = std::max(disconnectTime, TInstant::MicroSeconds(pdisk.GetStatusChangeTimestamp())); } if (disconnectTime) { - SystemState.SetDisconnectTime(disconnectTime.Seconds()); + SystemState.SetDisconnectTime(disconnectTime.MilliSeconds()); } } } @@ -376,38 +377,56 @@ class TJsonNodes : public TViewerPipeClient { return TStringBuilder() << std::floor(std::clamp<float>(DiskSpaceUsage, 0, 100) / 5) * 5 << '%'; } - TString GetUptimeForGroup(TInstant now) const { - if (!Disconnected) { - auto uptime = static_cast<int>(now.Seconds()) - SystemState.GetStartTime(); - if (uptime < 60 * 10) { - return "uptime < 10m"; + TInstant GetStartTime() const { + return TInstant::MilliSeconds(SystemState.GetStartTime()); + } + + TInstant GetDisconnectTime() const { + return TInstant::MilliSeconds(SystemState.GetDisconnectTime()); + } + + int GetUptimeSeconds(TInstant now) const { + if (Disconnected) { + return static_cast<int>(GetDisconnectTime().Seconds()) - static_cast<int>(now.Seconds()); // negative for disconnected nodes + } else { + return static_cast<int>(now.Seconds()) - static_cast<int>(GetStartTime().Seconds()); + } + } + + void CalcUptimeSeconds(TInstant now) { + UptimeSeconds = GetUptimeSeconds(now); + } + + TString GetUptimeForGroup() const { + if (!Disconnected && UptimeSeconds >= 0) { + if (UptimeSeconds < 60 * 10) { + return "up <10m"; } - if (uptime < 60 * 60) { - return "uptime < 1h"; + if (UptimeSeconds < 60 * 60) { + return "up <1h"; } - if (uptime < 60 * 60 * 24) { - return "uptime < 24h"; + if (UptimeSeconds < 60 * 60 * 24) { + return "up <24h"; } - if (uptime < 60 * 60 * 24 * 7) { - return "uptime < 1 week"; + if (UptimeSeconds < 60 * 60 * 24 * 7) { + return "up 24h+"; } - return "uptime > 1 week"; + return "up 1 week+"; } else { if (SystemState.HasDisconnectTime()) { - auto downtime = static_cast<int>(now.Seconds()) - SystemState.GetDisconnectTime(); - if (downtime < 60 * 10) { - return "downtime < 10m"; + if (UptimeSeconds > -60 * 10) { + return "down <10m"; } - if (downtime < 60 * 60) { - return "downtime < 1h"; + if (UptimeSeconds > -60 * 60) { + return "down <1h"; } - if (downtime < 60 * 60 * 24) { - return "downtime < 24h"; + if (UptimeSeconds > -60 * 60 * 24) { + return "down <24h"; } - if (downtime < 60 * 60 * 24 * 7) { - return "downtime < 1 week"; + if (UptimeSeconds > -60 * 60 * 24 * 7) { + return "down 24h+"; } - return "downtime > 1 week"; + return "down 1 week+"; } else { return "disconnected"; } @@ -430,7 +449,7 @@ class TJsonNodes : public TViewerPipeClient { return SubDomainKey == subDomainKey; } - TString GetGroupName(ENodeFields groupBy, TInstant now) const { + TString GetGroupName(ENodeFields groupBy) const { TString groupName; switch (groupBy) { case ENodeFields::NodeId: @@ -458,7 +477,7 @@ class TJsonNodes : public TViewerPipeClient { groupName = ToString(MissingDisks); break; case ENodeFields::Uptime: - groupName = GetUptimeForGroup(now); + groupName = GetUptimeForGroup(); break; case ENodeFields::Version: groupName = GetVersionForGroup(); @@ -472,7 +491,7 @@ class TJsonNodes : public TViewerPipeClient { return groupName; } - TGroupSortKey GetGroupSortKey(ENodeFields groupBy, TInstant now) const { + TGroupSortKey GetGroupSortKey(ENodeFields groupBy) const { switch (groupBy) { case ENodeFields::NodeId: case ENodeFields::HostName: @@ -481,24 +500,25 @@ class TJsonNodes : public TViewerPipeClient { case ENodeFields::DC: case ENodeFields::Rack: case ENodeFields::Version: - return GetGroupName(groupBy, now); + return GetGroupName(groupBy); case ENodeFields::DiskSpaceUsage: return DiskSpaceUsage; case ENodeFields::Missing: return MissingDisks; case ENodeFields::Uptime: - return static_cast<ui64>(now.Seconds()) - (Disconnected ? SystemState.GetDisconnectTime() : SystemState.GetStartTime()); + return UptimeSeconds; default: return TString(); } } - void MergeFrom(const NKikimrWhiteboard::TSystemStateInfo& systemState) { + void MergeFrom(const NKikimrWhiteboard::TSystemStateInfo& systemState, TInstant now) { SystemState.MergeFrom(systemState); Cleanup(); CalcDatabase(); CalcCpuUsage(); CalcLoadAverage(); + CalcUptimeSeconds(now); } }; @@ -650,7 +670,7 @@ public: InitConfig(params); Timeout = FromStringWithDefault<ui32>(params.Get("timeout"), 10000); FieldsRequired.set(+ENodeFields::NodeId); - UptimeSeconds = FromStringWithDefault<ui32>(params.Get("uptime"), 0); + UptimeSeconds = FromStringWithDefault<int>(params.Get("uptime"), 0); ProblemNodesOnly = FromStringWithDefault<bool>(params.Get("problems_only"), ProblemNodesOnly); Filter = params.Get("filter"); if (UptimeSeconds || ProblemNodesOnly || !Filter.empty()) { @@ -993,10 +1013,9 @@ public: InvalidateNodes(); } if (UptimeSeconds > 0 && FieldsAvailable.test(+ENodeFields::SystemState)) { - ui64 limitMilliSeconds = TInstant::Now().MilliSeconds() - UptimeSeconds * 1000; TNodeView nodeView; for (TNode* node : NodeView) { - if (node->SystemState.GetStartTime() >= limitMilliSeconds) { + if (node->UptimeSeconds < UptimeSeconds) { nodeView.push_back(node); } } @@ -1008,19 +1027,11 @@ public: TVector<TString> filterWords = SplitString(Filter, " "); TNodeView nodeView; for (TNode* node : NodeView) { - bool match = false; for (const TString& word : filterWords) { - if (node->GetHostName().Contains(word)) { - match = true; - break; - } else if (::ToString(node->GetNodeId()).Contains(word)) { - match = true; - break; + if (node->GetHostName().Contains(word) || ::ToString(node->GetNodeId()).Contains(word)) { + nodeView.push_back(node); } } - if (match) { - nodeView.push_back(node); - } } NodeView.swap(nodeView); Filter.clear(); @@ -1028,9 +1039,8 @@ public: } if (!FilterGroup.empty() && FieldsAvailable.test(+FilterGroupBy)) { TNodeView nodeView; - auto now = TInstant::Now(); for (TNode* node : NodeView) { - if (node->GetGroupName(FilterGroupBy, now) == FilterGroup) { + if (node->GetGroupName(FilterGroupBy) == FilterGroup) { nodeView.push_back(node); } } @@ -1044,18 +1054,17 @@ public: } void GroupCollection() { - auto now = TInstant::Now(); std::unordered_map<TString, size_t> nodeGroups; NodeGroups.clear(); for (TNode* node : NodeView) { - auto gb = node->GetGroupName(GroupBy, now); + auto gb = node->GetGroupName(GroupBy); TNodeGroup* nodeGroup = nullptr; auto it = nodeGroups.find(gb); if (it == nodeGroups.end()) { nodeGroups.emplace(gb, NodeGroups.size()); nodeGroup = &NodeGroups.emplace_back(); nodeGroup->Name = gb; - nodeGroup->SortKey = node->GetGroupSortKey(GroupBy, now); + nodeGroup->SortKey = node->GetGroupSortKey(GroupBy); } else { nodeGroup = &NodeGroups[it->second]; } @@ -1070,16 +1079,20 @@ public: case ENodeFields::HostName: case ENodeFields::NodeName: case ENodeFields::Database: - case ENodeFields::DiskSpaceUsage: case ENodeFields::DC: case ENodeFields::Rack: - case ENodeFields::Missing: case ENodeFields::Uptime: - case ENodeFields::Version: GroupCollection(); SortCollection(NodeGroups, [](const TNodeGroup& nodeGroup) { return nodeGroup.SortKey; }); NeedGroup = false; break; + case ENodeFields::DiskSpaceUsage: + case ENodeFields::Missing: + case ENodeFields::Version: + GroupCollection(); + SortCollection(NodeGroups, [](const TNodeGroup& nodeGroup) { return nodeGroup.SortKey; }, true); + NeedGroup = false; + break; case ENodeFields::NodeInfo: case ENodeFields::SystemState: case ENodeFields::PDisks: @@ -1125,7 +1138,7 @@ public: NeedSort = false; break; case ENodeFields::Uptime: - SortCollection(NodeView, [](const TNode* node) { return node->SystemState.GetStartTime(); }, ReverseSort); + SortCollection(NodeView, [](const TNode* node) { return node->UptimeSeconds; }, ReverseSort); NeedSort = false; break; case ENodeFields::Memory: @@ -1543,6 +1556,7 @@ public: for (const auto& [hiveId, nodeStats] : HiveNodeStats) { if (nodeStats.IsDone()) { if (nodeStats.IsOk()) { + TInstant now = TInstant::Now(); for (const NKikimrHive::THiveNodeStats& nodeStats : nodeStats.Get()->Record.GetNodeStats()) { ui32 nodeId = nodeStats.GetNodeId(); TNode* node = FindNode(nodeId); @@ -1557,7 +1571,8 @@ public: } } if (nodeStats.HasLastAliveTimestamp()) { - node->SystemState.SetDisconnectTime(std::max(node->SystemState.GetDisconnectTime(), nodeStats.GetLastAliveTimestamp() / 1000)); // seconds + node->SystemState.SetDisconnectTime(std::max(node->SystemState.GetDisconnectTime(), nodeStats.GetLastAliveTimestamp())); // milliseconds + node->CalcUptimeSeconds(now); FieldsAvailable.set(+ENodeFields::DisconnectTime); } if (nodeStats.HasNodeDomain()) { @@ -1704,7 +1719,7 @@ public: viewerRequest->Record.MutableLocation()->AddNodeId(node->GetNodeId()); } TabletViewerResponse.emplace(nodeId, MakeViewerRequest(nodeId, viewerRequest.release())); - NodeBatches.emplace(nodeId, batch); + NodeBatches.emplace(nodeId, batch); // ignore second insert because they are the same ++WhiteboardStateRequestsInFlight; } } else { @@ -1767,15 +1782,21 @@ public: void ProcessWhiteboard() { if (FieldsNeeded(FieldsSystemState)) { + TInstant now = TInstant::Now(); std::unordered_set<TNodeId> removeNodes; for (const auto& [responseNodeId, response] : SystemViewerResponse) { if (response.IsOk()) { const auto& systemResponse(response.Get()->Record.GetSystemResponse()); + std::unordered_set<TNodeId> nodesWithoutData; + for (auto nodeId : response.Get()->Record.GetLocationResponded().GetNodeId()) { + nodesWithoutData.insert(nodeId); + } for (const auto& systemInfo : systemResponse.GetSystemStateInfo()) { TNodeId nodeId = systemInfo.GetNodeId(); TNode* node = FindNode(nodeId); if (node) { - node->MergeFrom(systemInfo); + nodesWithoutData.erase(nodeId); + node->MergeFrom(systemInfo, now); if (Database && node->Database) { if (node->Database != Database && (!SharedDatabase || node->Database != SharedDatabase)) { removeNodes.insert(nodeId); @@ -1783,6 +1804,17 @@ public: } } } + for (auto nodeId : nodesWithoutData) { + TNode* node = FindNode(nodeId); + if (node) { + node->DisconnectNode(); + } + } + } else { + TNode* node = FindNode(responseNodeId); + if (node) { + node->DisconnectNode(); + } } } for (const auto& [nodeId, response] : SystemStateResponse) { @@ -1791,7 +1823,7 @@ public: if (systemState.SystemStateInfoSize() > 0) { TNode* node = FindNode(nodeId); if (node) { - node->MergeFrom(systemState.GetSystemStateInfo(0)); + node->MergeFrom(systemState.GetSystemStateInfo(0), now); if (Database && node->Database) { if (node->Database != Database && (!SharedDatabase || node->Database != SharedDatabase)) { removeNodes.insert(nodeId); @@ -1799,6 +1831,11 @@ public: } } } + } else { + TNode* node = FindNode(nodeId); + if (node) { + node->DisconnectNode(); + } } } if (!removeNodes.empty()) { @@ -2029,6 +2066,7 @@ public: if (FieldsRequired.test(+ENodeFields::PDisks) || FieldsRequired.test(+ENodeFields::VDisks)) { node->RemapDisks(); } + node->CalcUptimeSeconds(TInstant::Now()); } TString error("NodeDisconnected"); { @@ -2279,6 +2317,21 @@ public: if (FieldsAvailable.test(+ENodeFields::NodeInfo)) { jsonNode.SetNodeId(node->GetNodeId()); } + if (node->Database) { + jsonNode.SetDatabase(node->Database); + } + if (node->UptimeSeconds) { + jsonNode.SetUptimeSeconds(node->UptimeSeconds); + } + if (node->Disconnected) { + jsonNode.SetDisconnected(node->Disconnected); + } + if (node->CpuUsage) { + jsonNode.SetCpuUsage(node->CpuUsage); + } + if (node->DiskSpaceUsage) { + jsonNode.SetDiskSpaceUsage(node->DiskSpaceUsage); + } if (FieldsAvailable.test(+ENodeFields::NodeInfo) || FieldsAvailable.test(+ENodeFields::SystemState)) { *jsonNode.MutableSystemState() = std::move(node->SystemState); } |