diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/actors/interconnect | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/actors/interconnect')
78 files changed, 13871 insertions, 0 deletions
diff --git a/library/cpp/actors/interconnect/channel_scheduler.h b/library/cpp/actors/interconnect/channel_scheduler.h new file mode 100644 index 0000000000..551a4cb61a --- /dev/null +++ b/library/cpp/actors/interconnect/channel_scheduler.h @@ -0,0 +1,120 @@ +#pragma once + +#include "interconnect_channel.h" +#include "event_holder_pool.h" + +#include <memory> + +namespace NActors { + + class TChannelScheduler { + const ui32 PeerNodeId; + std::array<std::optional<TEventOutputChannel>, 16> ChannelArray; + THashMap<ui16, TEventOutputChannel> ChannelMap; + std::shared_ptr<IInterconnectMetrics> Metrics; + TEventHolderPool& Pool; + const ui32 MaxSerializedEventSize; + const TSessionParams Params; + + struct THeapItem { + TEventOutputChannel *Channel; + ui64 WeightConsumed = 0; + + friend bool operator <(const THeapItem& x, const THeapItem& y) { + return x.WeightConsumed > y.WeightConsumed; + } + }; + + std::vector<THeapItem> Heap; + + public: + TChannelScheduler(ui32 peerNodeId, const TChannelsConfig& predefinedChannels, + std::shared_ptr<IInterconnectMetrics> metrics, TEventHolderPool& pool, ui32 maxSerializedEventSize, + TSessionParams params) + : PeerNodeId(peerNodeId) + , Metrics(std::move(metrics)) + , Pool(pool) + , MaxSerializedEventSize(maxSerializedEventSize) + , Params(std::move(params)) + { + for (const auto& item : predefinedChannels) { + GetOutputChannel(item.first); + } + } + + TEventOutputChannel *PickChannelWithLeastConsumedWeight() { + Y_VERIFY(!Heap.empty()); + return Heap.front().Channel; + } + + void AddToHeap(TEventOutputChannel& channel, ui64 counter) { + if (channel.IsWorking()) { + ui64 weight = channel.WeightConsumedOnPause; + weight -= Min(weight, counter - channel.EqualizeCounterOnPause); + Heap.push_back(THeapItem{&channel, weight}); + std::push_heap(Heap.begin(), Heap.end()); + } + } + + void FinishPick(ui64 weightConsumed, ui64 counter) { + std::pop_heap(Heap.begin(), Heap.end()); + auto& item = Heap.back(); + item.WeightConsumed += 
weightConsumed; + if (item.Channel->IsWorking()) { // reschedule + std::push_heap(Heap.begin(), Heap.end()); + } else { // remove from heap + item.Channel->EqualizeCounterOnPause = counter; + item.Channel->WeightConsumedOnPause = item.WeightConsumed; + Heap.pop_back(); + } + } + + TEventOutputChannel& GetOutputChannel(ui16 channel) { + if (channel < ChannelArray.size()) { + auto& res = ChannelArray[channel]; + if (Y_UNLIKELY(!res)) { + res.emplace(Pool, channel, PeerNodeId, MaxSerializedEventSize, Metrics, + Params); + } + return *res; + } else { + auto it = ChannelMap.find(channel); + if (Y_UNLIKELY(it == ChannelMap.end())) { + it = ChannelMap.emplace(std::piecewise_construct, std::forward_as_tuple(channel), + std::forward_as_tuple(Pool, channel, PeerNodeId, MaxSerializedEventSize, + Metrics, Params)).first; + } + return it->second; + } + } + + ui64 Equalize() { + if (Heap.empty()) { + return 0; // nothing to do here -- no working channels + } + + // find the minimum consumed weight among working channels and then adjust weights + ui64 min = Max<ui64>(); + for (THeapItem& item : Heap) { + min = Min(min, item.WeightConsumed); + } + for (THeapItem& item : Heap) { + item.WeightConsumed -= min; + } + return min; + } + + template<typename TCallback> + void ForEach(TCallback&& callback) { + for (auto& channel : ChannelArray) { + if (channel) { + callback(*channel); + } + } + for (auto& [id, channel] : ChannelMap) { + callback(channel); + } + } + }; + +} // NActors diff --git a/library/cpp/actors/interconnect/event_filter.h b/library/cpp/actors/interconnect/event_filter.h new file mode 100644 index 0000000000..47dabf5f16 --- /dev/null +++ b/library/cpp/actors/interconnect/event_filter.h @@ -0,0 +1,72 @@ +#pragma once + +#include <library/cpp/actors/core/event.h> + +namespace NActors { + + enum class ENodeClass { + SYSTEM, + LOCAL_TENANT, + PEER_TENANT, + COUNT + }; + + class TEventFilter : TNonCopyable { + using TRouteMask = ui16; + + TVector<TVector<TRouteMask>> 
ScopeRoutes; + + public: + TEventFilter() + : ScopeRoutes(65536) + {} + + void RegisterEvent(ui32 type, TRouteMask routes) { + auto& evSpaceIndex = ScopeRoutes[type >> 16]; + const ui16 subtype = type & 65535; + size_t size = (subtype + 512) & ~511; + if (evSpaceIndex.size() < size) { + evSpaceIndex.resize(size); + } + evSpaceIndex[subtype] = routes; + } + + bool CheckIncomingEvent(const IEventHandle& ev, const TScopeId& localScopeId) const { + TRouteMask routes = 0; + if (const auto& evSpaceIndex = ScopeRoutes[ev.Type >> 16]) { + const ui16 subtype = ev.Type & 65535; + routes = subtype < evSpaceIndex.size() ? evSpaceIndex[subtype] : 0; + } else { + routes = ~TRouteMask(); // allow unfilled event spaces by default + } + return routes & MakeRouteMask(GetNodeClass(ev.OriginScopeId, localScopeId), GetNodeClass(localScopeId, ev.OriginScopeId)); + } + + static ENodeClass GetNodeClass(const TScopeId& scopeId, const TScopeId& localScopeId) { + if (scopeId.first == 0) { + // system scope, or null scope + return scopeId.second ? 
ENodeClass::SYSTEM : ENodeClass::COUNT; + } else if (scopeId == localScopeId) { + return ENodeClass::LOCAL_TENANT; + } else { + return ENodeClass::PEER_TENANT; + } + } + + static TRouteMask MakeRouteMask(ENodeClass from, ENodeClass to) { + if (from == ENodeClass::COUNT || to == ENodeClass::COUNT) { + return 0; + } + return 1U << (static_cast<unsigned>(from) * static_cast<unsigned>(ENodeClass::COUNT) + static_cast<unsigned>(to)); + } + + static TRouteMask MakeRouteMask(std::initializer_list<std::pair<ENodeClass, ENodeClass>> items) { + TRouteMask mask = 0; + for (const auto& p : items) { + mask |= MakeRouteMask(p.first, p.second); + } + return mask; + } + }; + +} // NActors diff --git a/library/cpp/actors/interconnect/event_holder_pool.h b/library/cpp/actors/interconnect/event_holder_pool.h new file mode 100644 index 0000000000..b6090a3bc8 --- /dev/null +++ b/library/cpp/actors/interconnect/event_holder_pool.h @@ -0,0 +1,128 @@ +#pragma once + +#include <library/cpp/containers/stack_vector/stack_vec.h> + +#include "packet.h" + +namespace NActors { + struct TEvFreeItems : TEventLocal<TEvFreeItems, EventSpaceBegin(TEvents::ES_PRIVATE)> { + static constexpr size_t MaxEvents = 256; + + TList<TTcpPacketOutTask> Items; + std::list<TEventHolder> FreeQueue; + TStackVec<THolder<IEventBase>, MaxEvents> Events; + TStackVec<THolder<TEventSerializedData>, MaxEvents> Buffers; + std::shared_ptr<std::atomic<TAtomicBase>> Counter; + ui64 NumBytes = 0; + + ~TEvFreeItems() { + if (Counter) { + TAtomicBase res = Counter->fetch_sub(NumBytes) - NumBytes; + Y_VERIFY(res >= 0); + } + } + + bool GetInLineForDestruction(const TIntrusivePtr<TInterconnectProxyCommon>& common) { + Y_VERIFY(!Counter); + const auto& counter = common->DestructorQueueSize; + const auto& max = common->MaxDestructorQueueSize; + if (counter && (TAtomicBase)(counter->fetch_add(NumBytes) + NumBytes) > max) { + counter->fetch_sub(NumBytes); + return false; + } + Counter = counter; + return true; + } + }; + + class 
TEventHolderPool { + using TDestroyCallback = std::function<void(THolder<IEventBase>)>; + + static constexpr size_t MaxFreeQueueItems = 32; + static constexpr size_t FreeQueueTrimThreshold = MaxFreeQueueItems * 2; + static constexpr ui64 MaxBytesPerMessage = 10 * 1024 * 1024; + + TIntrusivePtr<TInterconnectProxyCommon> Common; + std::list<TEventHolder> Cache; + THolder<TEvFreeItems> PendingFreeEvent; + TDestroyCallback DestroyCallback; + + public: + TEventHolderPool(TIntrusivePtr<TInterconnectProxyCommon> common, + TDestroyCallback destroyCallback) + : Common(std::move(common)) + , DestroyCallback(std::move(destroyCallback)) + {} + + TEventHolder& Allocate(std::list<TEventHolder>& queue) { + if (Cache.empty()) { + queue.emplace_back(); + } else { + queue.splice(queue.end(), Cache, Cache.begin()); + } + return queue.back(); + } + + void Release(std::list<TEventHolder>& queue) { + for (auto it = queue.begin(); it != queue.end(); ) { + Release(queue, it++); + } + } + + void Release(std::list<TEventHolder>& queue, std::list<TEventHolder>::iterator event) { + bool trim = false; + + // release held event, if any + if (THolder<IEventBase> ev = std::move(event->Event)) { + auto p = GetPendingEvent(); + p->NumBytes += event->EventSerializedSize; + auto& events = p->Events; + events.push_back(std::move(ev)); + trim = trim || events.size() >= TEvFreeItems::MaxEvents || p->NumBytes >= MaxBytesPerMessage; + } + + // release buffer, if any + if (event->Buffer && event->Buffer.RefCount() == 1) { + auto p = GetPendingEvent(); + p->NumBytes += event->EventSerializedSize; + auto& buffers = p->Buffers; + buffers.emplace_back(event->Buffer.Release()); + trim = trim || buffers.size() >= TEvFreeItems::MaxEvents || p->NumBytes >= MaxBytesPerMessage; + } + + // free event and trim the cache if its size is exceeded + event->Clear(); + Cache.splice(Cache.end(), queue, event); + if (Cache.size() >= FreeQueueTrimThreshold) { + auto& freeQueue = GetPendingEvent()->FreeQueue; + auto it = 
Cache.begin(); + std::advance(it, Cache.size() - MaxFreeQueueItems); + freeQueue.splice(freeQueue.end(), Cache, Cache.begin(), it); + trim = true; + } + + // release items if we have hit the limit + if (trim) { + Trim(); + } + } + + void Trim() { + if (auto ev = std::move(PendingFreeEvent); ev && ev->GetInLineForDestruction(Common)) { + DestroyCallback(std::move(ev)); + } + + // ensure it is dropped + PendingFreeEvent.Reset(); + } + + private: + TEvFreeItems* GetPendingEvent() { + if (!PendingFreeEvent) { + PendingFreeEvent.Reset(new TEvFreeItems); + } + return PendingFreeEvent.Get(); + } + }; + +} diff --git a/library/cpp/actors/interconnect/events_local.h b/library/cpp/actors/interconnect/events_local.h new file mode 100644 index 0000000000..8a46ffd535 --- /dev/null +++ b/library/cpp/actors/interconnect/events_local.h @@ -0,0 +1,403 @@ +#pragma once + +#include <library/cpp/actors/core/events.h> +#include <library/cpp/actors/core/event_local.h> +#include <library/cpp/actors/protos/interconnect.pb.h> +#include <util/generic/deque.h> +#include <util/network/address.h> + +#include "interconnect_stream.h" +#include "packet.h" +#include "types.h" + +namespace NActors { + struct TProgramInfo { + ui64 PID = 0; + ui64 StartTime = 0; + ui64 Serial = 0; + }; + + enum class ENetwork : ui32 { + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // local messages + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + Start = EventSpaceBegin(TEvents::ES_INTERCONNECT_TCP), + + SocketReadyRead = Start, + SocketReadyWrite, + SocketError, + Connect, + Disconnect, + IncomingConnection, + HandshakeAsk, + HandshakeAck, + HandshakeNak, + HandshakeDone, + HandshakeFail, + Kick, + Flush, + NodeInfo, + BunchOfEventsToDestroy, + HandshakeRequest, + HandshakeReplyOK, + HandshakeReplyError, + ResolveAddress, + AddressInfo, + ResolveError, + 
HTTPStreamStatus, + HTTPSendContent, + ConnectProtocolWakeup, + HTTPProtocolRetry, + EvPollerRegister, + EvPollerRegisterResult, + EvPollerReady, + EvUpdateFromInputSession, + EvConfirmUpdate, + EvSessionBufferSizeRequest, + EvSessionBufferSizeResponse, + EvProcessPingRequest, + EvGetSecureSocket, + EvSecureSocket, + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // nonlocal messages; their indices must be preserved in order to work properly while doing rolling update + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + // interconnect load test message + EvLoadMessage = Start + 256, + }; + + struct TEvSocketReadyRead: public TEventLocal<TEvSocketReadyRead, ui32(ENetwork::SocketReadyRead)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvSocketReadyRead, "Network: TEvSocketReadyRead") + }; + + struct TEvSocketReadyWrite: public TEventLocal<TEvSocketReadyWrite, ui32(ENetwork::SocketReadyWrite)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvSocketReadyWrite, "Network: TEvSocketReadyWrite") + }; + + struct TEvSocketError: public TEventLocal<TEvSocketError, ui32(ENetwork::SocketError)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvSocketError, ::strerror(Error)) + TString GetReason() const { + return ::strerror(Error); + } + const int Error; + TIntrusivePtr<NInterconnect::TStreamSocket> Socket; + + TEvSocketError(int error, TIntrusivePtr<NInterconnect::TStreamSocket> sock) + : Error(error) + , Socket(std::move(sock)) + { + } + }; + + struct TEvSocketConnect: public TEventLocal<TEvSocketConnect, ui32(ENetwork::Connect)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvSocketConnect, "Network: TEvSocketConnect") + }; + + struct TEvSocketDisconnect: public TEventLocal<TEvSocketDisconnect, ui32(ENetwork::Disconnect)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvSocketDisconnect, "Network: TEvSocketDisconnect") + TDisconnectReason Reason; + + TEvSocketDisconnect(TDisconnectReason reason) + : 
Reason(std::move(reason)) + { + } + }; + + struct TEvHandshakeAsk: public TEventLocal<TEvHandshakeAsk, ui32(ENetwork::HandshakeAsk)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeAsk, "Network: TEvHandshakeAsk") + TEvHandshakeAsk(const TActorId& self, + const TActorId& peer, + ui64 counter) + : Self(self) + , Peer(peer) + , Counter(counter) + { + } + const TActorId Self; + const TActorId Peer; + const ui64 Counter; + }; + + struct TEvHandshakeAck: public TEventLocal<TEvHandshakeAck, ui32(ENetwork::HandshakeAck)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeAck, "Network: TEvHandshakeAck") + + TEvHandshakeAck(const TActorId& self, ui64 nextPacket, TSessionParams params) + : Self(self) + , NextPacket(nextPacket) + , Params(std::move(params)) + {} + + const TActorId Self; + const ui64 NextPacket; + const TSessionParams Params; + }; + + struct TEvHandshakeNak : TEventLocal<TEvHandshakeNak, ui32(ENetwork::HandshakeNak)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvSocketReadyRead, "Network: TEvHandshakeNak") + }; + + struct TEvHandshakeRequest + : public TEventLocal<TEvHandshakeRequest, + ui32(ENetwork::HandshakeRequest)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeRequest, + "Network: TEvHandshakeRequest") + + NActorsInterconnect::THandshakeRequest Record; + }; + + struct TEvHandshakeReplyOK + : public TEventLocal<TEvHandshakeReplyOK, + ui32(ENetwork::HandshakeReplyOK)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeReplyOK, + "Network: TEvHandshakeReplyOK") + + NActorsInterconnect::THandshakeReply Record; + }; + + struct TEvHandshakeReplyError + : public TEventLocal<TEvHandshakeReplyError, + ui32(ENetwork::HandshakeReplyError)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeReplyError, + "Network: TEvHandshakeReplyError") + + TEvHandshakeReplyError(TString error) { + Record.SetErrorExplaination(error); + } + + NActorsInterconnect::THandshakeReply Record; + }; + + struct TEvIncomingConnection: public TEventLocal<TEvIncomingConnection, ui32(ENetwork::IncomingConnection)> { + 
DEFINE_SIMPLE_LOCAL_EVENT(TEvIncomingConnection, "Network: TEvIncomingConnection") + TIntrusivePtr<NInterconnect::TStreamSocket> Socket; + NInterconnect::TAddress Address; + + TEvIncomingConnection(TIntrusivePtr<NInterconnect::TStreamSocket> socket, NInterconnect::TAddress address) + : Socket(std::move(socket)) + , Address(std::move(address)) + {} + }; + + struct TEvHandshakeDone: public TEventLocal<TEvHandshakeDone, ui32(ENetwork::HandshakeDone)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeDone, "Network: TEvHandshakeDone") + + TEvHandshakeDone( + TIntrusivePtr<NInterconnect::TStreamSocket> socket, + const TActorId& peer, + const TActorId& self, + ui64 nextPacket, + TAutoPtr<TProgramInfo>&& programInfo, + TSessionParams params) + : Socket(std::move(socket)) + , Peer(peer) + , Self(self) + , NextPacket(nextPacket) + , ProgramInfo(std::move(programInfo)) + , Params(std::move(params)) + { + } + + TIntrusivePtr<NInterconnect::TStreamSocket> Socket; + const TActorId Peer; + const TActorId Self; + const ui64 NextPacket; + TAutoPtr<TProgramInfo> ProgramInfo; + const TSessionParams Params; + }; + + struct TEvHandshakeFail: public TEventLocal<TEvHandshakeFail, ui32(ENetwork::HandshakeFail)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeFail, "Network: TEvHandshakeFail") + + enum EnumHandshakeFail { + HANDSHAKE_FAIL_TRANSIENT, + HANDSHAKE_FAIL_PERMANENT, + HANDSHAKE_FAIL_SESSION_MISMATCH, + }; + + TEvHandshakeFail(EnumHandshakeFail temporary, TString explanation) + : Temporary(temporary) + , Explanation(std::move(explanation)) + { + } + + const EnumHandshakeFail Temporary; + const TString Explanation; + }; + + struct TEvKick: public TEventLocal<TEvKick, ui32(ENetwork::Kick)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvKick, "Network: TEvKick") + }; + + struct TEvFlush: public TEventLocal<TEvFlush, ui32(ENetwork::Flush)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvFlush, "Network: TEvFlush") + }; + + struct TEvLocalNodeInfo + : public TEventLocal<TEvLocalNodeInfo, ui32(ENetwork::NodeInfo)> { + 
DEFINE_SIMPLE_LOCAL_EVENT(TEvLocalNodeInfo, "Network: TEvLocalNodeInfo") + + ui32 NodeId; + NAddr::IRemoteAddrPtr Address; + }; + + struct TEvBunchOfEventsToDestroy : TEventLocal<TEvBunchOfEventsToDestroy, ui32(ENetwork::BunchOfEventsToDestroy)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvBunchOfEventsToDestroy, + "Network: TEvBunchOfEventsToDestroy") + + TEvBunchOfEventsToDestroy(TDeque<TAutoPtr<IEventBase>> events) + : Events(std::move(events)) + { + } + + TDeque<TAutoPtr<IEventBase>> Events; + }; + + struct TEvResolveAddress + : public TEventLocal<TEvResolveAddress, ui32(ENetwork::ResolveAddress)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvResolveAddress, "Network: TEvResolveAddress") + + TString Address; + ui16 Port; + }; + + struct TEvAddressInfo + : public TEventLocal<TEvAddressInfo, ui32(ENetwork::AddressInfo)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvAddressInfo, "Network: TEvAddressInfo") + + NAddr::IRemoteAddrPtr Address; + }; + + struct TEvResolveError + : public TEventLocal<TEvResolveError, ui32(ENetwork::ResolveError)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvResolveError, "Network: TEvResolveError") + + TString Explain; + }; + + struct TEvHTTPStreamStatus + : public TEventLocal<TEvHTTPStreamStatus, ui32(ENetwork::HTTPStreamStatus)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvHTTPStreamStatus, + "Network: TEvHTTPStreamStatus") + enum EStatus { + READY, + COMPLETE, + ERROR, + }; + + EStatus Status; + TString Error; + TString HttpHeaders; + }; + + struct TEvHTTPSendContent + : public TEventLocal<TEvHTTPSendContent, ui32(ENetwork::HTTPSendContent)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvHTTPSendContent, "Network: TEvHTTPSendContent") + + const char* Data; + size_t Len; + bool Last; + }; + + struct TEvConnectWakeup + : public TEventLocal<TEvConnectWakeup, + ui32(ENetwork::ConnectProtocolWakeup)> { + DEFINE_SIMPLE_LOCAL_EVENT(TEvConnectWakeup, "Protocols: TEvConnectWakeup") + }; + + struct TEvHTTPProtocolRetry + : public TEventLocal<TEvHTTPProtocolRetry, + ui32(ENetwork::HTTPProtocolRetry)> { + 
DEFINE_SIMPLE_LOCAL_EVENT(TEvHTTPProtocolRetry, + "Protocols: TEvHTTPProtocolRetry") + }; + + struct TEvLoadMessage + : TEventPB<TEvLoadMessage, NActorsInterconnect::TEvLoadMessage, static_cast<ui32>(ENetwork::EvLoadMessage)> { + TEvLoadMessage() = default; + + template <typename TContainer> + TEvLoadMessage(const TContainer& route, const TString& id, const TString* payload) { + for (const TActorId& actorId : route) { + auto* hop = Record.AddHops(); + if (actorId) { + ActorIdToProto(actorId, hop->MutableNextHop()); + } + } + Record.SetId(id); + if (payload) { + Record.SetPayload(*payload); + } + } + + template <typename TContainer> + TEvLoadMessage(const TContainer& route, const TString& id, TRope&& payload) { + for (const TActorId& actorId : route) { + auto* hop = Record.AddHops(); + if (actorId) { + ActorIdToProto(actorId, hop->MutableNextHop()); + } + } + Record.SetId(id); + AddPayload(std::move(payload)); + } + }; + + struct TEvUpdateFromInputSession : TEventLocal<TEvUpdateFromInputSession, static_cast<ui32>(ENetwork::EvUpdateFromInputSession)> { + ui64 ConfirmedByInput; // latest Confirm value from processed input packet + ui64 NumDataBytes; + TDuration Ping; + + TEvUpdateFromInputSession(ui64 confirmedByInput, ui64 numDataBytes, TDuration ping) + : ConfirmedByInput(confirmedByInput) + , NumDataBytes(numDataBytes) + , Ping(ping) + { + } + }; + + struct TEvConfirmUpdate : TEventLocal<TEvConfirmUpdate, static_cast<ui32>(ENetwork::EvConfirmUpdate)> + {}; + + struct TEvSessionBufferSizeRequest : TEventLocal<TEvSessionBufferSizeRequest, static_cast<ui32>(ENetwork::EvSessionBufferSizeRequest)> { + //DEFINE_SIMPLE_LOCAL_EVENT(TEvSessionBufferSizeRequest, "Session: TEvSessionBufferSizeRequest") + DEFINE_SIMPLE_LOCAL_EVENT(TEvSessionBufferSizeRequest, "Network: TEvSessionBufferSizeRequest"); + }; + + struct TEvSessionBufferSizeResponse : TEventLocal<TEvSessionBufferSizeResponse, static_cast<ui32>(ENetwork::EvSessionBufferSizeResponse)> { + 
TEvSessionBufferSizeResponse(const TActorId& sessionId, ui64 outputBufferSize) + : SessionID(sessionId) + , BufferSize(outputBufferSize) + { + } + + TActorId SessionID; + ui64 BufferSize; + }; + + struct TEvProcessPingRequest : TEventLocal<TEvProcessPingRequest, static_cast<ui32>(ENetwork::EvProcessPingRequest)> { + const ui64 Payload; + + TEvProcessPingRequest(ui64 payload) + : Payload(payload) + {} + }; + + struct TEvGetSecureSocket : TEventLocal<TEvGetSecureSocket, (ui32)ENetwork::EvGetSecureSocket> { + TIntrusivePtr<NInterconnect::TStreamSocket> Socket; + + TEvGetSecureSocket(TIntrusivePtr<NInterconnect::TStreamSocket> socket) + : Socket(std::move(socket)) + {} + }; + + struct TEvSecureSocket : TEventLocal<TEvSecureSocket, (ui32)ENetwork::EvSecureSocket> { + TIntrusivePtr<NInterconnect::TSecureSocket> Socket; + + TEvSecureSocket(TIntrusivePtr<NInterconnect::TSecureSocket> socket) + : Socket(std::move(socket)) + {} + }; + +} diff --git a/library/cpp/actors/interconnect/interconnect.h b/library/cpp/actors/interconnect/interconnect.h new file mode 100644 index 0000000000..225a5243fd --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect.h @@ -0,0 +1,179 @@ +#pragma once + +#include <library/cpp/actors/core/actorsystem.h> +#include <library/cpp/actors/core/interconnect.h> +#include <util/generic/map.h> +#include <util/network/address.h> + +namespace NActors { + struct TInterconnectGlobalState: public TThrRefBase { + TString SelfAddress; + ui32 SelfPort; + + TVector<TActorId> GlobalNameservers; // todo: add some info about (like expected reply time) + }; + + struct TInterconnectProxySetup: public TThrRefBase { + // synchronous (session -> proxy) + struct IProxy : TNonCopyable { + virtual ~IProxy() { + } + + virtual void ActivateSession(const TActorContext& ctx) = 0; // session activated + virtual void DetachSession(const TActorContext& ctx) = 0; // session is dead + }; + + // synchronous (proxy -> session) + struct ISession : TNonCopyable { + virtual 
~ISession() { + } + + virtual void DetachSession(const TActorContext& ownerCtx, const TActorContext& sessionCtx) = 0; // kill yourself + virtual void ForwardPacket(TAutoPtr<IEventHandle>& ev, const TActorContext& ownerCtx, const TActorContext& sessionCtx) = 0; // receive packet for forward + virtual void Connect(const TActorContext& ownerCtx, const TActorContext& sessionCtx) = 0; // begin connection + virtual bool ReceiveIncomingSession(TAutoPtr<IEventHandle>& ev, const TActorContext& ownerCtx, const TActorContext& sessionCtx) = 0; // handle incoming session, if returns true - then session is dead and must be recreated with new one + }; + + ui32 DestinationNode; + + TString StaticAddress; // if set - would be used as main destination address + int StaticPort; + + TIntrusivePtr<TInterconnectGlobalState> GlobalState; + + virtual IActor* CreateSession(const TActorId& ownerId, IProxy* owner) = 0; // returned actor is session and would be attached to same mailbox as proxy to allow sync calls + virtual TActorSetupCmd CreateAcceptor() = 0; + }; + + struct TNameserverSetup { + TActorId ServiceID; + + TIntrusivePtr<TInterconnectGlobalState> GlobalState; + }; + + struct TTableNameserverSetup: public TThrRefBase { + struct TNodeInfo { + TString Address; + TString Host; + TString ResolveHost; + ui16 Port; + TNodeLocation Location; + TString& first; + ui16& second; + + TNodeInfo() + : first(Address) + , second(Port) + { + } + + TNodeInfo(const TNodeInfo& ni) : TNodeInfo() { *this = ni; } // a defaulted copy would bind first/second to ni's fields; bind to our own, then copy values + + // for testing purposes only + TNodeInfo(const TString& address, const TString& host, ui16 port) + : TNodeInfo() + { + Address = address; + Host = host; + ResolveHost = host; + Port = port; + } + + TNodeInfo(const TString& address, + const TString& host, + const TString& resolveHost, + ui16 port, + const TNodeLocation& location) + : TNodeInfo() + { + Address = address; + Host = host; + ResolveHost = resolveHost; + Port = port; + Location = location; + } + + // for testing purposes only + TNodeInfo& 
operator=(const std::pair<TString, ui32>& pr) { + Address = pr.first; + Host = pr.first; + ResolveHost = pr.first; + Port = pr.second; + return *this; + } + + TNodeInfo& operator=(const TNodeInfo& ni) { + Address = ni.Address; + Host = ni.Host; + ResolveHost = ni.ResolveHost; + Port = ni.Port; + Location = ni.Location; + return *this; + } + }; + + TMap<ui32, TNodeInfo> StaticNodeTable; + + bool IsEntriesUnique() const; + }; + + struct TNodeRegistrarSetup { + TActorId ServiceID; + + TIntrusivePtr<TInterconnectGlobalState> GlobalState; + }; + + TActorId GetNameserviceActorId(); + + /** + * Const table-lookup based name service + */ + + IActor* CreateNameserverTable( + const TIntrusivePtr<TTableNameserverSetup>& setup, + ui32 poolId = 0); + + /** + * Name service which can be paired with external discovery service. + * Copies information from setup on the start (table may be empty). + * Handles TEvNodesInfo to change list of known nodes. + * + * If PendingPeriod is not zero, wait for unknown nodeId + */ + + IActor* CreateDynamicNameserver( + const TIntrusivePtr<TTableNameserverSetup>& setup, + const TDuration& pendingPeriod = TDuration::Zero(), + ui32 poolId = 0); + + /** + * Creates an actor that resolves host/port and replies with either: + * + * - TEvLocalNodeInfo on success + * - TEvResolveError on errors + * + * Optional defaultAddress may be used as fallback. 
+ */ + IActor* CreateResolveActor( + const TString& host, ui16 port, ui32 nodeId, const TString& defaultAddress, + const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline); + + inline IActor* CreateResolveActor( + ui32 nodeId, const TTableNameserverSetup::TNodeInfo& nodeInfo, + const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline) + { + return CreateResolveActor(nodeInfo.ResolveHost, nodeInfo.Port, nodeId, nodeInfo.Address, + replyTo, replyFrom, deadline); + } + + /** + * Creates an actor that resolves host/port and replies with either: + * + * - TEvAddressInfo on success + * - TEvResolveError on errors + */ + IActor* CreateResolveActor( + const TString& host, ui16 port, + const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline); + +} diff --git a/library/cpp/actors/interconnect/interconnect_address.cpp b/library/cpp/actors/interconnect/interconnect_address.cpp new file mode 100644 index 0000000000..8f474f5a39 --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_address.cpp @@ -0,0 +1,94 @@ +#include "interconnect_address.h" + +#include <util/string/cast.h> +#include <util/system/file.h> + +#if defined(_linux_) +#include <sys/un.h> +#include <sys/stat.h> +#endif + +namespace NInterconnect { + TAddress::TAddress() { + memset(&Addr, 0, sizeof(Addr)); + } + + TAddress::TAddress(NAddr::IRemoteAddr& addr) { + socklen_t len = addr.Len(); + Y_VERIFY(len <= sizeof(Addr)); + memcpy(&Addr.Generic, addr.Addr(), len); + } + + int TAddress::GetFamily() const { + return Addr.Generic.sa_family; + } + + socklen_t TAddress::Size() const { + switch (Addr.Generic.sa_family) { + case AF_INET6: + return sizeof(sockaddr_in6); + case AF_INET: + return sizeof(sockaddr_in); + default: + return 0; + } + } + + sockaddr* TAddress::SockAddr() { + return &Addr.Generic; + } + + const sockaddr* TAddress::SockAddr() const { + return &Addr.Generic; + } + + ui16 TAddress::GetPort() const { + switch (Addr.Generic.sa_family) { + case AF_INET6: + 
return ntohs(Addr.Ipv6.sin6_port); + case AF_INET: + return ntohs(Addr.Ipv4.sin_port); + default: + return 0; + } + } + + TString TAddress::ToString() const { + return GetAddress() + ":" + ::ToString(GetPort()); + } + + TAddress::TAddress(const char* addr, ui16 port) { + memset(&Addr, 0, sizeof(Addr)); + if (inet_pton(Addr.Ipv6.sin6_family = AF_INET6, addr, &Addr.Ipv6.sin6_addr)) { + Addr.Ipv6.sin6_port = htons(port); + } else if (inet_pton(Addr.Ipv4.sin_family = AF_INET, addr, &Addr.Ipv4.sin_addr)) { + Addr.Ipv4.sin_port = htons(port); + } + } + + TAddress::TAddress(const TString& addr, ui16 port) + : TAddress(addr.data(), port) + {} + + TString TAddress::GetAddress() const { + const void *src; + socklen_t size; + + switch (Addr.Generic.sa_family) { + case AF_INET6: + std::tie(src, size) = std::make_tuple(&Addr.Ipv6.sin6_addr, INET6_ADDRSTRLEN); + break; + + case AF_INET: + std::tie(src, size) = std::make_tuple(&Addr.Ipv4.sin_addr, INET_ADDRSTRLEN); + break; + + default: + return TString(); + } + + char *buffer = static_cast<char*>(alloca(size)); + const char *p = inet_ntop(Addr.Generic.sa_family, const_cast<void*>(src), buffer, size); + return p ? 
TString(p) : TString(); + } +} diff --git a/library/cpp/actors/interconnect/interconnect_address.h b/library/cpp/actors/interconnect/interconnect_address.h new file mode 100644 index 0000000000..e9e0faec81 --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_address.h @@ -0,0 +1,29 @@ +#pragma once + +#include <util/system/defaults.h> +#include <util/network/init.h> +#include <util/network/address.h> +#include <util/generic/string.h> + +namespace NInterconnect { + class TAddress { + union { + sockaddr Generic; + sockaddr_in Ipv4; + sockaddr_in6 Ipv6; + } Addr; + + public: + TAddress(); + TAddress(const char* addr, ui16 port); + TAddress(const TString& addr, ui16 port); + TAddress(NAddr::IRemoteAddr& addr); + int GetFamily() const; + socklen_t Size() const; + ::sockaddr* SockAddr(); + const ::sockaddr* SockAddr() const; + ui16 GetPort() const; + TString GetAddress() const; + TString ToString() const; + }; +} diff --git a/library/cpp/actors/interconnect/interconnect_channel.cpp b/library/cpp/actors/interconnect/interconnect_channel.cpp new file mode 100644 index 0000000000..a66ba2a154 --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_channel.cpp @@ -0,0 +1,176 @@ +#include "interconnect_channel.h" + +#include <library/cpp/actors/core/events.h> +#include <library/cpp/actors/core/executor_thread.h> +#include <library/cpp/actors/core/log.h> +#include <library/cpp/actors/core/probes.h> +#include <library/cpp/actors/protos/services_common.pb.h> +#include <library/cpp/actors/prof/tag.h> +#include <library/cpp/digest/crc32c/crc32c.h> + +LWTRACE_USING(ACTORLIB_PROVIDER); + +namespace NActors { + DECLARE_WILSON_EVENT(EventSentToSocket); + DECLARE_WILSON_EVENT(EventReceivedFromSocket); + + bool TEventOutputChannel::FeedDescriptor(TTcpPacketOutTask& task, TEventHolder& event, ui64 *weightConsumed) { + const size_t amount = sizeof(TChannelPart) + sizeof(TEventDescr); + if (task.GetVirtualFreeAmount() < amount) { + return false; + } + + 
NWilson::TTraceId traceId(event.Descr.TraceId); +// if (ctx) { +// WILSON_TRACE(*ctx, &traceId, EventSentToSocket); +// } + traceId.Serialize(&event.Descr.TraceId); + LWTRACK(SerializeToPacketEnd, event.Orbit, PeerNodeId, ChannelId, OutputQueueSize, task.GetDataSize()); + task.Orbit.Take(event.Orbit); + + event.Descr.Flags = (event.Descr.Flags & ~IEventHandle::FlagForwardOnNondelivery) | + (ExtendedFormat ? IEventHandle::FlagExtendedFormat : 0); + + TChannelPart *part = static_cast<TChannelPart*>(task.GetFreeArea()); + part->Channel = ChannelId | TChannelPart::LastPartFlag; + part->Size = sizeof(TEventDescr); + memcpy(part + 1, &event.Descr, sizeof(TEventDescr)); + task.AppendBuf(part, amount); + *weightConsumed += amount; + OutputQueueSize -= part->Size; + Metrics->UpdateOutputChannelEvents(ChannelId); + + return true; + } + + void TEventOutputChannel::DropConfirmed(ui64 confirm) { + LOG_DEBUG_IC_SESSION("ICOCH98", "Dropping confirmed messages"); + for (auto it = NotYetConfirmed.begin(); it != NotYetConfirmed.end() && it->Serial <= confirm; ) { + Pool.Release(NotYetConfirmed, it++); + } + } + + bool TEventOutputChannel::FeedBuf(TTcpPacketOutTask& task, ui64 serial, ui64 *weightConsumed) { + for (;;) { + Y_VERIFY(!Queue.empty()); + TEventHolder& event = Queue.front(); + + switch (State) { + case EState::INITIAL: + event.InitChecksum(); + LWTRACK(SerializeToPacketBegin, event.Orbit, PeerNodeId, ChannelId, OutputQueueSize); + if (event.Event) { + State = EState::CHUNKER; + IEventBase *base = event.Event.Get(); + Chunker.SetSerializingEvent(base); + ExtendedFormat = base->IsExtendedFormat(); + } else if (event.Buffer) { + State = EState::BUFFER; + Iter = event.Buffer->GetBeginIter(); + ExtendedFormat = event.Buffer->IsExtendedFormat(); + } else { + State = EState::DESCRIPTOR; + ExtendedFormat = false; + } + break; + + case EState::CHUNKER: + case EState::BUFFER: { + size_t maxBytes = task.GetVirtualFreeAmount(); + if (maxBytes <= sizeof(TChannelPart)) { + return 
false; + } + + TChannelPart *part = static_cast<TChannelPart*>(task.GetFreeArea()); + part->Channel = ChannelId; + part->Size = 0; + task.AppendBuf(part, sizeof(TChannelPart)); + maxBytes -= sizeof(TChannelPart); + Y_VERIFY(maxBytes); + + auto addChunk = [&](const void *data, size_t len) { + event.UpdateChecksum(Params, data, len); + task.AppendBuf(data, len); + part->Size += len; + Y_VERIFY_DEBUG(maxBytes >= len); + maxBytes -= len; + + event.EventActuallySerialized += len; + if (event.EventActuallySerialized > MaxSerializedEventSize) { + throw TExSerializedEventTooLarge(event.Descr.Type); + } + }; + + bool complete = false; + if (State == EState::CHUNKER) { + Y_VERIFY_DEBUG(task.GetFreeArea() == part + 1); + while (!complete && maxBytes) { + const auto [first, last] = Chunker.FeedBuf(task.GetFreeArea(), maxBytes); + for (auto p = first; p != last; ++p) { + addChunk(p->first, p->second); + } + complete = Chunker.IsComplete(); + } + Y_VERIFY(!complete || Chunker.IsSuccessfull()); + Y_VERIFY_DEBUG(complete || !maxBytes); + } else { // BUFFER + while (const size_t numb = Min(maxBytes, Iter.ContiguousSize())) { + const char *obuf = Iter.ContiguousData(); + addChunk(obuf, numb); + Iter += numb; + } + complete = !Iter.Valid(); + } + if (complete) { + Y_VERIFY(event.EventActuallySerialized == event.EventSerializedSize, + "EventActuallySerialized# %" PRIu32 " EventSerializedSize# %" PRIu32 " Type# 0x%08" PRIx32, + event.EventActuallySerialized, event.EventSerializedSize, event.Descr.Type); + } + + if (!part->Size) { + task.Undo(sizeof(TChannelPart)); + } else { + *weightConsumed += sizeof(TChannelPart) + part->Size; + OutputQueueSize -= part->Size; + } + if (complete) { + State = EState::DESCRIPTOR; + } + break; + } + + case EState::DESCRIPTOR: + if (!FeedDescriptor(task, event, weightConsumed)) { + return false; + } + event.Serial = serial; + NotYetConfirmed.splice(NotYetConfirmed.end(), Queue, Queue.begin()); // move event to not-yet-confirmed queue + State = 
EState::INITIAL; + return true; // we have processed whole event, signal to the caller + } + } + } + + void TEventOutputChannel::NotifyUndelivered() { + LOG_DEBUG_IC_SESSION("ICOCH89", "Notyfying about Undelivered messages! NotYetConfirmed size: %zu, Queue size: %zu", NotYetConfirmed.size(), Queue.size()); + if (State == EState::CHUNKER) { + Y_VERIFY(!Chunker.IsComplete()); // chunk must have an event being serialized + Y_VERIFY(!Queue.empty()); // this event must be the first event in queue + TEventHolder& event = Queue.front(); + Y_VERIFY(Chunker.GetCurrentEvent() == event.Event.Get()); // ensure the event is valid + Chunker.Abort(); // stop serializing current event + Y_VERIFY(Chunker.IsComplete()); + } + for (auto& item : NotYetConfirmed) { + if (item.Descr.Flags & IEventHandle::FlagGenerateUnsureUndelivered) { // notify only when unsure flag is set + item.ForwardOnNondelivery(true); + } + } + Pool.Release(NotYetConfirmed); + for (auto& item : Queue) { + item.ForwardOnNondelivery(false); + } + Pool.Release(Queue); + } + +} diff --git a/library/cpp/actors/interconnect/interconnect_channel.h b/library/cpp/actors/interconnect/interconnect_channel.h new file mode 100644 index 0000000000..e4a0ae3cda --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_channel.h @@ -0,0 +1,127 @@ +#pragma once + +#include <library/cpp/monlib/dynamic_counters/counters.h> +#include <library/cpp/actors/core/actorsystem.h> +#include <library/cpp/actors/core/event_load.h> +#include <library/cpp/actors/util/rope.h> +#include <util/generic/deque.h> +#include <util/generic/vector.h> +#include <util/generic/map.h> +#include <util/stream/walk.h> +#include <library/cpp/actors/wilson/wilson_event.h> +#include <library/cpp/actors/helpers/mon_histogram_helper.h> + +#include "interconnect_common.h" +#include "interconnect_counters.h" +#include "packet.h" +#include "event_holder_pool.h" + +namespace NActors { +#pragma pack(push, 1) + struct TChannelPart { + ui16 Channel; + ui16 Size; + + 
static constexpr ui16 LastPartFlag = ui16(1) << 15; + + TString ToString() const { + return TStringBuilder() << "{Channel# " << (Channel & ~LastPartFlag) + << " LastPartFlag# " << ((Channel & LastPartFlag) ? "true" : "false") + << " Size# " << Size << "}"; + } + }; +#pragma pack(pop) + + struct TExSerializedEventTooLarge : std::exception { + const ui32 Type; + + TExSerializedEventTooLarge(ui32 type) + : Type(type) + {} + }; + + class TEventOutputChannel : public TInterconnectLoggingBase { + public: + TEventOutputChannel(TEventHolderPool& pool, ui16 id, ui32 peerNodeId, ui32 maxSerializedEventSize, + std::shared_ptr<IInterconnectMetrics> metrics, TSessionParams params) + : TInterconnectLoggingBase(Sprintf("OutputChannel %" PRIu16 " [node %" PRIu32 "]", id, peerNodeId)) + , Pool(pool) + , PeerNodeId(peerNodeId) + , ChannelId(id) + , Metrics(std::move(metrics)) + , Params(std::move(params)) + , MaxSerializedEventSize(maxSerializedEventSize) + {} + + ~TEventOutputChannel() { + } + + std::pair<ui32, TEventHolder*> Push(IEventHandle& ev) { + TEventHolder& event = Pool.Allocate(Queue); + const ui32 bytes = event.Fill(ev) + sizeof(TEventDescr); + OutputQueueSize += bytes; + return std::make_pair(bytes, &event); + } + + void DropConfirmed(ui64 confirm); + + bool FeedBuf(TTcpPacketOutTask& task, ui64 serial, ui64 *weightConsumed); + + bool IsEmpty() const { + return Queue.empty(); + } + + bool IsWorking() const { + return !IsEmpty(); + } + + ui32 GetQueueSize() const { + return (ui32)Queue.size(); + } + + ui64 GetBufferedAmountOfData() const { + return OutputQueueSize; + } + + void NotifyUndelivered(); + + TEventHolderPool& Pool; + const ui32 PeerNodeId; + const ui16 ChannelId; + std::shared_ptr<IInterconnectMetrics> Metrics; + const TSessionParams Params; + const ui32 MaxSerializedEventSize; + ui64 UnaccountedTraffic = 0; + ui64 EqualizeCounterOnPause = 0; + ui64 WeightConsumedOnPause = 0; + + enum class EState { + INITIAL, + CHUNKER, + BUFFER, + DESCRIPTOR, + }; + EState 
State = EState::INITIAL; + + static constexpr ui16 MinimumFreeSpace = sizeof(TChannelPart) + sizeof(TEventDescr); + + protected: + ui64 OutputQueueSize = 0; + + std::list<TEventHolder> Queue; + std::list<TEventHolder> NotYetConfirmed; + TRope::TConstIterator Iter; + TCoroutineChunkSerializer Chunker; + bool ExtendedFormat = false; + + bool FeedDescriptor(TTcpPacketOutTask& task, TEventHolder& event, ui64 *weightConsumed); + + void AccountTraffic() { + if (const ui64 amount = std::exchange(UnaccountedTraffic, 0)) { + Metrics->UpdateOutputChannelTraffic(ChannelId, amount); + } + } + + friend class TInterconnectSessionTCP; + }; +} diff --git a/library/cpp/actors/interconnect/interconnect_common.h b/library/cpp/actors/interconnect/interconnect_common.h new file mode 100644 index 0000000000..285709a00c --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_common.h @@ -0,0 +1,106 @@ +#pragma once + +#include <library/cpp/actors/core/actorid.h> +#include <library/cpp/actors/core/actorsystem.h> +#include <library/cpp/actors/util/datetime.h> +#include <library/cpp/monlib/dynamic_counters/counters.h> +#include <library/cpp/monlib/metrics/metric_registry.h> +#include <util/generic/map.h> +#include <util/generic/set.h> +#include <util/system/datetime.h> + +#include "poller_tcp.h" +#include "logging.h" +#include "event_filter.h" + +#include <atomic> + +namespace NActors { + enum class EEncryptionMode { + DISABLED, // no encryption is required at all + OPTIONAL, // encryption is enabled when supported by both peers + REQUIRED, // encryption is mandatory + }; + + struct TInterconnectSettings { + TDuration Handshake; + TDuration DeadPeer; + TDuration CloseOnIdle; + ui32 SendBufferDieLimitInMB = 0; + ui64 OutputBuffersTotalSizeLimitInMB = 0; + ui32 TotalInflightAmountOfData = 0; + bool MergePerPeerCounters = false; + bool MergePerDataCenterCounters = false; + ui32 TCPSocketBufferSize = 0; + TDuration PingPeriod = TDuration::Seconds(3); + TDuration ForceConfirmPeriod = 
TDuration::Seconds(1); + TDuration LostConnection; + TDuration BatchPeriod; + bool BindOnAllAddresses = true; + EEncryptionMode EncryptionMode = EEncryptionMode::DISABLED; + bool TlsAuthOnly = false; + TString Certificate; // certificate data in PEM format + TString PrivateKey; // private key for the certificate in PEM format + TString CaFilePath; // path to certificate authority file + TString CipherList; // encryption algorithms + TDuration MessagePendingTimeout = TDuration::Seconds(1); // timeout for which messages are queued while in PendingConnection state + ui64 MessagePendingSize = Max<ui64>(); // size of the queue + ui32 MaxSerializedEventSize = NActors::EventMaxByteSize; + + ui32 GetSendBufferSize() const { + ui32 res = 512 * 1024; // 512 kb is the default value for send buffer + if (TCPSocketBufferSize) { + res = TCPSocketBufferSize; + } + return res; + } + }; + + struct TChannelSettings { + ui16 Weight; + }; + + typedef TMap<ui16, TChannelSettings> TChannelsConfig; + + using TRegisterMonPageCallback = std::function<void(const TString& path, const TString& title, + TActorSystem* actorSystem, const TActorId& actorId)>; + + using TInitWhiteboardCallback = std::function<void(ui16 icPort, TActorSystem* actorSystem)>; + + using TUpdateWhiteboardCallback = std::function<void(const TString& peer, bool connected, bool green, bool yellow, + bool orange, bool red, TActorSystem* actorSystem)>; + + struct TInterconnectProxyCommon : TAtomicRefCount<TInterconnectProxyCommon> { + TActorId NameserviceId; + NMonitoring::TDynamicCounterPtr MonCounters; + std::shared_ptr<NMonitoring::IMetricRegistry> Metrics; + TChannelsConfig ChannelsConfig; + TInterconnectSettings Settings; + TRegisterMonPageCallback RegisterMonPage; + TActorId DestructorId; + std::shared_ptr<std::atomic<TAtomicBase>> DestructorQueueSize; + TAtomicBase MaxDestructorQueueSize = 1024 * 1024 * 1024; + TString ClusterUUID; + TVector<TString> AcceptUUID; + ui64 StartTime = GetCycleCountFast(); + TString 
TechnicalSelfHostName; + TInitWhiteboardCallback InitWhiteboard; + TUpdateWhiteboardCallback UpdateWhiteboard; + ui32 HandshakeBallastSize = 0; + TAtomic StartedSessionKiller = 0; + TScopeId LocalScopeId; + std::shared_ptr<TEventFilter> EventFilter; + TString Cookie; // unique random identifier of a node instance (generated randomly at every start) + std::unordered_map<ui16, TString> ChannelName; + + struct TVersionInfo { + TString Tag; // version tag for this node + TSet<TString> AcceptedTags; // we accept all enlisted version tags of peer nodes, but no others; empty = accept all + }; + + TMaybe<TVersionInfo> VersionInfo; + + using TPtr = TIntrusivePtr<TInterconnectProxyCommon>; + }; + +} diff --git a/library/cpp/actors/interconnect/interconnect_counters.cpp b/library/cpp/actors/interconnect/interconnect_counters.cpp new file mode 100644 index 0000000000..ba674f664b --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_counters.cpp @@ -0,0 +1,692 @@ +#include "interconnect_counters.h" + +#include <library/cpp/monlib/metrics/metric_registry.h> +#include <library/cpp/monlib/metrics/metric_sub_registry.h> + +#include <unordered_map> + +namespace NActors { + +namespace { + + class TInterconnectCounters: public IInterconnectMetrics { + public: + struct TOutputChannel { + NMonitoring::TDynamicCounters::TCounterPtr Traffic; + NMonitoring::TDynamicCounters::TCounterPtr Events; + NMonitoring::TDynamicCounters::TCounterPtr OutgoingTraffic; + NMonitoring::TDynamicCounters::TCounterPtr OutgoingEvents; + + TOutputChannel() = default; + + TOutputChannel(const TIntrusivePtr<NMonitoring::TDynamicCounters>& counters, + NMonitoring::TDynamicCounters::TCounterPtr traffic, + NMonitoring::TDynamicCounters::TCounterPtr events) + : Traffic(std::move(traffic)) + , Events(std::move(events)) + , OutgoingTraffic(counters->GetCounter("OutgoingTraffic", true)) + , OutgoingEvents(counters->GetCounter("OutgoingEvents", true)) + {} + + TOutputChannel(const TOutputChannel&) = default; 
+ }; + + struct TInputChannel { + NMonitoring::TDynamicCounters::TCounterPtr Traffic; + NMonitoring::TDynamicCounters::TCounterPtr Events; + NMonitoring::TDynamicCounters::TCounterPtr ScopeErrors; + NMonitoring::TDynamicCounters::TCounterPtr IncomingTraffic; + NMonitoring::TDynamicCounters::TCounterPtr IncomingEvents; + + TInputChannel() = default; + + TInputChannel(const TIntrusivePtr<NMonitoring::TDynamicCounters>& counters, + NMonitoring::TDynamicCounters::TCounterPtr traffic, + NMonitoring::TDynamicCounters::TCounterPtr events, + NMonitoring::TDynamicCounters::TCounterPtr scopeErrors) + : Traffic(std::move(traffic)) + , Events(std::move(events)) + , ScopeErrors(std::move(scopeErrors)) + , IncomingTraffic(counters->GetCounter("IncomingTraffic", true)) + , IncomingEvents(counters->GetCounter("IncomingEvents", true)) + {} + + TInputChannel(const TInputChannel&) = default; + }; + + struct TInputChannels : std::unordered_map<ui16, TInputChannel> { + TInputChannel OtherInputChannel; + + TInputChannels() = default; + + TInputChannels(const TIntrusivePtr<NMonitoring::TDynamicCounters>& counters, + const std::unordered_map<ui16, TString>& names, + NMonitoring::TDynamicCounters::TCounterPtr traffic, + NMonitoring::TDynamicCounters::TCounterPtr events, + NMonitoring::TDynamicCounters::TCounterPtr scopeErrors) + : OtherInputChannel(counters->GetSubgroup("channel", "other"), traffic, events, scopeErrors) + { + for (const auto& [id, name] : names) { + try_emplace(id, counters->GetSubgroup("channel", name), traffic, events, scopeErrors); + } + } + + TInputChannels(const TInputChannels&) = default; + + const TInputChannel& Get(ui16 id) const { + const auto it = find(id); + return it != end() ? 
it->second : OtherInputChannel; + } + }; + + private: + const TInterconnectProxyCommon::TPtr Common; + const bool MergePerDataCenterCounters; + const bool MergePerPeerCounters; + NMonitoring::TDynamicCounterPtr Counters; + NMonitoring::TDynamicCounterPtr PerSessionCounters; + NMonitoring::TDynamicCounterPtr PerDataCenterCounters; + NMonitoring::TDynamicCounterPtr& AdaptiveCounters; + + bool Initialized = false; + + NMonitoring::TDynamicCounters::TCounterPtr Traffic; + NMonitoring::TDynamicCounters::TCounterPtr Events; + NMonitoring::TDynamicCounters::TCounterPtr ScopeErrors; + + public: + TInterconnectCounters(const TInterconnectProxyCommon::TPtr& common) + : Common(common) + , MergePerDataCenterCounters(common->Settings.MergePerDataCenterCounters) + , MergePerPeerCounters(common->Settings.MergePerPeerCounters) + , Counters(common->MonCounters) + , AdaptiveCounters(MergePerDataCenterCounters + ? PerDataCenterCounters : + MergePerPeerCounters ? Counters : PerSessionCounters) + {} + + void AddInflightDataAmount(ui64 value) override { + *InflightDataAmount += value; + } + + void SubInflightDataAmount(ui64 value) override { + *InflightDataAmount -= value; + } + + void AddTotalBytesWritten(ui64 value) override { + *TotalBytesWritten += value; + } + + void SetClockSkewMicrosec(i64 value) override { + *ClockSkewMicrosec = value; + } + + void IncSessionDeaths() override { + ++*SessionDeaths; + } + + void IncHandshakeFails() override { + ++*HandshakeFails; + } + + void SetConnected(ui32 value) override { + *Connected = value; + } + + void IncSubscribersCount() override { + ++*SubscribersCount; + } + + void SubSubscribersCount(ui32 value) override { + *SubscribersCount -= value; + } + + void SubOutputBuffersTotalSize(ui64 value) override { + *OutputBuffersTotalSize -= value; + } + + void AddOutputBuffersTotalSize(ui64 value) override { + *OutputBuffersTotalSize += value; + } + + ui64 GetOutputBuffersTotalSize() const override { + return *OutputBuffersTotalSize; + } + + void 
IncDisconnections() override { + ++*Disconnections; + } + + void IncUsefulWriteWakeups() override { + ++*UsefulWriteWakeups; + } + + void IncSpuriousWriteWakeups() override { + ++*SpuriousWriteWakeups; + } + + void IncSendSyscalls() override { + ++*SendSyscalls; + } + + void IncInflyLimitReach() override { + ++*InflyLimitReach; + } + + void IncUsefulReadWakeups() override { + ++*UsefulReadWakeups; + } + + void IncSpuriousReadWakeups() override { + ++*SpuriousReadWakeups; + } + + // Bumps the counter registered for the given reason; reasons missing from DisconnectByReason are ignored. + void IncDisconnectByReason(const TString& s) override { + if (auto it = DisconnectByReason.find(s); it != DisconnectByReason.end()) { + it->second->Inc(); + } + } + + void AddInputChannelsIncomingTraffic(ui16 channel, ui64 incomingTraffic) override { + auto& ch = InputChannels.Get(channel); + *ch.IncomingTraffic += incomingTraffic; + } + + void IncInputChannelsIncomingEvents(ui16 channel) override { + auto& ch = InputChannels.Get(channel); + ++*ch.IncomingEvents; + } + + void IncRecvSyscalls() override { + ++*RecvSyscalls; + } + + void AddTotalBytesRead(ui64 value) override { + *TotalBytesRead += value; + } + + // Feeds the sample into both the legacy helper histogram and PingTimeHistogram. + void UpdateLegacyPingTimeHist(ui64 value) override { + LegacyPingTimeHist.Add(value); + PingTimeHistogram->Collect(value); + } + + // Adds value to the channel's OutgoingTraffic and Traffic counters, skipping whichever is unset. + void UpdateOutputChannelTraffic(ui16 channel, ui64 value) override { + // resolve the channel once: GetOutputChannel() hands back a copy, so every extra call + // repeats the map lookup and the counter-pointer copies + const auto& ch = GetOutputChannel(channel); + if (ch.OutgoingTraffic) { + *ch.OutgoingTraffic += value; + } + if (ch.Traffic) { + *ch.Traffic += value; + } + } + + // Increments the channel's OutgoingEvents and Events counters, skipping whichever is unset. + void UpdateOutputChannelEvents(ui16 channel) override { + // single lookup for the same reason as in UpdateOutputChannelTraffic + const auto& ch = GetOutputChannel(channel); + if (ch.OutgoingEvents) { + ++*ch.OutgoingEvents; + } + if (ch.Events) { + ++*ch.Events; + } + } + + void SetPeerInfo(const TString& name, const TString& dataCenterId) override { + if (name != std::exchange(HumanFriendlyPeerHostName, name)) { + PerSessionCounters.Reset(); + } + VALGRIND_MAKE_READABLE(&DataCenterId, 
sizeof(DataCenterId)); + if (dataCenterId != std::exchange(DataCenterId, dataCenterId)) { + PerDataCenterCounters.Reset(); + } + + // the per-datacenter group is (re)created only when merging by datacenter is enabled and the group is currently unset + const bool updatePerDataCenter = !PerDataCenterCounters && MergePerDataCenterCounters; + if (updatePerDataCenter) { + PerDataCenterCounters = Counters->GetSubgroup("dataCenterId", *DataCenterId); + } + + // the per-session group hangs off the datacenter group when merging by datacenter, so it follows its refresh + const bool updatePerSession = !PerSessionCounters || updatePerDataCenter; + if (updatePerSession) { + auto base = MergePerDataCenterCounters ? PerDataCenterCounters : Counters; + PerSessionCounters = base->GetSubgroup("peer", *HumanFriendlyPeerHostName); + } + + const bool updateGlobal = !Initialized; + + // AdaptiveCounters is a reference to one of the three groups above; refresh its counters whenever that group was refreshed + const bool updateAdaptive = + &AdaptiveCounters == &Counters ? updateGlobal : + &AdaptiveCounters == &PerSessionCounters ? updatePerSession : + &AdaptiveCounters == &PerDataCenterCounters ? updatePerDataCenter : + false; + + if (updatePerSession) { + Connected = PerSessionCounters->GetCounter("Connected"); + Disconnections = PerSessionCounters->GetCounter("Disconnections", true); + ClockSkewMicrosec = PerSessionCounters->GetCounter("ClockSkewMicrosec"); + Traffic = PerSessionCounters->GetCounter("Traffic", true); + Events = PerSessionCounters->GetCounter("Events", true); + ScopeErrors = PerSessionCounters->GetCounter("ScopeErrors", true); + + for (const auto& [id, name] : Common->ChannelName) { + // insert_or_assign rather than try_emplace: on a repeated refresh an already present entry must pick up + // the freshly fetched Traffic/Events counters, exactly as OtherOutputChannel and InputChannels do below + OutputChannels.insert_or_assign(id, TOutputChannel(Counters->GetSubgroup("channel", name), Traffic, Events)); + } + OtherOutputChannel = TOutputChannel(Counters->GetSubgroup("channel", "other"), Traffic, Events); + + InputChannels = TInputChannels(Counters, Common->ChannelName, Traffic, Events, ScopeErrors); + } + + if (updateAdaptive) { + SessionDeaths = AdaptiveCounters->GetCounter("Session_Deaths", true); + HandshakeFails = AdaptiveCounters->GetCounter("Handshake_Fails", true); + InflyLimitReach = AdaptiveCounters->GetCounter("InflyLimitReach", true); + InflightDataAmount = AdaptiveCounters->GetCounter("Inflight_Data"); + + LegacyPingTimeHist = {}; + 
LegacyPingTimeHist.Init(AdaptiveCounters.Get(), "PingTimeHist", "mks", 125, 18); + + PingTimeHistogram = AdaptiveCounters->GetHistogram( + "PingTimeUs", NMonitoring::ExponentialHistogram(18, 2, 125)); + } + + if (updateGlobal) { + OutputBuffersTotalSize = Counters->GetCounter("OutputBuffersTotalSize"); + SendSyscalls = Counters->GetCounter("SendSyscalls", true); + RecvSyscalls = Counters->GetCounter("RecvSyscalls", true); + SpuriousReadWakeups = Counters->GetCounter("SpuriousReadWakeups", true); + UsefulReadWakeups = Counters->GetCounter("UsefulReadWakeups", true); + SpuriousWriteWakeups = Counters->GetCounter("SpuriousWriteWakeups", true); + UsefulWriteWakeups = Counters->GetCounter("UsefulWriteWakeups", true); + SubscribersCount = AdaptiveCounters->GetCounter("SubscribersCount"); + TotalBytesWritten = Counters->GetCounter("TotalBytesWritten", true); + TotalBytesRead = Counters->GetCounter("TotalBytesRead", true); + + auto disconnectReasonGroup = Counters->GetSubgroup("subsystem", "disconnectReason"); + for (const char *reason : TDisconnectReason::Reasons) { + DisconnectByReason[reason] = disconnectReasonGroup->GetNamedCounter("reason", reason, true); + } + } + + Initialized = true; + } + + TOutputChannel GetOutputChannel(ui16 index) const { + Y_VERIFY(Initialized); + const auto it = OutputChannels.find(index); + return it != OutputChannels.end() ? 
it->second : OtherOutputChannel; + } + + private: + NMonitoring::TDynamicCounters::TCounterPtr SessionDeaths; + NMonitoring::TDynamicCounters::TCounterPtr HandshakeFails; + NMonitoring::TDynamicCounters::TCounterPtr Connected; + NMonitoring::TDynamicCounters::TCounterPtr Disconnections; + NMonitoring::TDynamicCounters::TCounterPtr InflightDataAmount; + NMonitoring::TDynamicCounters::TCounterPtr InflyLimitReach; + NMonitoring::TDynamicCounters::TCounterPtr OutputBuffersTotalSize; + NMonitoring::TDynamicCounters::TCounterPtr QueueUtilization; + NMonitoring::TDynamicCounters::TCounterPtr SubscribersCount; + NMonitoring::TDynamicCounters::TCounterPtr SendSyscalls; + NMonitoring::TDynamicCounters::TCounterPtr ClockSkewMicrosec; + NMonitoring::TDynamicCounters::TCounterPtr RecvSyscalls; + NMonitoring::TDynamicCounters::TCounterPtr UsefulReadWakeups; + NMonitoring::TDynamicCounters::TCounterPtr SpuriousReadWakeups; + NMonitoring::TDynamicCounters::TCounterPtr UsefulWriteWakeups; + NMonitoring::TDynamicCounters::TCounterPtr SpuriousWriteWakeups; + NMon::THistogramCounterHelper LegacyPingTimeHist; + NMonitoring::THistogramPtr PingTimeHistogram; + + std::unordered_map<ui16, TOutputChannel> OutputChannels; + TOutputChannel OtherOutputChannel; + TInputChannels InputChannels; + THashMap<TString, NMonitoring::TDynamicCounters::TCounterPtr> DisconnectByReason; + + NMonitoring::TDynamicCounters::TCounterPtr TotalBytesWritten, TotalBytesRead; + }; + + class TInterconnectMetrics: public IInterconnectMetrics { + public: + struct TOutputChannel { + NMonitoring::IRate* Traffic; + NMonitoring::IRate* Events; + NMonitoring::IRate* OutgoingTraffic; + NMonitoring::IRate* OutgoingEvents; + + TOutputChannel() = default; + + TOutputChannel(const std::shared_ptr<NMonitoring::IMetricRegistry>& metrics, + NMonitoring::IRate* traffic, + NMonitoring::IRate* events) + : Traffic(traffic) + , Events(events) + , OutgoingTraffic(metrics->Rate(NMonitoring::MakeLabels({{"sensor", 
"interconnect.outgoing_traffic"}}))) + , OutgoingEvents(metrics->Rate(NMonitoring::MakeLabels({{"sensor", "interconnect.outgoing_events"}}))) + {} + + TOutputChannel(const TOutputChannel&) = default; + }; + + struct TInputChannel { + NMonitoring::IRate* Traffic; + NMonitoring::IRate* Events; + NMonitoring::IRate* ScopeErrors; + NMonitoring::IRate* IncomingTraffic; + NMonitoring::IRate* IncomingEvents; + + TInputChannel() = default; + + TInputChannel(const std::shared_ptr<NMonitoring::IMetricRegistry>& metrics, + NMonitoring::IRate* traffic, NMonitoring::IRate* events, + NMonitoring::IRate* scopeErrors) + : Traffic(traffic) + , Events(events) + , ScopeErrors(scopeErrors) + , IncomingTraffic(metrics->Rate(NMonitoring::MakeLabels({{"sensor", "interconnect.incoming_traffic"}}))) + , IncomingEvents(metrics->Rate(NMonitoring::MakeLabels({{"sensor", "interconnect.incoming_events"}}))) + {} + + TInputChannel(const TInputChannel&) = default; + }; + + struct TInputChannels : std::unordered_map<ui16, TInputChannel> { + TInputChannel OtherInputChannel; + + TInputChannels() = default; + + TInputChannels(const std::shared_ptr<NMonitoring::IMetricRegistry>& metrics, + const std::unordered_map<ui16, TString>& names, + NMonitoring::IRate* traffic, NMonitoring::IRate* events, + NMonitoring::IRate* scopeErrors) + : OtherInputChannel(std::make_shared<NMonitoring::TMetricSubRegistry>( + NMonitoring::TLabels{{"channel", "other"}}, metrics), traffic, events, scopeErrors) + { + for (const auto& [id, name] : names) { + try_emplace(id, std::make_shared<NMonitoring::TMetricSubRegistry>(NMonitoring::TLabels{{"channel", name}}, metrics), + traffic, events, scopeErrors); + } + } + + TInputChannels(const TInputChannels&) = default; + + const TInputChannel& Get(ui16 id) const { + const auto it = find(id); + return it != end() ? 
it->second : OtherInputChannel; + } + }; + + // AdaptiveMetrics_ is bound to the per-datacenter, global or per-session registry according to the merge settings. + TInterconnectMetrics(const TInterconnectProxyCommon::TPtr& common) + : Common(common) + , MergePerDataCenterMetrics_(common->Settings.MergePerDataCenterCounters) + , MergePerPeerMetrics_(common->Settings.MergePerPeerCounters) + , Metrics_(common->Metrics) + , AdaptiveMetrics_(MergePerDataCenterMetrics_ + ? PerDataCenterMetrics_ : + MergePerPeerMetrics_ ? Metrics_ : PerSessionMetrics_) + {} + + void AddInflightDataAmount(ui64 value) override { + InflightDataAmount_->Add(value); + } + + void SubInflightDataAmount(ui64 value) override { + InflightDataAmount_->Add(-value); + } + + void AddTotalBytesWritten(ui64 value) override { + TotalBytesWritten_->Add(value); + } + + void SetClockSkewMicrosec(i64 value) override { + ClockSkewMicrosec_->Set(value); + } + + void IncSessionDeaths() override { + SessionDeaths_->Inc(); + } + + void IncHandshakeFails() override { + HandshakeFails_->Inc(); + } + + void SetConnected(ui32 value) override { + Connected_->Set(value); + } + + void IncSubscribersCount() override { + SubscribersCount_->Inc(); + } + + // Decreases the subscribers gauge by value. + void SubSubscribersCount(ui32 value) override { + // widen to a signed 64-bit integer BEFORE negating: unary minus on a ui32 wraps to 2^32 - value, + // which would then be passed on as a large positive delta and grow the gauge instead of shrinking it + SubscribersCount_->Add(-static_cast<i64>(value)); + } + + void SubOutputBuffersTotalSize(ui64 value) override { + OutputBuffersTotalSize_->Add(-value); + } + + void AddOutputBuffersTotalSize(ui64 value) override { + OutputBuffersTotalSize_->Add(value); + } + + ui64 GetOutputBuffersTotalSize() const override { + return OutputBuffersTotalSize_->Get(); + } + + void IncDisconnections() override { + Disconnections_->Inc(); + } + + void IncUsefulWriteWakeups() override { + UsefulWriteWakeups_->Inc(); + } + + void IncSpuriousWriteWakeups() override { + SpuriousWriteWakeups_->Inc(); + } + + void IncSendSyscalls() override { + SendSyscalls_->Inc(); + } + + void IncInflyLimitReach() override { + InflyLimitReach_->Inc(); + } + + void IncUsefulReadWakeups() override { + UsefulReadWakeups_->Inc(); + } + + void IncSpuriousReadWakeups() override { + 
SpuriousReadWakeups_->Inc(); + } + + // Bumps the rate registered for the given reason; reasons missing from DisconnectByReason_ are ignored. + void IncDisconnectByReason(const TString& s) override { + if (auto it = DisconnectByReason_.find(s); it != DisconnectByReason_.end()) { + it->second->Inc(); + } + } + + void AddInputChannelsIncomingTraffic(ui16 channel, ui64 incomingTraffic) override { + auto& ch = InputChannels_.Get(channel); + ch.IncomingTraffic->Add(incomingTraffic); + } + + void IncInputChannelsIncomingEvents(ui16 channel) override { + auto& ch = InputChannels_.Get(channel); + ch.IncomingEvents->Inc(); + } + + void IncRecvSyscalls() override { + RecvSyscalls_->Inc(); + } + + void AddTotalBytesRead(ui64 value) override { + TotalBytesRead_->Add(value); + } + + void UpdateLegacyPingTimeHist(ui64 value) override { + PingTimeHistogram_->Record(value); + } + + // Adds value to the channel's OutgoingTraffic and Traffic rates, skipping whichever is unset. + void UpdateOutputChannelTraffic(ui16 channel, ui64 value) override { + // resolve the channel once: GetOutputChannel() hands back a copy, so every extra call repeats the map lookup + const auto& ch = GetOutputChannel(channel); + if (ch.OutgoingTraffic) { + ch.OutgoingTraffic->Add(value); + } + if (ch.Traffic) { + ch.Traffic->Add(value); + } + } + + // Increments the channel's OutgoingEvents and Events rates, skipping whichever is unset. + void UpdateOutputChannelEvents(ui16 channel) override { + // single lookup for the same reason as in UpdateOutputChannelTraffic + const auto& ch = GetOutputChannel(channel); + if (ch.OutgoingEvents) { + ch.OutgoingEvents->Inc(); + } + if (ch.Events) { + ch.Events->Inc(); + } + } + + void SetPeerInfo(const TString& name, const TString& dataCenterId) override { + if (name != std::exchange(HumanFriendlyPeerHostName, name)) { + PerSessionMetrics_.reset(); + } + VALGRIND_MAKE_READABLE(&DataCenterId, sizeof(DataCenterId)); + if (dataCenterId != std::exchange(DataCenterId, dataCenterId)) { + PerDataCenterMetrics_.reset(); + } + + const bool updatePerDataCenter = !PerDataCenterMetrics_ && MergePerDataCenterMetrics_; + if (updatePerDataCenter) { + PerDataCenterMetrics_ = std::make_shared<NMonitoring::TMetricSubRegistry>( + NMonitoring::TLabels{{"datacenter_id", *DataCenterId}}, Metrics_); + } + + const bool updatePerSession = !PerSessionMetrics_ || 
updatePerDataCenter; + if (updatePerSession) { + auto base = MergePerDataCenterMetrics_ ? PerDataCenterMetrics_ : Metrics_; + PerSessionMetrics_ = std::make_shared<NMonitoring::TMetricSubRegistry>( + NMonitoring::TLabels{{"peer", *HumanFriendlyPeerHostName}}, base); + } + + const bool updateGlobal = !Initialized_; + + // AdaptiveMetrics_ is a reference to one of the three registries; refresh its metrics whenever that registry was refreshed + const bool updateAdaptive = + &AdaptiveMetrics_ == &Metrics_ ? updateGlobal : + &AdaptiveMetrics_ == &PerSessionMetrics_ ? updatePerSession : + &AdaptiveMetrics_ == &PerDataCenterMetrics_ ? updatePerDataCenter : + false; + + // helpers creating a metric labelled {"sensor", name} in the given registry; the registry is taken by + // const reference so that a call does not copy the shared_ptr (an atomic ref-count round trip) + auto createRate = [](const std::shared_ptr<NMonitoring::IMetricRegistry>& metrics, TStringBuf name) { + return metrics->Rate(NMonitoring::MakeLabels(NMonitoring::TLabels{{"sensor", name}})); + }; + auto createIntGauge = [](const std::shared_ptr<NMonitoring::IMetricRegistry>& metrics, TStringBuf name) { + return metrics->IntGauge(NMonitoring::MakeLabels(NMonitoring::TLabels{{"sensor", name}})); + }; + + if (updatePerSession) { + Connected_ = createIntGauge(PerSessionMetrics_, "interconnect.connected"); + Disconnections_ = createRate(PerSessionMetrics_, "interconnect.disconnections"); + ClockSkewMicrosec_ = createIntGauge(PerSessionMetrics_, "interconnect.clock_skew_microsec"); + Traffic_ = createRate(PerSessionMetrics_, "interconnect.traffic"); + Events_ = createRate(PerSessionMetrics_, "interconnect.events"); + ScopeErrors_ = createRate(PerSessionMetrics_, "interconnect.scope_errors"); + + for (const auto& [id, name] : Common->ChannelName) { + // insert_or_assign rather than try_emplace: on a repeated refresh an already present entry must pick up + // the freshly created Traffic_/Events_ rates, exactly as OtherOutputChannel_ and InputChannels_ do below + OutputChannels_.insert_or_assign(id, TOutputChannel(std::make_shared<NMonitoring::TMetricSubRegistry>( + NMonitoring::TLabels{{"channel", name}}, Metrics_), Traffic_, Events_)); + } + OtherOutputChannel_ = TOutputChannel(std::make_shared<NMonitoring::TMetricSubRegistry>( + NMonitoring::TLabels{{"channel", "other"}}, Metrics_), Traffic_, Events_); + + InputChannels_ = TInputChannels(Metrics_, Common->ChannelName, Traffic_, Events_, ScopeErrors_); + } + + if (updateAdaptive) { + SessionDeaths_ = 
createRate(AdaptiveMetrics_, "interconnect.session_deaths"); + HandshakeFails_ = createRate(AdaptiveMetrics_, "interconnect.handshake_fails"); + InflyLimitReach_ = createRate(AdaptiveMetrics_, "interconnect.infly_limit_reach"); + InflightDataAmount_ = createRate(AdaptiveMetrics_, "interconnect.inflight_data"); + PingTimeHistogram_ = AdaptiveMetrics_->HistogramRate( + NMonitoring::MakeLabels({{"sensor", "interconnect.ping_time_us"}}), NMonitoring::ExponentialHistogram(18, 2, 125)); + } + + if (updateGlobal) { + OutputBuffersTotalSize_ = createRate(Metrics_, "interconnect.output_buffers_total_size"); + SendSyscalls_ = createRate(Metrics_, "interconnect.send_syscalls"); + RecvSyscalls_ = createRate(Metrics_, "interconnect.recv_syscalls"); + SpuriousReadWakeups_ = createRate(Metrics_, "interconnect.spurious_read_wakeups"); + UsefulReadWakeups_ = createRate(Metrics_, "interconnect.useful_read_wakeups"); + SpuriousWriteWakeups_ = createRate(Metrics_, "interconnect.spurious_write_wakeups"); + UsefulWriteWakeups_ = createRate(Metrics_, "interconnect.useful_write_wakeups"); + SubscribersCount_ = createIntGauge(AdaptiveMetrics_, "interconnect.subscribers_count"); + TotalBytesWritten_ = createRate(Metrics_, "interconnect.total_bytes_written"); + TotalBytesRead_ = createRate(Metrics_, "interconnect.total_bytes_read"); + + for (const char *reason : TDisconnectReason::Reasons) { + DisconnectByReason_[reason] = Metrics_->Rate( + NMonitoring::MakeLabels({ + {"sensor", "interconnect.disconnect_reason"}, + {"reason", reason}, + })); + } + } + + Initialized_ = true; + } + + TOutputChannel GetOutputChannel(ui16 index) const { + Y_VERIFY(Initialized_); + const auto it = OutputChannels_.find(index); + return it != OutputChannels_.end() ? 
it->second : OtherOutputChannel_; + } + + private: + const TInterconnectProxyCommon::TPtr Common; + const bool MergePerDataCenterMetrics_; + const bool MergePerPeerMetrics_; + std::shared_ptr<NMonitoring::IMetricRegistry> Metrics_; + std::shared_ptr<NMonitoring::IMetricRegistry> PerSessionMetrics_; + std::shared_ptr<NMonitoring::IMetricRegistry> PerDataCenterMetrics_; + std::shared_ptr<NMonitoring::IMetricRegistry>& AdaptiveMetrics_; + bool Initialized_ = false; + + NMonitoring::IRate* Traffic_; + + NMonitoring::IRate* Events_; + NMonitoring::IRate* ScopeErrors_; + NMonitoring::IRate* Disconnections_; + NMonitoring::IIntGauge* Connected_; + + NMonitoring::IRate* SessionDeaths_; + NMonitoring::IRate* HandshakeFails_; + NMonitoring::IRate* InflyLimitReach_; + NMonitoring::IRate* InflightDataAmount_; + NMonitoring::IRate* OutputBuffersTotalSize_; + NMonitoring::IIntGauge* SubscribersCount_; + NMonitoring::IRate* SendSyscalls_; + NMonitoring::IRate* RecvSyscalls_; + NMonitoring::IRate* SpuriousWriteWakeups_; + NMonitoring::IRate* UsefulWriteWakeups_; + NMonitoring::IRate* SpuriousReadWakeups_; + NMonitoring::IRate* UsefulReadWakeups_; + NMonitoring::IIntGauge* ClockSkewMicrosec_; + + NMonitoring::IHistogram* PingTimeHistogram_; + + std::unordered_map<ui16, TOutputChannel> OutputChannels_; + TOutputChannel OtherOutputChannel_; + TInputChannels InputChannels_; + + THashMap<TString, NMonitoring::IRate*> DisconnectByReason_; + + NMonitoring::IRate* TotalBytesWritten_; + NMonitoring::IRate* TotalBytesRead_; + }; + +} // namespace + +std::unique_ptr<IInterconnectMetrics> CreateInterconnectCounters(const TInterconnectProxyCommon::TPtr& common) { + return std::make_unique<TInterconnectCounters>(common); +} + +std::unique_ptr<IInterconnectMetrics> CreateInterconnectMetrics(const TInterconnectProxyCommon::TPtr& common) { + return std::make_unique<TInterconnectMetrics>(common); +} + +} // NActors diff --git a/library/cpp/actors/interconnect/interconnect_counters.h 
b/library/cpp/actors/interconnect/interconnect_counters.h new file mode 100644 index 0000000000..e30f03a0bc --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_counters.h @@ -0,0 +1,59 @@ +#pragma once + +#include <library/cpp/actors/helpers/mon_histogram_helper.h> + +#include <util/system/valgrind.h> + +#include "types.h" + +#include "interconnect_common.h" + +#include <memory> +#include <optional> + +namespace NActors { + +class IInterconnectMetrics { +public: + virtual ~IInterconnectMetrics() = default; + + virtual void AddInflightDataAmount(ui64 value) = 0; + virtual void SubInflightDataAmount(ui64 value) = 0; + virtual void AddTotalBytesWritten(ui64 value) = 0; + virtual void SetClockSkewMicrosec(i64 value) = 0; + virtual void IncSessionDeaths() = 0; + virtual void IncHandshakeFails() = 0; + virtual void SetConnected(ui32 value) = 0; + virtual void IncSubscribersCount() = 0; + virtual void SubSubscribersCount(ui32 value) = 0; + virtual void SubOutputBuffersTotalSize(ui64 value) = 0; + virtual void AddOutputBuffersTotalSize(ui64 value) = 0; + virtual ui64 GetOutputBuffersTotalSize() const = 0; + virtual void IncDisconnections() = 0; + virtual void IncUsefulWriteWakeups() = 0; + virtual void IncSpuriousWriteWakeups() = 0; + virtual void IncSendSyscalls() = 0; + virtual void IncInflyLimitReach() = 0; + virtual void IncDisconnectByReason(const TString& s) = 0; + virtual void IncUsefulReadWakeups() = 0; + virtual void IncSpuriousReadWakeups() = 0; + virtual void SetPeerInfo(const TString& name, const TString& dataCenterId) = 0; + virtual void AddInputChannelsIncomingTraffic(ui16 channel, ui64 incomingTraffic) = 0; + virtual void IncInputChannelsIncomingEvents(ui16 channel) = 0; + virtual void IncRecvSyscalls() = 0; + virtual void AddTotalBytesRead(ui64 value) = 0; + virtual void UpdateLegacyPingTimeHist(ui64 value) = 0; + virtual void UpdateOutputChannelTraffic(ui16 channel, ui64 value) = 0; + virtual void UpdateOutputChannelEvents(ui16 channel) = 0; + 
TString GetHumanFriendlyPeerHostName() const { + return HumanFriendlyPeerHostName.value_or(TString()); + } + +protected: + std::optional<TString> DataCenterId; + std::optional<TString> HumanFriendlyPeerHostName; +}; + +std::unique_ptr<IInterconnectMetrics> CreateInterconnectCounters(const NActors::TInterconnectProxyCommon::TPtr& common); +std::unique_ptr<IInterconnectMetrics> CreateInterconnectMetrics(const NActors::TInterconnectProxyCommon::TPtr& common); +} // NActors diff --git a/library/cpp/actors/interconnect/interconnect_handshake.cpp b/library/cpp/actors/interconnect/interconnect_handshake.cpp new file mode 100644 index 0000000000..9ede998d8e --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_handshake.cpp @@ -0,0 +1,995 @@ +#include "interconnect_handshake.h" +#include "interconnect_tcp_proxy.h" + +#include <library/cpp/actors/core/actor_coroutine.h> +#include <library/cpp/actors/core/log.h> +#include <library/cpp/actors/protos/services_common.pb.h> +#include <util/system/getpid.h> + +#include <google/protobuf/text_format.h> + +#include <variant> + +namespace NActors { + static constexpr size_t StackSize = 64 * 1024; // 64k should be enough + + class THandshakeActor + : public TActorCoroImpl + , public TInterconnectLoggingBase + { + struct TExHandshakeFailed : yexception {}; + + static constexpr TDuration ResolveTimeout = TDuration::Seconds(1); + +#pragma pack(push, 1) + + struct TInitialPacket { + struct { + TActorId SelfVirtualId; + TActorId PeerVirtualId; + ui64 NextPacket; + ui64 Version; + } Header; + ui32 Checksum; + + TInitialPacket() = default; + + TInitialPacket(const TActorId& self, const TActorId& peer, ui64 nextPacket, ui64 version) { + Header.SelfVirtualId = self; + Header.PeerVirtualId = peer; + Header.NextPacket = nextPacket; + Header.Version = version; + Checksum = Crc32cExtendMSanCompatible(0, &Header, sizeof(Header)); + } + + bool Check() const { + return Checksum == Crc32cExtendMSanCompatible(0, &Header, sizeof(Header)); + } 
+ + TString ToString() const { + return TStringBuilder() + << "{SelfVirtualId# " << Header.SelfVirtualId.ToString() + << " PeerVirtualId# " << Header.PeerVirtualId.ToString() + << " NextPacket# " << Header.NextPacket + << " Version# " << Header.Version + << "}"; + } + }; + + struct TExHeader { + static constexpr ui32 MaxSize = 1024 * 1024; + + ui32 Checksum; + ui32 Size; + + ui32 CalculateChecksum(const void* data, size_t len) const { + return Crc32cExtendMSanCompatible(Crc32cExtendMSanCompatible(0, &Size, sizeof(Size)), data, len); + } + + void Sign(const void* data, size_t len) { + Checksum = CalculateChecksum(data, len); + } + + bool Check(const void* data, size_t len) const { + return Checksum == CalculateChecksum(data, len); + } + }; + +#pragma pack(pop) + + private: + TInterconnectProxyCommon::TPtr Common; + TActorId SelfVirtualId; + TActorId PeerVirtualId; + ui32 PeerNodeId = 0; + ui64 NextPacketToPeer = 0; + TMaybe<ui64> NextPacketFromPeer; // will be obtained from incoming initial packet + TString PeerHostName; + TString PeerAddr; + TSocketPtr Socket; + TPollerToken::TPtr PollerToken; + TString State; + TString HandshakeKind; + TMaybe<THolder<TProgramInfo>> ProgramInfo; // filled in in case of successful handshake; even if null + TSessionParams Params; + bool ResolveTimedOut = false; + THashMap<ui32, TInstant> LastLogNotice; + const TDuration MuteDuration = TDuration::Seconds(15); + TInstant Deadline; + + public: + static constexpr IActor::EActivityType ActorActivityType() { + return IActor::INTERCONNECT_HANDSHAKE; + } + + THandshakeActor(TInterconnectProxyCommon::TPtr common, const TActorId& self, const TActorId& peer, + ui32 nodeId, ui64 nextPacket, TString peerHostName, TSessionParams params) + : TActorCoroImpl(StackSize, true, true) // allow unhandled poison pills and dtors + , Common(std::move(common)) + , SelfVirtualId(self) + , PeerVirtualId(peer) + , PeerNodeId(nodeId) + , NextPacketToPeer(nextPacket) + , PeerHostName(std::move(peerHostName)) + , 
HandshakeKind("outgoing handshake") + , Params(std::move(params)) + { + Y_VERIFY(SelfVirtualId); + Y_VERIFY(SelfVirtualId.NodeId()); + Y_VERIFY(PeerNodeId); + } + + THandshakeActor(TInterconnectProxyCommon::TPtr common, TSocketPtr socket) + : TActorCoroImpl(StackSize, true, true) // allow unhandled poison pills and dtors + , Common(std::move(common)) + , Socket(std::move(socket)) + , HandshakeKind("incoming handshake") + { + Y_VERIFY(Socket); + PeerAddr = TString::Uninitialized(1024); + if (GetRemoteAddr(*Socket, PeerAddr.Detach(), PeerAddr.size())) { + PeerAddr.resize(strlen(PeerAddr.data())); + } else { + PeerAddr.clear(); + } + } + + void UpdatePrefix() { + SetPrefix(Sprintf("Handshake %s [node %" PRIu32 "]", SelfActorId.ToString().data(), PeerNodeId)); + } + + void Run() override { + UpdatePrefix(); + + // set up overall handshake process timer + TDuration timeout = Common->Settings.Handshake; + if (timeout == TDuration::Zero()) { + timeout = DEFAULT_HANDSHAKE_TIMEOUT; + } + timeout += ResolveTimeout * 2; + Deadline = Now() + timeout; + Schedule(Deadline, new TEvents::TEvWakeup); + + try { + if (Socket) { + PerformIncomingHandshake(); + } else { + PerformOutgoingHandshake(); + } + + // establish encrypted channel, or, in case when encryption is disabled, check if it matches settings + if (ProgramInfo) { + if (Params.Encryption) { + EstablishSecureConnection(); + } else if (Common->Settings.EncryptionMode == EEncryptionMode::REQUIRED && !Params.AuthOnly) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "Peer doesn't support encryption, which is required"); + } + } + } catch (const TExHandshakeFailed&) { + ProgramInfo.Clear(); + } + + if (ProgramInfo) { + LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH04", NLog::PRI_INFO, "handshake succeeded"); + Y_VERIFY(NextPacketFromPeer); + if (PollerToken) { + Y_VERIFY(PollerToken->RefCount() == 1); + PollerToken.Reset(); // ensure we are going to destroy poller token here as we will re-register the socket within other 
actor + } + SendToProxy(MakeHolder<TEvHandshakeDone>(std::move(Socket), PeerVirtualId, SelfVirtualId, + *NextPacketFromPeer, ProgramInfo->Release(), std::move(Params))); + } + + Socket.Reset(); + } + + void EstablishSecureConnection() { + Y_VERIFY(PollerToken && PollerToken->RefCount() == 1); + PollerToken.Reset(); + auto ev = AskProxy<TEvSecureSocket>(MakeHolder<TEvGetSecureSocket>(Socket), "AskProxy(TEvSecureContext)"); + Socket = std::move(ev->Get()->Socket); + RegisterInPoller(); + const ui32 myNodeId = GetActorSystem()->NodeId; + const bool server = myNodeId < PeerNodeId; // keep server/client role permanent to enable easy TLS session resuming + for (;;) { + TString err; + auto& secure = static_cast<NInterconnect::TSecureSocket&>(*Socket); + switch (secure.Establish(server, Params.AuthOnly, err)) { + case NInterconnect::TSecureSocket::EStatus::SUCCESS: + if (Params.AuthOnly) { + Params.Encryption = false; + Params.AuthCN = secure.GetPeerCommonName(); + Y_VERIFY(PollerToken && PollerToken->RefCount() == 1); + PollerToken.Reset(); + Socket = secure.Detach(); + } + return; + + case NInterconnect::TSecureSocket::EStatus::ERROR: + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, err, true); + [[fallthrough]]; + + case NInterconnect::TSecureSocket::EStatus::WANT_READ: + WaitPoller(true, false, "ReadEstablish"); + break; + + case NInterconnect::TSecureSocket::EStatus::WANT_WRITE: + WaitPoller(false, true, "WriteEstablish"); + break; + } + } + } + + void ProcessUnexpectedEvent(TAutoPtr<IEventHandle> ev) override { + switch (const ui32 type = ev->GetTypeRewrite()) { + case TEvents::TSystem::Wakeup: + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT, Sprintf("Handshake timed out, State# %s", State.data()), true); + [[fallthrough]]; + + case ui32(ENetwork::NodeInfo): + case TEvInterconnect::EvNodeAddress: + case ui32(ENetwork::ResolveError): + break; // most likely a race with resolve timeout + + case TEvPollerReady::EventType: + break; + + default: + Y_FAIL("unexpected 
event 0x%08" PRIx32, type); + } + } + + template<typename T> + void SetupVersionTag(T& proto) { + if (Common->VersionInfo) { + proto.SetVersionTag(Common->VersionInfo->Tag); + for (const TString& accepted : Common->VersionInfo->AcceptedTags) { + proto.AddAcceptedVersionTags(accepted); + } + } + } + + template<typename T> + void SetupClusterUUID(T& proto) { + auto *pb = proto.MutableClusterUUIDs(); + pb->SetClusterUUID(Common->ClusterUUID); + for (const TString& uuid : Common->AcceptUUID) { + pb->AddAcceptUUID(uuid); + } + } + + template<typename T, typename TCallback> + void ValidateVersionTag(const T& proto, TCallback&& errorCallback) { + // check if we will accept peer's version tag (if the peer provides one and our accepted list is non-empty) + if (Common->VersionInfo) { + if (!proto.HasVersionTag()) { + LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH06", NLog::PRI_WARN, + "peer did not report VersionTag, accepting by default"); + } else if (!Common->VersionInfo->AcceptedTags.count(proto.GetVersionTag())) { + // we will not accept peer's tag, so check if remote peer would accept our version tag + size_t i; + for (i = 0; i < proto.AcceptedVersionTagsSize() && Common->VersionInfo->Tag != proto.GetAcceptedVersionTags(i); ++i) + {} + if (i == proto.AcceptedVersionTagsSize()) { + // peer will not accept our version tag either -- this is a total failure + TStringStream s("local/peer version tags did not match accepted ones"); + s << " local Tag# " << Common->VersionInfo->Tag << " accepted Tags# ["; + bool first = true; + for (const auto& tag : Common->VersionInfo->AcceptedTags) { + s << (std::exchange(first, false) ? "" : " ") << tag; + } + s << "] peer Tag# " << proto.GetVersionTag() << " accepted Tags# ["; + first = true; + for (const auto& tag : proto.GetAcceptedVersionTags()) { + s << (std::exchange(first, false) ? 
"" : " ") << tag; + } + s << "]"; + errorCallback(s.Str()); + } + } + } + } + + template<typename T, typename TCallback> + void ValidateClusterUUID(const T& proto, TCallback&& errorCallback, const TMaybe<TString>& uuid = {}) { + auto formatList = [](const auto& list) { + TStringStream s; + s << "["; + for (auto it = list.begin(); it != list.end(); ++it) { + if (it != list.begin()) { + s << " "; + } + s << *it; + } + s << "]"; + return s.Str(); + }; + if (!Common->AcceptUUID) { + return; // promiscuous mode -- we accept every other peer + } + if (!proto.HasClusterUUIDs()) { + if (uuid) { + // old-style checking, peer does not support symmetric protocol + bool matching = false; + for (const TString& accepted : Common->AcceptUUID) { + if (*uuid == accepted) { + matching = true; + break; + } + } + if (!matching) { + errorCallback(Sprintf("Peer ClusterUUID# %s mismatch, AcceptUUID# %s", uuid->data(), formatList(Common->AcceptUUID).data())); + } + } + return; // remote side did not fill in this field -- old version, symmetric protocol is not supported + } + + const auto& uuids = proto.GetClusterUUIDs(); + + // check if our UUID matches remote accept list + for (const TString& item : uuids.GetAcceptUUID()) { + if (item == Common->ClusterUUID) { + return; // match + } + } + + // check if remote UUID matches our accept list + const TString& remoteUUID = uuids.GetClusterUUID(); + for (const TString& item : Common->AcceptUUID) { + if (item == remoteUUID) { + return; // match + } + } + + // no match + errorCallback(Sprintf("Peer ClusterUUID# %s mismatch, AcceptUUID# %s", remoteUUID.data(), formatList(Common->AcceptUUID).data())); + } + + void ParsePeerScopeId(const NActorsInterconnect::TScopeId& proto) { + Params.PeerScopeId = {proto.GetX1(), proto.GetX2()}; + } + + void FillInScopeId(NActorsInterconnect::TScopeId& proto) { + const TScopeId& scope = Common->LocalScopeId; + proto.SetX1(scope.first); + proto.SetX2(scope.second); + } + + template<typename T> + void 
ReportProto(const T& protobuf, const char *msg) { + auto formatString = [&] { + google::protobuf::TextFormat::Printer p; + p.SetSingleLineMode(true); + TString s; + p.PrintToString(protobuf, &s); + return s; + }; + LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH07", NLog::PRI_DEBUG, "%s %s", msg, + formatString().data()); + } + + bool CheckPeerCookie(const TString& cookie, TString *error) { + // create a temporary socket to connect to the peer + TSocketPtr tempSocket; + std::swap(tempSocket, Socket); + TPollerToken::TPtr tempPollerToken; + std::swap(tempPollerToken, PollerToken); + + // set up virtual self id to ensure peer will not drop our connection + char buf[12] = {'c', 'o', 'o', 'k', 'i', 'e', ' ', 'c', 'h', 'e', 'c', 'k'}; + SelfVirtualId = TActorId(SelfActorId.NodeId(), TStringBuf(buf, 12)); + + bool success = true; + try { + // issue connection and send initial packet + Connect(false); + SendInitialPacket(); + + // wait for basic response + TInitialPacket response; + ReceiveData(&response, sizeof(response), "ReceiveResponse"); + if (!response.Check()) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT, "Initial packet CRC error"); + } else if (response.Header.Version != INTERCONNECT_PROTOCOL_VERSION) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, Sprintf("Incompatible protocol %" PRIu64, response.Header.Version)); + } + + // issue cookie check request + NActorsInterconnect::THandshakeRequest request; + request.SetProtocol(INTERCONNECT_PROTOCOL_VERSION); + request.SetProgramPID(0); + request.SetProgramStartTime(0); + request.SetSerial(0); + request.SetReceiverNodeId(0); + request.SetSenderActorId(TString()); + request.SetCookie(cookie); + request.SetDoCheckCookie(true); + SendExBlock(request, "SendExBlockDoCheckCookie"); + + // process cookie check reply + NActorsInterconnect::THandshakeReply reply; + if (!reply.ParseFromString(ReceiveExBlock("ReceiveExBlockDoCheckCookie"))) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "Incorrect packet from 
peer"); + } else if (reply.HasCookieCheckResult() && !reply.GetCookieCheckResult()) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "Cookie check error -- possible network problem"); + } + } catch (const TExHandshakeFailed& e) { + *error = e.what(); + success = false; + } + + // restore state + SelfVirtualId = TActorId(); + std::swap(tempSocket, Socket); + std::swap(tempPollerToken, PollerToken); + return success; + } + + void PerformOutgoingHandshake() { + LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH01", NLog::PRI_DEBUG, + "starting outgoing handshake"); + + // perform connection + Connect(true); + + // send initial request packet + SendInitialPacket(); + + TInitialPacket response; + ReceiveData(&response, sizeof(response), "ReceiveResponse"); + if (!response.Check()) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT, "Initial packet CRC error"); + } else if (response.Header.Version != INTERCONNECT_PROTOCOL_VERSION) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, Sprintf("Incompatible protocol %" PRIu64, response.Header.Version)); + } + + // extract next packet + NextPacketFromPeer = response.Header.NextPacket; + + if (!PeerVirtualId) { + // creating new session -- we have to generate request + NActorsInterconnect::THandshakeRequest request; + + request.SetProtocol(INTERCONNECT_PROTOCOL_VERSION); + request.SetProgramPID(GetPID()); + request.SetProgramStartTime(Common->StartTime); + request.SetSerial(SelfVirtualId.LocalId()); + request.SetReceiverNodeId(PeerNodeId); + request.SetSenderActorId(SelfVirtualId.ToString()); + request.SetSenderHostName(Common->TechnicalSelfHostName); + request.SetReceiverHostName(PeerHostName); + + if (Common->LocalScopeId != TScopeId()) { + FillInScopeId(*request.MutableClientScopeId()); + } + + if (Common->Cookie) { + request.SetCookie(Common->Cookie); + } + if (Common->ClusterUUID) { + request.SetUUID(Common->ClusterUUID); + } + SetupClusterUUID(request); + SetupVersionTag(request); + + if (const ui32 size = 
Common->HandshakeBallastSize) { + TString ballast(size, 0); + char* data = ballast.Detach(); + for (ui32 i = 0; i < size; ++i) { + data[i] = i; + } + request.SetBallast(ballast); + } + + switch (Common->Settings.EncryptionMode) { + case EEncryptionMode::DISABLED: + break; + + case EEncryptionMode::OPTIONAL: + request.SetRequireEncryption(false); + break; + + case EEncryptionMode::REQUIRED: + request.SetRequireEncryption(true); + break; + } + + request.SetRequestModernFrame(true); + request.SetRequestAuthOnly(Common->Settings.TlsAuthOnly); + + SendExBlock(request, "ExRequest"); + + NActorsInterconnect::THandshakeReply reply; + if (!reply.ParseFromString(ReceiveExBlock("ExReply"))) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "Incorrect THandshakeReply"); + } + ReportProto(reply, "ReceiveExBlock ExReply"); + + if (reply.HasErrorExplaination()) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "error from peer: " + reply.GetErrorExplaination()); + } else if (!reply.HasSuccess()) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "empty reply"); + } + + auto generateError = [this](TString msg) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, msg); + }; + + const auto& success = reply.GetSuccess(); + ValidateClusterUUID(success, generateError); + ValidateVersionTag(success, generateError); + + const auto& s = success.GetSenderActorId(); + PeerVirtualId.Parse(s.data(), s.size()); + + // recover flags + Params.Encryption = success.GetStartEncryption(); + Params.UseModernFrame = success.GetUseModernFrame(); + Params.AuthOnly = Params.Encryption && success.GetAuthOnly(); + if (success.HasServerScopeId()) { + ParsePeerScopeId(success.GetServerScopeId()); + } + + // recover peer process info from peer's reply + ProgramInfo = GetProgramInfo(success); + } else if (!response.Header.SelfVirtualId) { + // peer reported error -- empty ack was generated by proxy for this request + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_SESSION_MISMATCH, "Peer rejected session 
continuation handshake"); + } else if (response.Header.SelfVirtualId != PeerVirtualId || response.Header.PeerVirtualId != SelfVirtualId) { + // resuming existing session; check that virtual ids of peers match each other + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_SESSION_MISMATCH, "Session virtual ID mismatch"); + } else { + ProgramInfo.ConstructInPlace(); // successful handshake + } + } + + void PerformIncomingHandshake() { + LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH02", NLog::PRI_DEBUG, + "starting incoming handshake"); + + // set up incoming socket + SetupSocket(); + + // wait for initial request packet + TInitialPacket request; + ReceiveData(&request, sizeof(request), "ReceiveRequest"); + if (!request.Check()) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT, "Initial packet CRC error"); + } else if (request.Header.Version != INTERCONNECT_PROTOCOL_VERSION) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, Sprintf("Incompatible protocol %" PRIu64, request.Header.Version)); + } + + // extract peer node id from the peer + PeerNodeId = request.Header.SelfVirtualId.NodeId(); + if (!PeerNodeId) { + Y_VERIFY_DEBUG(false, "PeerNodeId is zero request# %s", request.ToString().data()); + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "SelfVirtualId.NodeId is empty in initial packet"); + } + UpdatePrefix(); + + // extract next packet + NextPacketFromPeer = request.Header.NextPacket; + + if (request.Header.PeerVirtualId) { + // issue request to the proxy and wait for the response + auto reply = AskProxy<TEvHandshakeAck, TEvHandshakeNak>(MakeHolder<TEvHandshakeAsk>( + request.Header.SelfVirtualId, request.Header.PeerVirtualId, request.Header.NextPacket), + "TEvHandshakeAsk"); + if (auto *ack = reply->CastAsLocal<TEvHandshakeAck>()) { + // extract self/peer virtual ids + SelfVirtualId = ack->Self; + PeerVirtualId = request.Header.SelfVirtualId; + NextPacketToPeer = ack->NextPacket; + Params = ack->Params; + + // only succeed in case when proxy returned valid 
SelfVirtualId; otherwise it wants us to terminate + // the handshake process and it does not expect the handshake reply + ProgramInfo.ConstructInPlace(); + } else { + LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH08", NLog::PRI_NOTICE, + "Continuation request rejected by proxy"); + + // report continuation reject to peer + SelfVirtualId = TActorId(); + PeerVirtualId = TActorId(); + NextPacketToPeer = 0; + } + + // issue response to the peer + SendInitialPacket(); + } else { + // peer wants a new session, clear fields and send initial packet + SelfVirtualId = TActorId(); + PeerVirtualId = TActorId(); + NextPacketToPeer = 0; + SendInitialPacket(); + + // wait for extended request + auto ev = MakeHolder<TEvHandshakeRequest>(); + auto& request = ev->Record; + if (!request.ParseFromString(ReceiveExBlock("ExRequest"))) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "Incorrect THandshakeRequest"); + } + ReportProto(request, "ReceiveExBlock ExRequest"); + + auto generateError = [this](TString msg) { + // issue reply to the peer to prevent repeating connection retries + NActorsInterconnect::THandshakeReply reply; + reply.SetErrorExplaination(msg); + SendExBlock(reply, "ExReply"); + + // terminate the handshake + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, msg); + }; + + // check request cookie + TString error; + if (request.HasDoCheckCookie()) { + NActorsInterconnect::THandshakeReply reply; + reply.SetCookieCheckResult(request.GetCookie() == Common->Cookie); + SendExBlock(reply, "ExReplyDoCheckCookie"); + throw TExHandshakeFailed(); + } else if (request.HasCookie() && !CheckPeerCookie(request.GetCookie(), &error)) { + generateError(TStringBuilder() << "Peer connectivity-checking failed, error# " << error); + } + + // remember the reported peer host name + PeerHostName = request.GetSenderHostName(); + + // parse peer virtual id + const auto& str = request.GetSenderActorId(); + PeerVirtualId.Parse(str.data(), str.size()); + + // validate request + 
ValidateClusterUUID(request, generateError, request.GetUUID()); + if (request.GetReceiverNodeId() != SelfActorId.NodeId()) { + generateError(Sprintf("Incorrect ReceiverNodeId# %" PRIu32 " from the peer, expected# %" PRIu32, + request.GetReceiverNodeId(), SelfActorId.NodeId())); + } else if (request.GetReceiverHostName() != Common->TechnicalSelfHostName) { + generateError(Sprintf("ReceiverHostName# %s mismatch, expected# %s", request.GetReceiverHostName().data(), + Common->TechnicalSelfHostName.data())); + } + ValidateVersionTag(request, generateError); + + // check peer node + auto peerNodeInfo = GetPeerNodeInfo(); + if (!peerNodeInfo) { + generateError("Peer node not registered in nameservice"); + } else if (peerNodeInfo->Host != request.GetSenderHostName()) { + generateError("SenderHostName mismatch"); + } + + // check request against encryption + switch (Common->Settings.EncryptionMode) { + case EEncryptionMode::DISABLED: + if (request.GetRequireEncryption()) { + generateError("Peer requested encryption, but it is disabled locally"); + } + break; + + case EEncryptionMode::OPTIONAL: + Params.Encryption = request.HasRequireEncryption(); + break; + + case EEncryptionMode::REQUIRED: + if (!request.HasRequireEncryption()) { + generateError("Peer did not request encryption, but it is required locally"); + } + Params.Encryption = true; + break; + } + + Params.UseModernFrame = request.GetRequestModernFrame(); + Params.AuthOnly = Params.Encryption && request.GetRequestAuthOnly() && Common->Settings.TlsAuthOnly; + + if (request.HasClientScopeId()) { + ParsePeerScopeId(request.GetClientScopeId()); + } + + // remember program info (assuming successful handshake) + ProgramInfo = GetProgramInfo(request); + + // send to proxy + auto reply = AskProxy<TEvHandshakeReplyOK, TEvHandshakeReplyError>(std::move(ev), "TEvHandshakeRequest"); + + // parse it + if (auto ev = reply->CastAsLocal<TEvHandshakeReplyOK>()) { + // issue successful reply to the peer + auto& record = ev->Record; + 
Y_VERIFY(record.HasSuccess()); + auto& success = *record.MutableSuccess(); + SetupClusterUUID(success); + SetupVersionTag(success); + success.SetStartEncryption(Params.Encryption); + if (Common->LocalScopeId != TScopeId()) { + FillInScopeId(*success.MutableServerScopeId()); + } + success.SetUseModernFrame(Params.UseModernFrame); + success.SetAuthOnly(Params.AuthOnly); + SendExBlock(record, "ExReply"); + + // extract sender actor id (self virtual id) + const auto& str = success.GetSenderActorId(); + SelfVirtualId.Parse(str.data(), str.size()); + } else if (auto ev = reply->CastAsLocal<TEvHandshakeReplyError>()) { + // in case of error just send reply to the peer and terminate handshake + SendExBlock(ev->Record, "ExReply"); + ProgramInfo.Clear(); // do not issue reply to the proxy + } else { + Y_FAIL("unexpected event Type# 0x%08" PRIx32, reply->GetTypeRewrite()); + } + } + } + + template <typename T> + void SendExBlock(const T& proto, const char* what) { + TString data; + Y_PROTOBUF_SUPPRESS_NODISCARD proto.SerializeToString(&data); + Y_VERIFY(data.size() <= TExHeader::MaxSize); + + ReportProto(proto, Sprintf("SendExBlock %s", what).data()); + + TExHeader header; + header.Size = data.size(); + header.Sign(data.data(), data.size()); + SendData(&header, sizeof(header), Sprintf("Send%sHeader", what)); + SendData(data.data(), data.size(), Sprintf("Send%sData", what)); + } + + TString ReceiveExBlock(const char* what) { + TExHeader header; + ReceiveData(&header, sizeof(header), Sprintf("Receive%sHeader", what)); + if (header.Size > TExHeader::MaxSize) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "Incorrect extended header size"); + } + + TString data; + data.resize(header.Size); + ReceiveData(data.Detach(), data.size(), Sprintf("Receive%sData", what)); + + if (!header.Check(data.data(), data.size())) { + Fail(TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT, "Extended header CRC error"); + } + + return data; + } + + private: + void SendToProxy(THolder<IEventBase> ev) { 
Y_VERIFY(PeerNodeId); + Send(GetActorSystem()->InterconnectProxy(PeerNodeId), ev.Release()); + } + + template <typename TEvent> + THolder<typename TEvent::THandle> WaitForSpecificEvent(TString state, TInstant deadline = TInstant::Max()) { + State = std::move(state); + return TActorCoroImpl::WaitForSpecificEvent<TEvent>(deadline); + } + + template <typename T1, typename T2, typename... TEvents> + THolder<IEventHandle> WaitForSpecificEvent(TString state, TInstant deadline = TInstant::Max()) { + State = std::move(state); + return TActorCoroImpl::WaitForSpecificEvent<T1, T2, TEvents...>(deadline); + } + + template <typename TEvent> + THolder<typename TEvent::THandle> AskProxy(THolder<IEventBase> ev, TString state) { + SendToProxy(std::move(ev)); + return WaitForSpecificEvent<TEvent>(std::move(state)); + } + + template <typename T1, typename T2, typename... TOther> + THolder<IEventHandle> AskProxy(THolder<IEventBase> ev, TString state) { + SendToProxy(std::move(ev)); + return WaitForSpecificEvent<T1, T2, TOther...>(std::move(state)); + } + + void Fail(TEvHandshakeFail::EnumHandshakeFail reason, TString explanation, bool network = false) { + TString msg = Sprintf("%s Peer# %s(%s) %s%s", HandshakeKind.data(), PeerHostName ? PeerHostName.data() : "<unknown>", + PeerAddr.size() ? PeerAddr.data() : "<unknown>", ResolveTimedOut ? "[resolve timeout] " : "", + explanation.data()); + + if (network) { + TInstant now = Now(); + TInstant prevLog = LastLogNotice[PeerNodeId]; + NActors::NLog::EPriority logPriority = NActors::NLog::PRI_DEBUG; + if (now - prevLog > MuteDuration) { + logPriority = NActors::NLog::PRI_NOTICE; + LastLogNotice[PeerNodeId] = now; + } + LOG_LOG_NET_X(logPriority, PeerNodeId, "network-related error occured on handshake: %s", msg.data()); + } else { + // pick log severity from the failure type: permanent failures are logged as NOTICE, all others as INFO + auto severity = reason == TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT + ? 
NActors::NLog::PRI_NOTICE
+                    : NActors::NLog::PRI_INFO;
+
+                LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH03", severity, "handshake failed, explanation# %s", msg.data());
+            }
+
+            // the proxy can only be addressed when the peer node id is known
+            if (PeerNodeId) {
+                SendToProxy(MakeHolder<TEvHandshakeFail>(reason, std::move(msg)));
+            }
+
+            throw TExHandshakeFailed() << explanation;
+        }
+
+    private:
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // COMMUNICATION BLOCK
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+        // Resolves the address of PeerNodeId through the nameservice, creates
+        // a socket of the matching address family, configures it and connects.
+        // When `updatePeerAddr` is set, PeerAddr is overwritten with the
+        // resolved address. Every detected failure goes through Fail(), which
+        // throws.
+        void Connect(bool updatePeerAddr) {
+            // issue request to a nameservice to resolve peer node address
+            Send(Common->NameserviceId, new TEvInterconnect::TEvResolveNode(PeerNodeId, Deadline));
+
+            // wait for the result
+            auto ev = WaitForSpecificEvent<TEvResolveError, TEvLocalNodeInfo, TEvInterconnect::TEvNodeAddress>("ResolveNode",
+                Now() + ResolveTimeout);
+
+            // extract address from the result
+            NInterconnect::TAddress address;
+            if (!ev) {
+                // no reply within ResolveTimeout: fall back to the address
+                // reported by GetPeerNodeInfo(), if there is one
+                ResolveTimedOut = true;
+                if (auto peerNodeInfo = GetPeerNodeInfo(); peerNodeInfo && peerNodeInfo->Address) {
+                    address = {peerNodeInfo->Address, peerNodeInfo->Port};
+                } else {
+                    Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "DNS resolve timed out and no static address defined", true);
+                }
+            } else if (auto *p = ev->CastAsLocal<TEvLocalNodeInfo>()) {
+                if (!p->Address) {
+                    Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "DNS resolve error: no address returned", true);
+                }
+                address = {*p->Address};
+            } else if (auto *p = ev->CastAsLocal<TEvInterconnect::TEvNodeAddress>()) {
+                const auto& r = p->Record;
+                if (!r.HasAddress() || !r.HasPort()) {
+                    Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "DNS resolve error: no address returned", true);
+                }
+                address = {r.GetAddress(), static_cast<ui16>(r.GetPort())};
+            } else {
+                // the only remaining event type in the wait set
+                Y_VERIFY(ev->GetTypeRewrite() == ui32(ENetwork::ResolveError));
+                Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "DNS resolve error: " + ev->Get<TEvResolveError>()->Explain, true);
+            }
+
+            // create the socket with matching address family
+            Socket = NInterconnect::TStreamSocket::Make(address.GetFamily());
+            if (*Socket == -1) {
+                Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "System error: failed to create socket");
+            }
+
+            // extract peer address
+            if (updatePeerAddr) {
+                PeerAddr = address.ToString();
+            }
+
+            // set up socket parameters
+            SetupSocket();
+
+            // start connecting
+            switch (int err = -Socket->Connect(address)) {
+                case 0: // successful connection
+                    break;
+
+                case EINPROGRESS: // connection in progress
+                    // wait until the poller reports the socket as writable, then read the final status
+                    WaitPoller(false, true, "WaitConnect");
+                    err = Socket->GetConnectStatus();
+                    if (err) {
+                        Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, Sprintf("Connection failed: %s", strerror(err)), true);
+                    }
+                    break;
+
+                default:
+                    // NOTE(review): any other Connect() error code is ignored here and execution
+                    // continues to the "connected to peer" log below without calling Fail() --
+                    // confirm this is intended (presumably the error surfaces on the first send).
+                    break;
+            }
+
+            // a remembered network-error notice for this peer is dropped on success, and the
+            // success is then logged at NOTICE instead of DEBUG
+            auto it = LastLogNotice.find(PeerNodeId);
+            NActors::NLog::EPriority logPriority = NActors::NLog::PRI_DEBUG;
+            if (it != LastLogNotice.end()) {
+                LastLogNotice.erase(it);
+                logPriority = NActors::NLog::PRI_NOTICE;
+            }
+            LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH05", logPriority, "connected to peer");
+        }
+
+        // Switches Socket to non-blocking mode with no-delay enabled, applies the configured
+        // send buffer size and registers the socket in the poller.
+        void SetupSocket() {
+            // switch to nonblocking mode
+            try {
+                SetNonBlock(*Socket);
+                SetNoDelay(*Socket, true);
+            } catch (...)
{
+                Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "System error: can't up nonblocking mode for socket");
+            }
+
+            // setup send buffer size
+            Socket->SetSendBufferSize(Common->Settings.GetSendBufferSize());
+
+            // register in poller
+            RegisterInPoller();
+        }
+
+        // Registers Socket with the poller actor (this actor is passed as both
+        // notification targets) and stores the returned poller token.
+        void RegisterInPoller() {
+            const bool success = Send(MakePollerActorId(), new TEvPollerRegister(Socket, SelfActorId, SelfActorId));
+            Y_VERIFY(success);
+            auto result = WaitForSpecificEvent<TEvPollerRegisterResult>("RegisterPoller");
+            PollerToken = std::move(result->Get()->PollerToken);
+            Y_VERIFY(PollerToken);
+            Y_VERIFY(PollerToken->RefCount() == 1); // ensure exclusive ownership
+        }
+
+        // Sends the fixed-size initial packet carrying both virtual ids, the
+        // next packet number and the protocol version.
+        void SendInitialPacket() {
+            TInitialPacket packet(SelfVirtualId, PeerVirtualId, NextPacketToPeer, INTERCONNECT_PROTOCOL_VERSION);
+            SendData(&packet, sizeof(packet), "SendInitialPacket");
+        }
+
+        // Requests read and/or write readiness from the poller token and
+        // suspends until TEvPollerReady arrives; `state` labels the wait.
+        void WaitPoller(bool read, bool write, TString state) {
+            PollerToken->Request(read, write);
+            WaitForSpecificEvent<TEvPollerReady>(std::move(state));
+        }
+
+        // Transfers exactly `len` bytes through Socket using `sendRecv`
+        // (a pointer to TStreamSocket::Send or ::Recv), looping until done.
+        //   - a positive result advances the buffer;
+        //   - EAGAIN / EWOULDBLOCK waits on the poller for `read` / `write`;
+        //   - a zero result is reported as an unexpectedly closed connection;
+        //   - EINTR retries immediately;
+        //   - any other error is reported with the socket-provided text or strerror().
+        // Errors go through Fail() as transient network failures, so this
+        // function throws instead of returning on failure.
+        template <typename TDataPtr, typename TSendRecvFunc>
+        void Process(TDataPtr buffer, size_t len, TSendRecvFunc&& sendRecv, bool read, bool write, TString state) {
+            Y_VERIFY(Socket);
+            NInterconnect::TStreamSocket* sock = Socket.Get();
+            // pin down the member-function signature expected from sendRecv
+            ssize_t (NInterconnect::TStreamSocket::*pfn)(TDataPtr, size_t, TString*) const = sendRecv;
+            size_t processed = 0;
+
+            auto error = [&](TString msg) {
+                Fail(TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT, Sprintf("Socket error# %s state# %s processed# %zu remain# %zu",
+                    msg.data(), state.data(), processed, len), true);
+            };
+
+            while (len) {
+                TString err;
+                ssize_t nbytes = (sock->*pfn)(buffer, len, &err);
+                if (nbytes > 0) {
+                    buffer = (char*)buffer + nbytes;
+                    len -= nbytes;
+                    processed += nbytes;
+                } else if (-nbytes == EAGAIN || -nbytes == EWOULDBLOCK) {
+                    WaitPoller(read, write, state);
+                } else if (!nbytes) {
+                    error("connection unexpectedly closed");
+                } else if (-nbytes != EINTR) {
+                    error(err ? err : TString(strerror(-nbytes)));
+                }
+            }
+        }
+
+        // Sends exactly `len` bytes; waits for write readiness when the socket would block.
+        void SendData(const void* buffer, size_t len, TString state) {
+            Process(buffer, len, &NInterconnect::TStreamSocket::Send, false, true, std::move(state));
+        }
+
+        // Receives exactly `len` bytes; waits for read readiness when the socket would block.
+        void ReceiveData(void* buffer, size_t len, TString state) {
+            Process(buffer, len, &NInterconnect::TStreamSocket::Recv, true, false, std::move(state));
+        }
+
+        // Asks the nameservice for the node record of PeerNodeId and returns
+        // the Node holder taken from the reply.
+        THolder<TEvInterconnect::TNodeInfo> GetPeerNodeInfo() {
+            Y_VERIFY(PeerNodeId);
+            Send(Common->NameserviceId, new TEvInterconnect::TEvGetNode(PeerNodeId, Deadline));
+            auto response = WaitForSpecificEvent<TEvInterconnect::TEvNodeInfo>("GetPeerNodeInfo");
+            return std::move(response->Get()->Node);
+        }
+
+        // Copies program PID, start time and serial from a handshake protobuf
+        // into a freshly allocated TProgramInfo.
+        template <typename T>
+        static THolder<TProgramInfo> GetProgramInfo(const T& proto) {
+            auto programInfo = MakeHolder<TProgramInfo>();
+            programInfo->PID = proto.GetProgramPID();
+            programInfo->StartTime = proto.GetProgramStartTime();
+            programInfo->Serial = proto.GetSerial();
+            return programInfo;
+        }
+    };
+
+    // Creates the coroutine actor that performs an outgoing handshake.
+    IActor* CreateOutgoingHandshakeActor(TInterconnectProxyCommon::TPtr common, const TActorId& self,
+                                         const TActorId& peer, ui32 nodeId, ui64 nextPacket, TString peerHostName,
+                                         TSessionParams params) {
+        return new TActorCoro(MakeHolder<THandshakeActor>(std::move(common), self, peer, nodeId, nextPacket,
+            std::move(peerHostName), std::move(params)));
+    }
+
+    // Creates the coroutine actor that performs an incoming handshake on an
+    // already accepted socket.
+    IActor* CreateIncomingHandshakeActor(TInterconnectProxyCommon::TPtr common, TSocketPtr socket) {
+        return new TActorCoro(MakeHolder<THandshakeActor>(std::move(common), std::move(socket)));
+    }
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_handshake.h b/library/cpp/actors/interconnect/interconnect_handshake.h
new file mode 100644
index 0000000000..b3c0db6c5d
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_handshake.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/core/event_pb.h>
+#include <library/cpp/actors/core/events.h>
+
+#include
"interconnect_common.h"
+#include "interconnect_impl.h"
+#include "poller_tcp.h"
+#include "events_local.h"
+
+namespace NActors {
+    static constexpr TDuration DEFAULT_HANDSHAKE_TIMEOUT = TDuration::Seconds(1);
+    static constexpr ui64 INTERCONNECT_PROTOCOL_VERSION = 2;
+
+    using TSocketPtr = TIntrusivePtr<NInterconnect::TStreamSocket>;
+
+    // Factory for the actor that performs an outgoing handshake.
+    IActor* CreateOutgoingHandshakeActor(TInterconnectProxyCommon::TPtr common, const TActorId& self,
+                                         const TActorId& peer, ui32 nodeId, ui64 nextPacket, TString peerHostName,
+                                         TSessionParams params);
+
+    // Factory for the actor that performs an incoming handshake on `socket`.
+    IActor* CreateIncomingHandshakeActor(TInterconnectProxyCommon::TPtr common, TSocketPtr socket);
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_impl.h b/library/cpp/actors/interconnect/interconnect_impl.h
new file mode 100644
index 0000000000..ee29e4d397
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_impl.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#include "interconnect.h"
+#include <library/cpp/actors/protos/interconnect.pb.h>
+#include <library/cpp/actors/core/event_pb.h>
+#include <library/cpp/actors/helpers/mon_histogram_helper.h>
+#include <library/cpp/monlib/dynamic_counters/counters.h>
+
+namespace NActors {
+    // resolve node info
+    struct TEvInterconnect::TEvResolveNode: public TEventPB<TEvInterconnect::TEvResolveNode, NActorsInterconnect::TEvResolveNode, TEvInterconnect::EvResolveNode> {
+        TEvResolveNode() {
+        }
+
+        // The deadline is written into the record only when it differs from
+        // TInstant::Max(), i.e. an absent field means "no deadline".
+        TEvResolveNode(ui32 nodeId, TInstant deadline = TInstant::Max()) {
+            Record.SetNodeId(nodeId);
+            if (deadline != TInstant::Max()) {
+                Record.SetDeadline(deadline.GetValue());
+            }
+        }
+    };
+
+    // node info
+    struct TEvInterconnect::TEvNodeAddress: public TEventPB<TEvInterconnect::TEvNodeAddress, NActorsInterconnect::TEvNodeInfo, TEvInterconnect::EvNodeAddress> {
+        TEvNodeAddress() {
+        }
+
+        TEvNodeAddress(ui32 nodeId) {
+            Record.SetNodeId(nodeId);
+        }
+    };
+
+    // register node
+    struct TEvInterconnect::TEvRegisterNode: public TEventBase<TEvInterconnect::TEvRegisterNode, TEvInterconnect::EvRegisterNode> {
+    };
+
+    // reply on register node
+    struct TEvInterconnect::TEvRegisterNodeResult: public TEventBase<TEvInterconnect::TEvRegisterNodeResult, TEvInterconnect::EvRegisterNodeResult> {
+    };
+
+    // disconnect
+    struct TEvInterconnect::TEvDisconnect: public TEventLocal<TEvInterconnect::TEvDisconnect, TEvInterconnect::EvDisconnect> {
+    };
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_mon.cpp b/library/cpp/actors/interconnect/interconnect_mon.cpp
new file mode 100644
index 0000000000..cf924ccbf9
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_mon.cpp
@@ -0,0 +1,276 @@
+#include "interconnect_mon.h"
+#include "interconnect_tcp_proxy.h"
+
+#include <library/cpp/json/json_value.h>
+#include <library/cpp/json/json_writer.h>
+#include <library/cpp/monlib/service/pages/templates.h>
+
+#include <openssl/ssl.h>
+#include <openssl/pem.h>
+
+namespace NInterconnect {
+
+    using namespace NActors;
+
+    class TInterconnectMonActor : public TActor<TInterconnectMonActor> {
+        // One-shot helper actor: lists all nodes, queries the interconnect
+        // proxy of each one for statistics and sends the combined page
+        // (HTML or JSON) to Sender.
+        class TQueryProcessor : public TActorBootstrapped<TQueryProcessor> {
+            const TActorId Sender;
+            const bool Json;
+            TMap<ui32, TInterconnectProxyTCP::TProxyStats> Stats;
+            ui32 PendingReplies = 0; // proxies that have not answered yet
+
+        public:
+            static constexpr IActor::EActorActivity ActorActivityType() {
+                return INTERCONNECT_MONACTOR;
+            }
+
+            TQueryProcessor(const TActorId& sender, bool json)
+                : Sender(sender)
+                , Json(json)
+            {}
+
+            // Arms a 5 second wakeup and asks the nameservice for the node list.
+            void Bootstrap(const TActorContext& ctx) {
+                Become(&TThis::StateFunc, ctx, TDuration::Seconds(5), new TEvents::TEvWakeup);
+                Send(GetNameserviceActorId(), new TEvInterconnect::TEvListNodes);
+            }
+
+            // Sends a stats query (with delivery tracking) to the proxy of
+            // every listed node and counts the expected replies.
+            void Handle(TEvInterconnect::TEvNodesInfo::TPtr ev, const TActorContext& ctx) {
+                TActorSystem* const as = ctx.ExecutorThread.ActorSystem;
+                for (const auto& node : ev->Get()->Nodes) {
+                    Send(as->InterconnectProxy(node.NodeId), new TInterconnectProxyTCP::TEvQueryStats, IEventHandle::FlagTrackDelivery);
+                    ++PendingReplies;
+                }
+
GenerateResultWhenReady(ctx);
+            }
+
+            STRICT_STFUNC(StateFunc,
+                HFunc(TEvInterconnect::TEvNodesInfo, Handle)
+                HFunc(TInterconnectProxyTCP::TEvStats, Handle)
+                CFunc(TEvents::TSystem::Undelivered, HandleUndelivered)
+                CFunc(TEvents::TSystem::Wakeup, HandleWakeup)
+            )
+
+            // Stores the statistics of one proxy and checks whether all replies are in.
+            void Handle(TInterconnectProxyTCP::TEvStats::TPtr& ev, const TActorContext& ctx) {
+                auto *msg = ev->Get();
+                Stats.emplace(msg->PeerNodeId, std::move(msg->ProxyStats));
+                --PendingReplies;
+                GenerateResultWhenReady(ctx);
+            }
+
+            // An undelivered query counts as answered (no stats for that node).
+            void HandleUndelivered(const TActorContext& ctx) {
+                --PendingReplies;
+                GenerateResultWhenReady(ctx);
+            }
+
+            // Wakeup: stop waiting and report whatever has been collected.
+            void HandleWakeup(const TActorContext& ctx) {
+                PendingReplies = 0;
+                GenerateResultWhenReady(ctx);
+            }
+
+            // Once nothing is pending, sends the rendered page to Sender and terminates.
+            void GenerateResultWhenReady(const TActorContext& ctx) {
+                if (!PendingReplies) {
+                    if (Json) {
+                        ctx.Send(Sender, new NMon::TEvHttpInfoRes(GenerateJson(), 0, NMon::IEvHttpInfoRes::EContentType::Custom));
+                    } else {
+                        ctx.Send(Sender, new NMon::TEvHttpInfoRes(GenerateHtml()));
+                    }
+                    Die(ctx);
+                }
+            }
+
+            // Renders Stats as an HTML table, one row per peer node.
+            TString GenerateHtml() {
+                TStringStream str;
+                HTML(str) {
+                    TABLE_CLASS("table-sortable table") {
+                        TABLEHEAD() {
+                            TABLER() {
+                                TABLEH() { str << "Peer node id"; }
+                                TABLEH() { str << "State"; }
+                                TABLEH() { str << "Ping"; }
+                                TABLEH() { str << "Clock skew"; }
+                                TABLEH() { str << "Scope id"; }
+                                TABLEH() { str << "Encryption"; }
+                                TABLEH() { str << "LastSessionDieTime"; }
+                                TABLEH() { str << "TotalOutputQueueSize"; }
+                                TABLEH() { str << "Connected"; }
+                                TABLEH() { str << "Host"; }
+                                TABLEH() { str << "Port"; }
+                                TABLEH() { str << "LastErrorTimestamp"; }
+                                TABLEH() { str << "LastErrorKind"; }
+                                TABLEH() { str << "LastErrorExplanation"; }
+                            }
+                        }
+                        TABLEBODY() {
+                            for (const auto& kv : Stats) {
+                                TABLER() {
+                                    TABLED() { str << "<a href='" << kv.second.Path << "'>" << kv.first << "</a>"; }
+                                    TABLED() { str << kv.second.State; }
+                                    TABLED() {
+                                        // a zero ping is shown as an empty cell
+                                        if (kv.second.Ping != TDuration::Zero()) {
+                                            str << kv.second.Ping;
+                                        }
+                                    }
+                                    TABLED() {
+                                        // ClockSkew is a signed microsecond count; print sign and magnitude separately
+                                        if (kv.second.ClockSkew < 0) {
+                                            str << "-" << TDuration::MicroSeconds(-kv.second.ClockSkew);
+                                        } else {
+                                            str << "+" << TDuration::MicroSeconds(kv.second.ClockSkew);
+                                        }
+                                    }
+                                    TABLED() { str << ScopeIdToString(kv.second.PeerScopeId); }
+                                    TABLED() {
+                                        // highlight unencrypted sessions in red
+                                        const char *color = kv.second.Encryption != "none" ? "green" : "red";
+                                        str << "<font color='" << color << "'>" << kv.second.Encryption << "</font>";
+                                    }
+                                    TABLED() {
+                                        if (kv.second.LastSessionDieTime != TInstant::Zero()) {
+                                            str << kv.second.LastSessionDieTime;
+                                        }
+                                    }
+                                    TABLED() { str << kv.second.TotalOutputQueueSize; }
+                                    TABLED() { str << (kv.second.Connected ? "yes" : "<strong>no</strong>"); }
+                                    TABLED() { str << kv.second.Host; }
+                                    TABLED() { str << kv.second.Port; }
+                                    TABLED() {
+                                        str << "<strong>";
+                                        if (kv.second.LastErrorTimestamp != TInstant::Zero()) {
+                                            str << kv.second.LastErrorTimestamp;
+                                        }
+                                        str << "</strong>";
+                                    }
+                                    TABLED() { str << "<strong>" << kv.second.LastErrorKind << "</strong>"; }
+                                    TABLED() { str << "<strong>" << kv.second.LastErrorExplanation << "</strong>"; }
+                                }
+                            }
+                        }
+                    }
+                }
+                return str.Str();
+            }
+
+            // Renders Stats as a JSON map keyed by node id; the output starts
+            // with the NMonitoring::HTTPOKJSON prefix.
+            TString GenerateJson() {
+                NJson::TJsonValue json;
+                for (const auto& [nodeId, info] : Stats) {
+                    NJson::TJsonValue item;
+                    item["NodeId"] = nodeId;
+
+                    // per-field converters used by the JSON() macro below
+                    auto id = [](const auto& x) { return x; };
+                    auto toString = [](const auto& x) { return x.ToString(); };
+
+#define JSON(NAME, FUN) item[#NAME] = FUN(info.NAME);
+                    JSON(Path, id)
+                    JSON(State, id)
+                    JSON(PeerScopeId, ScopeIdToString)
+                    JSON(LastSessionDieTime, toString)
+                    JSON(TotalOutputQueueSize, id)
+                    JSON(Connected, id)
+                    JSON(Host, id)
+                    JSON(Port, id)
+                    JSON(LastErrorTimestamp, toString)
+                    JSON(LastErrorKind, id)
+                    JSON(LastErrorExplanation, id)
+                    JSON(Ping, toString)
+                    JSON(ClockSkew, id)
+                    JSON(Encryption, id)
+#undef JSON
+
+                    json[ToString(nodeId)] = item;
+                }
+                TStringStream str(NMonitoring::HTTPOKJSON);
+                NJson::WriteJson(&str, &json);
+                return str.Str();
+            }
+        };
+
+    private:
+        TIntrusivePtr<TInterconnectProxyCommon> Common;
+
+    public:
+        static constexpr IActor::EActorActivity ActorActivityType() {
+            return INTERCONNECT_MONACTOR;
+        }
+
+        TInterconnectMonActor(TIntrusivePtr<TInterconnectProxyCommon> common)
+            : TActor(&TThis::StateFunc)
+            , Common(std::move(common))
+        {}
+
+        STRICT_STFUNC(StateFunc,
+            HFunc(NMon::TEvHttpInfo, Handle)
+        )
+
+        // Serves one monitoring request: a non-zero integer "certinfo"
+        // parameter returns certificate details as JSON; otherwise a
+        // TQueryProcessor is spawned to build the overview (JSON when
+        // fmt=json, HTML otherwise).
+        void Handle(NMon::TEvHttpInfo::TPtr& ev, const TActorContext& ctx) {
+            const auto& params = ev->Get()->Request.GetParams();
+            int certinfo = 0;
+            if (TryFromString(params.Get("certinfo"), certinfo) && certinfo) {
+                ctx.Send(ev->Sender, new NMon::TEvHttpInfoRes(GetCertInfoJson(), ev->Get()->SubRequestId,
+                    NMon::TEvHttpInfoRes::Custom));
+            } else {
+                const bool json = params.Has("fmt") && params.Get("fmt") == "json";
+                ctx.Register(new TQueryProcessor(ev->Sender, json));
+            }
+        }
+
+        // Parses the PEM certificate from Common->Settings.Certificate with
+        // OpenSSL and returns Subject, CommonName, NotBefore and NotAfter as
+        // JSON. A parsing failure is reported in an "Error" field; an absent
+        // Common or an empty certificate yields an empty JSON map.
+        TString GetCertInfoJson() const {
+            NJson::TJsonValue json(NJson::JSON_MAP);
+            if (const TString cert = Common ? Common->Settings.Certificate : TString()) {
+                struct TEx : yexception {};
+                try {
+                    const auto& cert = Common->Settings.Certificate;
+                    std::unique_ptr<BIO, void(*)(BIO*)> bio(BIO_new_mem_buf(cert.data(), cert.size()), &BIO_vfree);
+                    if (!bio) {
+                        throw TEx() << "BIO_new_mem_buf failed";
+                    }
+                    std::unique_ptr<X509, void(*)(X509*)> x509(PEM_read_bio_X509(bio.get(), nullptr, nullptr, nullptr),
+                        &X509_free);
+                    if (!x509) {
+                        throw TEx() << "PEM_read_bio_X509 failed";
+                    }
+                    X509_NAME *name = X509_get_subject_name(x509.get());
+                    if (!name) {
+                        throw TEx() << "X509_get_subject_name failed";
+                    }
+                    char buffer[4096];
+                    if (char *p = X509_NAME_oneline(name, buffer, sizeof(buffer))) {
+                        json["Subject"] = p;
+                    }
+                    if (int loc = X509_NAME_get_index_by_NID(name, NID_commonName, -1); loc >= 0) {
+                        if (X509_NAME_ENTRY *entry = X509_NAME_get_entry(name, loc)) {
+                            if (ASN1_STRING *data = X509_NAME_ENTRY_get_data(entry)) {
+                                unsigned char *cn;
+                                if (const int len = ASN1_STRING_to_UTF8(&cn, data); len >= 0) {
+                                    // cn is allocated by OpenSSL and released right after copying
+                                    json["CommonName"] = TString(reinterpret_cast<char*>(cn), len);
+                                    OPENSSL_free(cn);
+                                }
+                            }
+                        }
+                    }
+                    // formats an ASN1_TIME; `name` is the getter name used in the error text
+                    auto time = [](const ASN1_TIME *t, const char *name) -> TString {
+                        if (t) {
+                            struct tm tm;
+                            if (ASN1_TIME_to_tm(t, &tm)) {
+                                return Strftime("%Y-%m-%dT%H:%M:%S%z", &tm);
+                            } else {
+                                throw TEx() << "ASN1_TIME_to_tm failed";
+                            }
+                        } else {
+                            throw TEx() << name << " failed";
+                        }
+                    };
+                    json["NotBefore"] = time(X509_get0_notBefore(x509.get()), "X509_get0_notBefore");
+                    json["NotAfter"] = time(X509_get0_notAfter(x509.get()), "X509_get0_notAfter");
+                } catch (const TEx& ex) {
+                    json["Error"] = ex.what();
+                }
+            }
+            TStringStream str(NMonitoring::HTTPOKJSON);
+            NJson::WriteJson(&str, &json);
+            return str.Str();
+        }
+    };
+
+    IActor *CreateInterconnectMonActor(TIntrusivePtr<TInterconnectProxyCommon> common) {
+        return new TInterconnectMonActor(std::move(common));
+    }
+
+} // NInterconnect
diff --git a/library/cpp/actors/interconnect/interconnect_mon.h b/library/cpp/actors/interconnect/interconnect_mon.h
new file mode 100644
index 0000000000..3fb26053fb
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_mon.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <library/cpp/actors/core/actor.h>
+#include "interconnect_common.h"
+
+namespace NInterconnect {
+
+    NActors::IActor *CreateInterconnectMonActor(TIntrusivePtr<NActors::TInterconnectProxyCommon> common = nullptr);
+
+    // Service actor id of the monitoring actor on `nodeId`: the 12-byte name
+    // "ICOverview" padded with two zero bytes.
+    static inline NActors::TActorId MakeInterconnectMonActorId(ui32 nodeId) {
+        char s[12] = {'I', 'C', 'O', 'v', 'e', 'r', 'v', 'i', 'e', 'w', 0, 0};
+        return NActors::TActorId(nodeId, TStringBuf(s, 12));
+    }
+
+} // NInterconnect
diff --git a/library/cpp/actors/interconnect/interconnect_nameserver_base.h b/library/cpp/actors/interconnect/interconnect_nameserver_base.h
new file mode 100644
index 0000000000..df614f6c2b
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_nameserver_base.h
@@ -0,0 +1,83 @@
+#include "interconnect.h"
+#include "interconnect_impl.h"
+#include "interconnect_address.h"
+#include "events_local.h"
+
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/memory_log/memlog.h>
+
+namespace NActors {
+
+    // CRTP base with the request handlers shared by the nameserver actors.
+    // It only keeps a reference to the node table; the table itself is owned
+    // by the derived class or its setup object.
+    template<typename TDerived>
+    class TInterconnectNameserverBase : public TActor<TDerived> {
+    protected:
+        const TMap<ui32, TTableNameserverSetup::TNodeInfo>& NodeTable;
+
+        TInterconnectNameserverBase(void (TDerived::*func)(TAutoPtr<IEventHandle>& ev, const TActorContext& ctx)
+                , const TMap<ui32, TTableNameserverSetup::TNodeInfo>& nodeTable)
+            : TActor<TDerived>(func)
+            , NodeTable(nodeTable)
+        {
+        }
+    public:
+
+        // Default reaction to a resolve request for an unknown node: reply
+        // with a TEvLocalNodeInfo that carries only the node id. Derived
+        // classes may provide their own HandleMissedNodeId.
+        void HandleMissedNodeId(TEvInterconnect::TEvResolveNode::TPtr& ev,
+                                const TActorContext& ctx,
+                                const TInstant&) {
+            auto reply = new TEvLocalNodeInfo;
+            reply->NodeId = ev->Get()->Record.GetNodeId();
+            ctx.Send(ev->Sender, reply);
+        }
+
+        // Resolve by node id: known nodes get a resolve actor registered in
+        // this mailbox; unknown ones are passed to TDerived::HandleMissedNodeId.
+        // A request without a deadline field is treated as TInstant::Max().
+        void Handle(TEvInterconnect::TEvResolveNode::TPtr& ev,
+                    const TActorContext& ctx) {
+            const TEvInterconnect::TEvResolveNode* request = ev->Get();
+            auto& record = request->Record;
+            const ui32 nodeId = record.GetNodeId();
+            const TInstant deadline = record.HasDeadline() ? TInstant::FromValue(record.GetDeadline()) : TInstant::Max();
+            auto it = NodeTable.find(nodeId);
+
+            if (it == NodeTable.end()) {
+                static_cast<TDerived*>(this)->HandleMissedNodeId(ev, ctx, deadline);
+            } else {
+                IActor::RegisterWithSameMailbox(
+                    CreateResolveActor(nodeId, it->second, ev->Sender, this->SelfId(), deadline));
+            }
+        }
+
+        // Resolve by explicit address/port, without a deadline.
+        void Handle(TEvResolveAddress::TPtr& ev,
+                    const TActorContext&) {
+            const TEvResolveAddress* request = ev->Get();
+
+            IActor::RegisterWithSameMailbox(
+                CreateResolveActor(request->Address, request->Port, ev->Sender, this->SelfId(), TInstant::Max()));
+        }
+
+        // Replies with a TEvNodesInfo listing every entry of NodeTable.
+        void Handle(TEvInterconnect::TEvListNodes::TPtr& ev,
+                    const TActorContext& ctx) {
+            THolder<TEvInterconnect::TEvNodesInfo>
+                reply(new TEvInterconnect::TEvNodesInfo());
+            reply->Nodes.reserve(NodeTable.size());
+            for (const auto& pr : NodeTable) {
+                reply->Nodes.emplace_back(pr.first,
+                                          pr.second.Address, pr.second.Host, pr.second.ResolveHost,
+                                          pr.second.Port, pr.second.Location);
+            }
+            ctx.Send(ev->Sender, reply.Release());
+        }
+
+        // Replies with a TEvNodeInfo for one node; reply->Node is filled in
+        // only when the node is present in NodeTable.
+        void Handle(TEvInterconnect::TEvGetNode::TPtr& ev,
+                    const TActorContext& ctx) {
+            ui32 nodeId = ev->Get()->NodeId;
+            THolder<TEvInterconnect::TEvNodeInfo>
+                reply(new TEvInterconnect::TEvNodeInfo(nodeId));
+            auto it = NodeTable.find(nodeId);
+            if (it != NodeTable.end()) {
+                reply->Node = MakeHolder<TEvInterconnect::TNodeInfo>(it->first, it->second.Address,
+                                                                     it->second.Host, it->second.ResolveHost,
+                                                                     it->second.Port, it->second.Location);
+            }
+            ctx.Send(ev->Sender, reply.Release());
+        }
+    };
+}
diff --git a/library/cpp/actors/interconnect/interconnect_nameserver_dynamic.cpp b/library/cpp/actors/interconnect/interconnect_nameserver_dynamic.cpp
new file mode 100644
index 0000000000..5e48401b14
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_nameserver_dynamic.cpp
@@ -0,0 +1,178 @@
+#include "interconnect.h"
+#include "interconnect_impl.h"
+#include "interconnect_address.h"
+#include "interconnect_nameserver_base.h"
+#include
"events_local.h"
+#include "logging.h"
+
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/core/log.h>
+
+namespace NActors {
+
+    // Nameserver whose node table can be updated at run time through
+    // TEvNodesInfo. Resolve requests for nodes that are not known yet are
+    // parked in PendingRequests until the node shows up or the request
+    // deadline passes.
+    class TInterconnectDynamicNameserver
+        : public TInterconnectNameserverBase<TInterconnectDynamicNameserver>
+        , public TInterconnectLoggingBase
+    {
+        // A parked resolve request together with the moment it expires.
+        struct TPendingRequest {
+            TEvInterconnect::TEvResolveNode::TPtr Request;
+            TInstant Deadline;
+
+            TPendingRequest(TEvInterconnect::TEvResolveNode::TPtr request, const TInstant& deadline)
+                : Request(request), Deadline(deadline)
+            {
+            }
+        };
+
+        TMap<ui32, TTableNameserverSetup::TNodeInfo> NodeTable;
+        TVector<TPendingRequest> PendingRequests;
+        TDuration PendingPeriod;
+
+        // Dumps the whole node table to the debug log.
+        void PrintInfo() {
+            TString logMsg = TStringBuilder() << "Table size: " << NodeTable.size();
+            for (const auto& [nodeId, node] : NodeTable) {
+                TString str = TStringBuilder() << "\n > Node " << nodeId << " `" << node.Address << "`:" << node.Port << ", host: " << node.Host << ", resolveHost: " << node.ResolveHost;
+                logMsg += str;
+            }
+            LOG_DEBUG_IC("ICN01", "%s", logMsg.c_str());
+        }
+
+        // Returns true when `nodeId` is new or its address/port differ from
+        // the stored entry. For a changed entry the interconnect proxy of
+        // that node is additionally told to disconnect.
+        bool IsNodeUpdated(const ui32 nodeId, const TString& address, const ui32 port) {
+            bool printInfo = false;
+            auto it = NodeTable.find(nodeId);
+            if (it == NodeTable.end()) {
+                LOG_DEBUG_IC("ICN02", "New node %u `%s`: %u",
+                    nodeId, address.c_str(), port);
+                printInfo = true;
+            } else if (it->second.Address != address || it->second.Port != port) {
+                LOG_DEBUG_IC("ICN03", "Updated node %u `%s`: %u (from `%s`: %u)",
+                    nodeId, address.c_str(), port, it->second.Address.c_str(), it->second.Port);
+                printInfo = true;
+                Send(TActivationContext::InterconnectProxy(nodeId), new TEvInterconnect::TEvDisconnect);
+            }
+            return printInfo;
+        }
+
+        // Answers every pending request whose deadline has passed with an
+        // "unknown node" reply (a TEvLocalNodeInfo carrying only the node id)
+        // and then drops all answered entries from PendingRequests.
+        //
+        // `compactionCount` is the number of entries the caller has already
+        // answered and reset; those entries have a null Request and are
+        // skipped here before anything is dereferenced.
+        void DiscardTimedOutRequests(const TActorContext& ctx, ui32 compactionCount = 0) {
+
+            auto now = Now();
+
+            for (auto& pending : PendingRequests) {
+                if (!pending.Request) {
+                    continue; // already answered by the caller
+                }
+                // a request is timed out once its deadline is not in the future any more
+                if (pending.Deadline <= now) {
+                    const ui32 nodeId = pending.Request->Get()->Record.GetNodeId();
+                    LOG_ERROR_IC("ICN06", "Unknown nodeId: %u", nodeId);
+                    auto reply = new TEvLocalNodeInfo;
+                    reply->NodeId = nodeId;
+                    ctx.Send(pending.Request->Sender, reply);
+                    pending.Request.Reset();
+                    compactionCount++;
+                }
+            }
+
+            if (compactionCount) {
+                // rebuild the vector from the entries that are still waiting
+                TVector<TPendingRequest> requests;
+                if (compactionCount < PendingRequests.size()) { // sanity check
+                    requests.reserve(PendingRequests.size() - compactionCount);
+                }
+                for (auto& pending : PendingRequests) {
+                    if (pending.Request) {
+                        requests.emplace_back(pending.Request, pending.Deadline);
+                    }
+                }
+                PendingRequests.swap(requests);
+            }
+        }
+
+        // Arms the 200 ms timer that drives HandlePeriodic.
+        void SchedulePeriodic() {
+            Schedule(TDuration::MilliSeconds(200), new TEvents::TEvWakeup());
+        }
+
+    public:
+        static constexpr EActivityType ActorActivityType() {
+            return NAMESERVICE;
+        }
+
+        // The base class receives a reference to this class's own NodeTable
+        // member, which is initialised with a copy of the static table.
+        TInterconnectDynamicNameserver(const TIntrusivePtr<TTableNameserverSetup>& setup, const TDuration& pendingPeriod, ui32 /*resolvePoolId*/ )
+            : TInterconnectNameserverBase<TInterconnectDynamicNameserver>(&TInterconnectDynamicNameserver::StateFunc, NodeTable)
+            , NodeTable(setup->StaticNodeTable)
+            , PendingPeriod(pendingPeriod)
+        {
+            Y_VERIFY(setup->IsEntriesUnique());
+        }
+
+        STFUNC(StateFunc) {
+            try {
+                switch (ev->GetTypeRewrite()) {
+                    HFunc(TEvInterconnect::TEvResolveNode, Handle);
+                    HFunc(TEvResolveAddress, Handle);
+                    HFunc(TEvInterconnect::TEvListNodes, Handle);
+                    HFunc(TEvInterconnect::TEvGetNode, Handle);
+                    HFunc(TEvInterconnect::TEvNodesInfo, HandleUpdate);
+                    CFunc(TEvents::TEvWakeup::EventType, HandlePeriodic);
+                }
+            } catch (...)
{ + LOG_ERROR_IC("ICN09", "%s", CurrentExceptionMessage().c_str()); + } + } + + void HandleMissedNodeId(TEvInterconnect::TEvResolveNode::TPtr& ev, + const TActorContext& ctx, + const TInstant& deadline) { + if (PendingPeriod) { + if (PendingRequests.size() == 0) { + SchedulePeriodic(); + } + PendingRequests.emplace_back(std::move(ev), Min(deadline, Now() + PendingPeriod)); + } else { + LOG_ERROR_IC("ICN07", "Unknown nodeId: %u", ev->Get()->Record.GetNodeId()); + TInterconnectNameserverBase::HandleMissedNodeId(ev, ctx, deadline); + } + } + + void HandleUpdate(TEvInterconnect::TEvNodesInfo::TPtr& ev, + const TActorContext& ctx) { + + auto request = ev->Get(); + LOG_DEBUG_IC("ICN04", "Update TEvNodesInfo with sz: %lu ", request->Nodes.size()); + + bool printInfo = false; + ui32 compactionCount = 0; + + for (const auto& node : request->Nodes) { + printInfo |= IsNodeUpdated(node.NodeId, node.Address, node.Port); + + NodeTable[node.NodeId] = TTableNameserverSetup::TNodeInfo( + node.Address, node.Host, node.ResolveHost, node.Port, node.Location); + + for (auto& pending : PendingRequests) { + if (pending.Request->Get()->Record.GetNodeId() == node.NodeId) { + LOG_DEBUG_IC("ICN05", "Pending nodeId: %u discovered", node.NodeId); + RegisterWithSameMailbox( + CreateResolveActor(node.NodeId, NodeTable[node.NodeId], pending.Request->Sender, SelfId(), pending.Deadline)); + pending.Request.Reset(); + compactionCount++; + } + } + } + + if (printInfo) { + PrintInfo(); + } + + DiscardTimedOutRequests(ctx, compactionCount); + } + + void HandlePeriodic(const TActorContext& ctx) { + DiscardTimedOutRequests(ctx, 0); + if (PendingRequests.size()) { + SchedulePeriodic(); + } + } + }; + + IActor* CreateDynamicNameserver(const TIntrusivePtr<TTableNameserverSetup>& setup, + const TDuration& pendingPeriod, + ui32 poolId) { + return new TInterconnectDynamicNameserver(setup, pendingPeriod, poolId); + } + +} diff --git a/library/cpp/actors/interconnect/interconnect_nameserver_table.cpp 
b/library/cpp/actors/interconnect/interconnect_nameserver_table.cpp
new file mode 100644
index 0000000000..43419bf70d
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_nameserver_table.cpp
@@ -0,0 +1,86 @@
+#include "interconnect.h"
+#include "interconnect_impl.h"
+#include "interconnect_address.h"
+#include "interconnect_nameserver_base.h"
+#include "events_local.h"
+
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/memory_log/memlog.h>
+
+namespace NActors {
+
+    // Nameserver backed by the static node table of its setup object.
+    class TInterconnectNameserverTable: public TInterconnectNameserverBase<TInterconnectNameserverTable> {
+        // keeps the setup (and thus the table referenced by the base class) alive
+        TIntrusivePtr<TTableNameserverSetup> Config;
+
+    public:
+        static constexpr EActivityType ActorActivityType() {
+            return NAMESERVICE;
+        }
+
+        TInterconnectNameserverTable(const TIntrusivePtr<TTableNameserverSetup>& setup, ui32 /*resolvePoolId*/)
+            : TInterconnectNameserverBase<TInterconnectNameserverTable>(&TInterconnectNameserverTable::StateFunc, setup->StaticNodeTable)
+            , Config(setup)
+        {
+            Y_VERIFY(Config->IsEntriesUnique());
+        }
+
+        STFUNC(StateFunc) {
+            try {
+                switch (ev->GetTypeRewrite()) {
+                    HFunc(TEvInterconnect::TEvResolveNode, Handle);
+                    HFunc(TEvResolveAddress, Handle);
+                    HFunc(TEvInterconnect::TEvListNodes, Handle);
+                    HFunc(TEvInterconnect::TEvGetNode, Handle);
+                }
+            } catch (...) {
+                // on error - do nothing
+            }
+        }
+    };
+
+    IActor* CreateNameserverTable(const TIntrusivePtr<TTableNameserverSetup>& setup, ui32 poolId) {
+        return new TInterconnectNameserverTable(setup, poolId);
+    }
+
+    // Returns false when two entries of StaticNodeTable share the same
+    // (Address, Port) pair with a non-empty Address, or the same
+    // (ResolveHost, Port) pair. Each check sorts pointers to the entries by
+    // the relevant key and compares neighbours.
+    bool TTableNameserverSetup::IsEntriesUnique() const {
+        TVector<const TNodeInfo*> infos;
+        infos.reserve(StaticNodeTable.size());
+        for (const auto& x : StaticNodeTable)
+            infos.push_back(&x.second);
+
+        // order by port first, then by address
+        auto CompareAddressLambda =
+            [](const TNodeInfo* left, const TNodeInfo* right) {
+                return left->Port == right->Port ? left->Address < right->Address : left->Port < right->Port;
+            };
+
+        Sort(infos, CompareAddressLambda);
+
+        for (ui32 idx = 1, end = StaticNodeTable.size(); idx < end; ++idx) {
+            const TNodeInfo* left = infos[idx - 1];
+            const TNodeInfo* right = infos[idx];
+            // entries with an empty Address are not treated as duplicates of each other
+            if (left->Address && left->Address == right->Address && left->Port == right->Port)
+                return false;
+        }
+
+        // order by port first, then by resolve host
+        auto CompareHostLambda =
+            [](const TNodeInfo* left, const TNodeInfo* right) {
+                return left->Port == right->Port ? left->ResolveHost < right->ResolveHost : left->Port < right->Port;
+            };
+
+        Sort(infos, CompareHostLambda);
+
+        for (ui32 idx = 1, end = StaticNodeTable.size(); idx < end; ++idx) {
+            const TNodeInfo* left = infos[idx - 1];
+            const TNodeInfo* right = infos[idx];
+            // unlike Address above, an empty ResolveHost is not exempted here
+            if (left->ResolveHost == right->ResolveHost && left->Port == right->Port)
+                return false;
+        }
+
+        return true;
+    }
+
+    TActorId GetNameserviceActorId() {
+        return TActorId(0, "namesvc");
+    }
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_proxy_wrapper.cpp b/library/cpp/actors/interconnect/interconnect_proxy_wrapper.cpp
new file mode 100644
index 0000000000..1c44b4c59b
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_proxy_wrapper.cpp
@@ -0,0 +1,47 @@
+#include "interconnect_proxy_wrapper.h"
+#include "interconnect_tcp_proxy.h"
+#include <library/cpp/actors/interconnect/mock/ic_mock.h>
+
+namespace NActors {
+
+    // Placeholder actor that creates the real interconnect proxy (or its
+    // mock) lazily, on the first event that is not a Poison, and from then
+    // on forwards every event to it.
+    class TInterconnectProxyWrapper : public IActor {
+        TIntrusivePtr<TInterconnectProxyCommon> Common;
+        const ui32 NodeId;
+        TInterconnectMock *Mock;
+        IActor *Proxy = nullptr; // created on demand
+
+    public:
+        TInterconnectProxyWrapper(TIntrusivePtr<TInterconnectProxyCommon> common, ui32 nodeId, TInterconnectMock *mock)
+            : IActor(static_cast<TReceiveFunc>(&TInterconnectProxyWrapper::StateFunc), INTERCONNECT_PROXY_WRAPPER)
+            , Common(std::move(common))
+            , NodeId(nodeId)
+            , Mock(mock)
+        {}
+
+        STFUNC(StateFunc) {
+            // a Poison that arrives before any proxy exists just terminates the wrapper
+            if (ev->GetTypeRewrite() == TEvents::TSystem::Poison && !Proxy) {
+                PassAway();
+            } else {
+                if
(!Proxy) {
+                    // the TCP proxy receives &Proxy in its constructor; for the mock the
+                    // pointer is stored explicitly after registration
+                    IActor *actor = Mock
+                        ? Mock->CreateProxyMock(TActivationContext::ActorSystem()->NodeId, NodeId, Common)
+                        : new TInterconnectProxyTCP(NodeId, Common, &Proxy);
+                    RegisterWithSameMailbox(actor);
+                    if (Mock) {
+                        Proxy = actor;
+                    }
+                    Y_VERIFY(Proxy);
+                }
+                // hand the current event over to the proxy
+                InvokeOtherActor(*Proxy, &IActor::Receive, ev, ctx);
+            }
+        }
+    };
+
+    // Returns a factory that registers a TInterconnectProxyWrapper for the
+    // given node id in pool `poolId` with an HTSwap mailbox. The lambda
+    // captures `common`, `poolId` and `mock` by value.
+    TProxyWrapperFactory CreateProxyWrapperFactory(TIntrusivePtr<TInterconnectProxyCommon> common, ui32 poolId,
+            TInterconnectMock *mock) {
+        return [=](TActorSystem *as, ui32 nodeId) -> TActorId {
+            return as->Register(new TInterconnectProxyWrapper(common, nodeId, mock), TMailboxType::HTSwap, poolId);
+        };
+    }
+
+} // NActors
diff --git a/library/cpp/actors/interconnect/interconnect_proxy_wrapper.h b/library/cpp/actors/interconnect/interconnect_proxy_wrapper.h
new file mode 100644
index 0000000000..e5942351a7
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_proxy_wrapper.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "interconnect_common.h"
+
+#include <library/cpp/actors/core/actorsystem.h>
+
+namespace NActors {
+
+    TProxyWrapperFactory CreateProxyWrapperFactory(TIntrusivePtr<TInterconnectProxyCommon> common, ui32 poolId,
+        class TInterconnectMock *mock = nullptr);
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_resolve.cpp b/library/cpp/actors/interconnect/interconnect_resolve.cpp
new file mode 100644
index 0000000000..14296194df
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_resolve.cpp
@@ -0,0 +1,174 @@
+#include "interconnect.h"
+#include "interconnect_address.h"
+#include "events_local.h"
+
+#include <library/cpp/actors/core/actor_bootstrapped.h>
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/dnsresolver/dnsresolver.h>
+
+namespace NActors {
+
+    using namespace NActors::NDnsResolver;
+
+    class TInterconnectResolveActor : public TActorBootstrapped<TInterconnectResolveActor> {
+    public:
+        TInterconnectResolveActor(
+            const TString& host, ui16 port, ui32
nodeId, const TString& defaultAddress, + const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline) + : Host(host) + , NodeId(nodeId) + , Port(port) + , DefaultAddress(defaultAddress) + , ReplyTo(replyTo) + , ReplyFrom(replyFrom) + , Deadline(deadline) + { } + + TInterconnectResolveActor( + const TString& host, ui16 port, + const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline) + : Host(host) + , Port(port) + , ReplyTo(replyTo) + , ReplyFrom(replyFrom) + , Deadline(deadline) + { } + + static constexpr EActivityType ActorActivityType() { + return NAMESERVICE; + } + + void Bootstrap() { + TMaybe<TString> errorText; + if (auto addr = ExtractDefaultAddr(errorText)) { + return SendAddrAndDie(std::move(addr)); + } + + if (errorText) { + SendErrorAndDie(*errorText); + } + + auto now = TActivationContext::Now(); + if (Deadline < now) { + SendErrorAndDie("Deadline"); + return; + } + + Send(MakeDnsResolverActorId(), + new TEvDns::TEvGetAddr(Host, AF_UNSPEC), + IEventHandle::FlagTrackDelivery); + + if (Deadline != TInstant::Max()) { + Schedule(Deadline, new TEvents::TEvWakeup); + } + + Become(&TThis::StateWork); + } + + STRICT_STFUNC(StateWork, { + sFunc(TEvents::TEvWakeup, HandleTimeout); + sFunc(TEvents::TEvUndelivered, HandleUndelivered); + hFunc(TEvDns::TEvGetAddrResult, Handle); + }); + + void HandleTimeout() { + SendErrorAndDie("Deadline"); + } + + void HandleUndelivered() { + SendErrorAndDie("Dns resolver is unavailable"); + } + + void Handle(TEvDns::TEvGetAddrResult::TPtr& ev) { + if (auto addr = ExtractAddr(ev->Get())) { + return SendAddrAndDie(std::move(addr)); + } + + SendErrorAndDie(ev->Get()->ErrorText); + } + + void SendAddrAndDie(NAddr::IRemoteAddrPtr addr) { + if (NodeId) { + auto reply = new TEvLocalNodeInfo; + reply->NodeId = *NodeId; + reply->Address = std::move(addr); + TActivationContext::Send(new IEventHandle(ReplyTo, ReplyFrom, reply)); + } else { + auto reply = new TEvAddressInfo; + reply->Address = std::move(addr); + 
TActivationContext::Send(new IEventHandle(ReplyTo, ReplyFrom, reply)); + } + PassAway(); + } + + void SendErrorAndDie(const TString& errorText) { + auto *event = new TEvResolveError; + event->Explain = errorText; + TActivationContext::Send(new IEventHandle(ReplyTo, ReplyFrom, event)); + PassAway(); + } + + NAddr::IRemoteAddrPtr ExtractAddr(TEvDns::TEvGetAddrResult* msg) { + if (msg->Status == 0) { + if (msg->IsV6()) { + struct sockaddr_in6 sin6; + Zero(sin6); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = msg->GetAddrV6(); + sin6.sin6_port = HostToInet(Port); + return MakeHolder<NAddr::TIPv6Addr>(sin6); + } + + if (msg->IsV4()) { + return MakeHolder<NAddr::TIPv4Addr>(TIpAddress(msg->GetAddrV4().s_addr, Port)); + } + + Y_FAIL("Unexpected result address family"); + } + + return nullptr; + } + + NAddr::IRemoteAddrPtr ExtractDefaultAddr(TMaybe<TString>& errorText) { + if (DefaultAddress) { + NInterconnect::TAddress address(DefaultAddress.data(), Port); + + switch (address.GetFamily()) { + case AF_INET: + return MakeHolder<NAddr::TIPv4Addr>(*(sockaddr_in*)address.SockAddr()); + case AF_INET6: + return MakeHolder<NAddr::TIPv6Addr>(*(sockaddr_in6*)address.SockAddr()); + default: + errorText = "Unsupported default address: " + DefaultAddress; + break; + } + } + + return nullptr; + } + + private: + const TString Host; + const std::optional<ui32> NodeId; + const ui16 Port; + const TString DefaultAddress; + const TActorId ReplyTo; + const TActorId ReplyFrom; + const TInstant Deadline; + }; + + IActor* CreateResolveActor( + const TString& host, ui16 port, ui32 nodeId, const TString& defaultAddress, + const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline) + { + return new TInterconnectResolveActor(host, port, nodeId, defaultAddress, replyTo, replyFrom, deadline); + } + + IActor* CreateResolveActor( + const TString& host, ui16 port, + const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline) + { + return new TInterconnectResolveActor(host, port, 
replyTo, replyFrom, deadline); + } + +} // namespace NActors diff --git a/library/cpp/actors/interconnect/interconnect_stream.cpp b/library/cpp/actors/interconnect/interconnect_stream.cpp new file mode 100644 index 0000000000..158ebc9e1d --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_stream.cpp @@ -0,0 +1,628 @@ +#include "interconnect_stream.h" +#include "logging.h" +#include <library/cpp/openssl/init/init.h> +#include <util/network/socket.h> +#include <openssl/ssl.h> +#include <openssl/err.h> +#include <openssl/pem.h> + +#if defined(_win_) +#include <util/system/file.h> +#define SOCK_NONBLOCK 0 +#elif defined(_darwin_) +#define SOCK_NONBLOCK 0 +#else +#include <sys/un.h> +#include <sys/stat.h> +#endif //_win_ + +#if !defined(_win_) +#include <sys/ioctl.h> +#endif + +#include <cerrno> + +namespace NInterconnect { + namespace { + inline int + LastSocketError() { +#if defined(_win_) + return WSAGetLastError(); +#else + return errno; +#endif + } + } + + TSocket::TSocket(SOCKET fd) + : Descriptor(fd) + { + } + + TSocket::~TSocket() { + if (Descriptor == INVALID_SOCKET) { + return; + } + + auto const result = ::closesocket(Descriptor); + if (result == 0) + return; + switch (LastSocketError()) { + case EBADF: + Y_FAIL("Close bad descriptor"); + case EINTR: + break; + case EIO: + Y_FAIL("EIO"); + default: + Y_FAIL("It's something unexpected"); + } + } + + int TSocket::GetDescriptor() { + return Descriptor; + } + + int + TSocket::Bind(const TAddress& addr) const { + const auto ret = ::bind(Descriptor, addr.SockAddr(), addr.Size()); + if (ret < 0) + return -LastSocketError(); + + return 0; + } + + int + TSocket::Shutdown(int how) const { + const auto ret = ::shutdown(Descriptor, how); + if (ret < 0) + return -LastSocketError(); + + return 0; + } + + int TSocket::GetConnectStatus() const { + int err = 0; + socklen_t len = sizeof(err); + if (getsockopt(Descriptor, SOL_SOCKET, SO_ERROR, reinterpret_cast<char*>(&err), &len) == -1) { + err = LastSocketError(); 
+ } + return err; + } + + ///////////////////////////////////////////////////////////////// + + TIntrusivePtr<TStreamSocket> TStreamSocket::Make(int domain) { + const SOCKET res = ::socket(domain, SOCK_STREAM | SOCK_NONBLOCK, 0); + if (res == -1) { + const int err = LastSocketError(); + Y_VERIFY(err != EMFILE && err != ENFILE); + } + return MakeIntrusive<TStreamSocket>(res); + } + + TStreamSocket::TStreamSocket(SOCKET fd) + : TSocket(fd) + { + } + + ssize_t + TStreamSocket::Send(const void* msg, size_t len, TString* /*err*/) const { + const auto ret = ::send(Descriptor, static_cast<const char*>(msg), int(len), 0); + if (ret < 0) + return -LastSocketError(); + + return ret; + } + + ssize_t + TStreamSocket::Recv(void* buf, size_t len, TString* /*err*/) const { + const auto ret = ::recv(Descriptor, static_cast<char*>(buf), int(len), 0); + if (ret < 0) + return -LastSocketError(); + + return ret; + } + + ssize_t + TStreamSocket::WriteV(const struct iovec* iov, int iovcnt) const { +#ifndef _win_ + const auto ret = ::writev(Descriptor, iov, iovcnt); + if (ret < 0) + return -LastSocketError(); + return ret; +#else + Y_FAIL("WriteV() unsupported on Windows"); +#endif + } + + ssize_t + TStreamSocket::ReadV(const struct iovec* iov, int iovcnt) const { +#ifndef _win_ + const auto ret = ::readv(Descriptor, iov, iovcnt); + if (ret < 0) + return -LastSocketError(); + return ret; +#else + Y_FAIL("ReadV() unsupported on Windows"); +#endif + } + + ssize_t TStreamSocket::GetUnsentQueueSize() const { + int num = -1; +#ifndef _win_ // we have no means to determine output queue size on Windows + if (ioctl(Descriptor, TIOCOUTQ, &num) == -1) { + num = -1; + } +#endif + return num; + } + + int + TStreamSocket::Connect(const TAddress& addr) const { + const auto ret = ::connect(Descriptor, addr.SockAddr(), addr.Size()); + if (ret < 0) + return -LastSocketError(); + + return ret; + } + + int + TStreamSocket::Connect(const NAddr::IRemoteAddr* addr) const { + const auto ret = 
::connect(Descriptor, addr->Addr(), addr->Len()); + if (ret < 0) + return -LastSocketError(); + + return ret; + } + + int + TStreamSocket::Listen(int backlog) const { + const auto ret = ::listen(Descriptor, backlog); + if (ret < 0) + return -LastSocketError(); + + return ret; + } + + int + TStreamSocket::Accept(TAddress& acceptedAddr) const { + socklen_t acceptedSize = sizeof(::sockaddr_in6); + const auto ret = ::accept(Descriptor, acceptedAddr.SockAddr(), &acceptedSize); + if (ret == INVALID_SOCKET) + return -LastSocketError(); + + return ret; + } + + void + TStreamSocket::SetSendBufferSize(i32 len) const { + (void)SetSockOpt(Descriptor, SOL_SOCKET, SO_SNDBUF, len); + } + + ui32 TStreamSocket::GetSendBufferSize() const { + ui32 res = 0; + CheckedGetSockOpt(Descriptor, SOL_SOCKET, SO_SNDBUF, res, "SO_SNDBUF"); + return res; + } + + ////////////////////////////////////////////////////// + + TDatagramSocket::TPtr TDatagramSocket::Make(int domain) { + const SOCKET res = ::socket(domain, SOCK_DGRAM, 0); + if (res == -1) { + const int err = LastSocketError(); + Y_VERIFY(err != EMFILE && err != ENFILE); + } + return std::make_shared<TDatagramSocket>(res); + } + + TDatagramSocket::TDatagramSocket(SOCKET fd) + : TSocket(fd) + { + } + + ssize_t + TDatagramSocket::SendTo(const void* msg, size_t len, const TAddress& toAddr) const { + const auto ret = ::sendto(Descriptor, static_cast<const char*>(msg), int(len), 0, toAddr.SockAddr(), toAddr.Size()); + if (ret < 0) + return -LastSocketError(); + + return ret; + } + + ssize_t + TDatagramSocket::RecvFrom(void* buf, size_t len, TAddress& fromAddr) const { + socklen_t fromSize = sizeof(::sockaddr_in6); + const auto ret = ::recvfrom(Descriptor, static_cast<char*>(buf), int(len), 0, fromAddr.SockAddr(), &fromSize); + if (ret < 0) + return -LastSocketError(); + + return ret; + } + + + // deleter for SSL objects + struct TDeleter { + void operator ()(BIO *bio) const { + BIO_free(bio); + } + + void operator ()(X509 *x509) const { + 
X509_free(x509); + } + + void operator ()(RSA *rsa) const { + RSA_free(rsa); + } + + void operator ()(SSL_CTX *ctx) const { + SSL_CTX_free(ctx); + } + }; + + class TSecureSocketContext::TImpl { + std::unique_ptr<SSL_CTX, TDeleter> Ctx; + + public: + TImpl(const TString& certificate, const TString& privateKey, const TString& caFilePath, + const TString& ciphers) { + int ret; + InitOpenSSL(); +#if OPENSSL_VERSION_NUMBER < 0x10100000L + Ctx.reset(SSL_CTX_new(TLSv1_2_method())); + Y_VERIFY(Ctx, "SSL_CTX_new() failed"); +#else + Ctx.reset(SSL_CTX_new(TLS_method())); + Y_VERIFY(Ctx, "SSL_CTX_new() failed"); + ret = SSL_CTX_set_min_proto_version(Ctx.get(), TLS1_2_VERSION); + Y_VERIFY(ret == 1, "failed to set min proto version"); + ret = SSL_CTX_set_max_proto_version(Ctx.get(), TLS1_2_VERSION); + Y_VERIFY(ret == 1, "failed to set max proto version"); +#endif + SSL_CTX_set_verify(Ctx.get(), SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, &Verify); + SSL_CTX_set_mode(*this, SSL_MODE_ENABLE_PARTIAL_WRITE | SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER); + + // apply certificates in SSL context + if (certificate) { + std::unique_ptr<BIO, TDeleter> bio(BIO_new_mem_buf(certificate.data(), certificate.size())); + Y_VERIFY(bio); + + // first certificate in the chain is expected to be a leaf + std::unique_ptr<X509, TDeleter> cert(PEM_read_bio_X509(bio.get(), nullptr, nullptr, nullptr)); + Y_VERIFY(cert, "failed to parse certificate"); + ret = SSL_CTX_use_certificate(Ctx.get(), cert.get()); + Y_VERIFY(ret == 1); + + // loading additional certificates in the chain, if any + while(true) { + X509 *ca = PEM_read_bio_X509(bio.get(), nullptr, nullptr, nullptr); + if (ca == nullptr) { + break; + } + ret = SSL_CTX_add0_chain_cert(Ctx.get(), ca); + Y_VERIFY(ret == 1); + // we must not free memory if certificate was added successfully by SSL_CTX_add0_chain_cert + } + } + if (privateKey) { + std::unique_ptr<BIO, TDeleter> bio(BIO_new_mem_buf(privateKey.data(), privateKey.size())); + Y_VERIFY(bio); + 
std::unique_ptr<RSA, TDeleter> pkey(PEM_read_bio_RSAPrivateKey(bio.get(), nullptr, nullptr, nullptr)); + Y_VERIFY(pkey); + ret = SSL_CTX_use_RSAPrivateKey(Ctx.get(), pkey.get()); + Y_VERIFY(ret == 1); + } + if (caFilePath) { + ret = SSL_CTX_load_verify_locations(Ctx.get(), caFilePath.data(), nullptr); + Y_VERIFY(ret == 1); + } + + int success = SSL_CTX_set_cipher_list(Ctx.get(), ciphers ? ciphers.data() : "AES128-GCM-SHA256"); + Y_VERIFY(success, "failed to set cipher list"); + } + + operator SSL_CTX*() const { + return Ctx.get(); + } + + static int GetExIndex() { + static int index = SSL_get_ex_new_index(0, nullptr, nullptr, nullptr, nullptr); + return index; + } + + private: + static int Verify(int preverify, X509_STORE_CTX *ctx) { + if (!preverify) { + X509 *badCert = X509_STORE_CTX_get_current_cert(ctx); + int err = X509_STORE_CTX_get_error(ctx); + int depth = X509_STORE_CTX_get_error_depth(ctx); + SSL *ssl = static_cast<SSL*>(X509_STORE_CTX_get_ex_data(ctx, SSL_get_ex_data_X509_STORE_CTX_idx())); + TString *errp = static_cast<TString*>(SSL_get_ex_data(ssl, GetExIndex())); + char buffer[1024]; + X509_NAME_oneline(X509_get_subject_name(badCert), buffer, sizeof(buffer)); + TStringBuilder s; + s << "Error during certificate validation" + << " error# " << X509_verify_cert_error_string(err) + << " depth# " << depth + << " cert# " << buffer; + if (err == X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT) { + X509_NAME_oneline(X509_get_issuer_name(badCert), buffer, sizeof(buffer)); + s << " issuer# " << buffer; + } + *errp = s; + } + return preverify; + } + }; + + TSecureSocketContext::TSecureSocketContext(const TString& certificate, const TString& privateKey, + const TString& caFilePath, const TString& ciphers) + : Impl(new TImpl(certificate, privateKey, caFilePath, ciphers)) + {} + + TSecureSocketContext::~TSecureSocketContext() + {} + + class TSecureSocket::TImpl { + SSL *Ssl; + TString ErrorDescription; + bool WantRead_ = false; + bool WantWrite_ = false; + + public: + 
TImpl(SSL_CTX *ctx, int fd) + : Ssl(SSL_new(ctx)) + { + Y_VERIFY(Ssl, "SSL_new() failed"); + SSL_set_fd(Ssl, fd); + SSL_set_ex_data(Ssl, TSecureSocketContext::TImpl::GetExIndex(), &ErrorDescription); + } + + ~TImpl() { + SSL_free(Ssl); + } + + TString GetErrorStack() { + if (ErrorDescription) { + return ErrorDescription; + } + std::unique_ptr<BIO, int(*)(BIO*)> mem(BIO_new(BIO_s_mem()), BIO_free); + ERR_print_errors(mem.get()); + char *p = nullptr; + auto len = BIO_get_mem_data(mem.get(), &p); + return TString(p, len); + } + + EStatus ConvertResult(int res, TString& err) { + switch (res) { + case SSL_ERROR_NONE: + return EStatus::SUCCESS; + + case SSL_ERROR_WANT_READ: + return EStatus::WANT_READ; + + case SSL_ERROR_WANT_WRITE: + return EStatus::WANT_WRITE; + + case SSL_ERROR_SYSCALL: + err = TStringBuilder() << "syscall error: " << strerror(LastSocketError()) << ": " << GetErrorStack(); + break; + + case SSL_ERROR_ZERO_RETURN: + err = "TLS negotiation failed"; + break; + + case SSL_ERROR_SSL: + err = "SSL error: " + GetErrorStack(); + break; + + default: + err = "unknown OpenSSL error"; + break; + } + return EStatus::ERROR; + } + + enum EConnectState { + CONNECT, + SHUTDOWN, + READ, + } ConnectState = EConnectState::CONNECT; + + EStatus Establish(bool server, bool authOnly, TString& err) { + switch (ConnectState) { + case EConnectState::CONNECT: { + auto callback = server ? 
SSL_accept : SSL_connect; + const EStatus status = ConvertResult(SSL_get_error(Ssl, callback(Ssl)), err); + if (status != EStatus::SUCCESS || !authOnly) { + return status; + } + ConnectState = EConnectState::SHUTDOWN; + [[fallthrough]]; + } + + case EConnectState::SHUTDOWN: { + const int res = SSL_shutdown(Ssl); + if (res == 1) { + return EStatus::SUCCESS; + } else if (res != 0) { + return ConvertResult(SSL_get_error(Ssl, res), err); + } + ConnectState = EConnectState::READ; + [[fallthrough]]; + } + + case EConnectState::READ: { + char data[256]; + size_t numRead = 0; + const int res = SSL_get_error(Ssl, SSL_read_ex(Ssl, data, sizeof(data), &numRead)); + if (res == SSL_ERROR_ZERO_RETURN) { + return EStatus::SUCCESS; + } else if (res != SSL_ERROR_NONE) { + return ConvertResult(res, err); + } else if (numRead) { + err = "non-zero return from SSL_read_ex: " + ToString(numRead); + return EStatus::ERROR; + } else { + return EStatus::SUCCESS; + } + } + } + Y_FAIL(); + } + + std::optional<std::pair<const void*, size_t>> BlockedSend; + + ssize_t Send(const void* msg, size_t len, TString *err) { + Y_VERIFY(!BlockedSend || *BlockedSend == std::make_pair(msg, len)); + const ssize_t res = Operate(msg, len, &SSL_write_ex, err); + if (res == -EAGAIN) { + BlockedSend.emplace(msg, len); + } else { + BlockedSend.reset(); + } + return res; + } + + std::optional<std::pair<void*, size_t>> BlockedReceive; + + ssize_t Recv(void* msg, size_t len, TString *err) { + Y_VERIFY(!BlockedReceive || *BlockedReceive == std::make_pair(msg, len)); + const ssize_t res = Operate(msg, len, &SSL_read_ex, err); + if (res == -EAGAIN) { + BlockedReceive.emplace(msg, len); + } else { + BlockedReceive.reset(); + } + return res; + } + + TString GetCipherName() const { + return SSL_get_cipher_name(Ssl); + } + + int GetCipherBits() const { + return SSL_get_cipher_bits(Ssl, nullptr); + } + + TString GetProtocolName() const { + return SSL_get_cipher_version(Ssl); + } + + TString GetPeerCommonName() const { + 
TString res; + if (X509 *cert = SSL_get_peer_certificate(Ssl)) { + char buffer[256]; + memset(buffer, 0, sizeof(buffer)); + if (X509_NAME *name = X509_get_subject_name(cert)) { + X509_NAME_get_text_by_NID(name, NID_commonName, buffer, sizeof(buffer)); + } + X509_free(cert); + res = TString(buffer, strnlen(buffer, sizeof(buffer))); + } + return res; + } + + bool WantRead() const { + return WantRead_; + } + + bool WantWrite() const { + return WantWrite_; + } + + private: + template<typename TBuffer, typename TOp> + ssize_t Operate(TBuffer* buffer, size_t len, TOp&& op, TString *err) { + WantRead_ = WantWrite_ = false; + size_t processed = 0; + int ret = op(Ssl, buffer, len, &processed); + if (ret == 1) { + return processed; + } + switch (const int status = SSL_get_error(Ssl, ret)) { + case SSL_ERROR_ZERO_RETURN: + return 0; + + case SSL_ERROR_WANT_READ: + WantRead_ = true; + return -EAGAIN; + + case SSL_ERROR_WANT_WRITE: + WantWrite_ = true; + return -EAGAIN; + + case SSL_ERROR_SYSCALL: + return -LastSocketError(); + + case SSL_ERROR_SSL: + if (err) { + *err = GetErrorStack(); + } + return -EPROTO; + + default: + Y_FAIL("unexpected SSL_get_error() status# %d", status); + } + } + }; + + TSecureSocket::TSecureSocket(TStreamSocket& socket, TSecureSocketContext::TPtr context) + : TStreamSocket(socket.ReleaseDescriptor()) + , Context(std::move(context)) + , Impl(new TImpl(*Context->Impl, Descriptor)) + {} + + TSecureSocket::~TSecureSocket() + {} + + TSecureSocket::EStatus TSecureSocket::Establish(bool server, bool authOnly, TString& err) const { + return Impl->Establish(server, authOnly, err); + } + + TIntrusivePtr<TStreamSocket> TSecureSocket::Detach() { + return MakeIntrusive<TStreamSocket>(ReleaseDescriptor()); + } + + ssize_t TSecureSocket::Send(const void* msg, size_t len, TString *err) const { + return Impl->Send(msg, len, err); + } + + ssize_t TSecureSocket::Recv(void* msg, size_t len, TString *err) const { + return Impl->Recv(msg, len, err); + } + + ssize_t 
TSecureSocket::WriteV(const struct iovec* /*iov*/, int /*iovcnt*/) const { + Y_FAIL("unsupported on SSL sockets"); + } + + ssize_t TSecureSocket::ReadV(const struct iovec* /*iov*/, int /*iovcnt*/) const { + Y_FAIL("unsupported on SSL sockets"); + } + + TString TSecureSocket::GetCipherName() const { + return Impl->GetCipherName(); + } + + int TSecureSocket::GetCipherBits() const { + return Impl->GetCipherBits(); + } + + TString TSecureSocket::GetProtocolName() const { + return Impl->GetProtocolName(); + } + + TString TSecureSocket::GetPeerCommonName() const { + return Impl->GetPeerCommonName(); + } + + bool TSecureSocket::WantRead() const { + return Impl->WantRead(); + } + + bool TSecureSocket::WantWrite() const { + return Impl->WantWrite(); + } + +} diff --git a/library/cpp/actors/interconnect/interconnect_stream.h b/library/cpp/actors/interconnect/interconnect_stream.h new file mode 100644 index 0000000000..074adc6e74 --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_stream.h @@ -0,0 +1,131 @@ +#pragma once + +#include <util/generic/string.h> +#include <util/generic/noncopyable.h> +#include <util/network/address.h> +#include <util/network/init.h> +#include <util/system/defaults.h> + +#include "poller.h" + +#include "interconnect_address.h" + +#include <memory> + +#include <sys/uio.h> + +namespace NInterconnect { + class TSocket: public NActors::TSharedDescriptor, public TNonCopyable { + protected: + TSocket(SOCKET fd); + + virtual ~TSocket() override; + + SOCKET Descriptor; + + virtual int GetDescriptor() override; + + private: + friend class TSecureSocket; + + SOCKET ReleaseDescriptor() { + return std::exchange(Descriptor, INVALID_SOCKET); + } + + public: + operator SOCKET() const { + return Descriptor; + } + + int Bind(const TAddress& addr) const; + int Shutdown(int how) const; + int GetConnectStatus() const; + }; + + class TStreamSocket: public TSocket { + public: + TStreamSocket(SOCKET fd); + + static TIntrusivePtr<TStreamSocket> Make(int 
domain); + + virtual ssize_t Send(const void* msg, size_t len, TString *err = nullptr) const; + virtual ssize_t Recv(void* buf, size_t len, TString *err = nullptr) const; + + virtual ssize_t WriteV(const struct iovec* iov, int iovcnt) const; + virtual ssize_t ReadV(const struct iovec* iov, int iovcnt) const; + + int Connect(const TAddress& addr) const; + int Connect(const NAddr::IRemoteAddr* addr) const; + int Listen(int backlog) const; + int Accept(TAddress& acceptedAddr) const; + + ssize_t GetUnsentQueueSize() const; + + void SetSendBufferSize(i32 len) const; + ui32 GetSendBufferSize() const; + }; + + class TSecureSocketContext { + class TImpl; + THolder<TImpl> Impl; + + friend class TSecureSocket; + + public: + TSecureSocketContext(const TString& certificate, const TString& privateKey, const TString& caFilePath, + const TString& ciphers); + ~TSecureSocketContext(); + + public: + using TPtr = std::shared_ptr<TSecureSocketContext>; + }; + + class TSecureSocket : public TStreamSocket { + TSecureSocketContext::TPtr Context; + + class TImpl; + THolder<TImpl> Impl; + + public: + enum class EStatus { + SUCCESS, + ERROR, + WANT_READ, + WANT_WRITE, + }; + + public: + TSecureSocket(TStreamSocket& socket, TSecureSocketContext::TPtr context); + ~TSecureSocket(); + + EStatus Establish(bool server, bool authOnly, TString& err) const; + TIntrusivePtr<TStreamSocket> Detach(); + + ssize_t Send(const void* msg, size_t len, TString *err) const override; + ssize_t Recv(void* msg, size_t len, TString *err) const override; + + ssize_t WriteV(const struct iovec* iov, int iovcnt) const override; + ssize_t ReadV(const struct iovec* iov, int iovcnt) const override; + + TString GetCipherName() const; + int GetCipherBits() const; + TString GetProtocolName() const; + TString GetPeerCommonName() const; + + bool WantRead() const; + bool WantWrite() const; + }; + + class TDatagramSocket: public TSocket { + public: + typedef std::shared_ptr<TDatagramSocket> TPtr; + + TDatagramSocket(SOCKET fd); 
+ + static TPtr Make(int domain); + + ssize_t SendTo(const void* msg, size_t len, const TAddress& toAddr) const; + ssize_t RecvFrom(void* buf, size_t len, TAddress& fromAddr) const; + }; + +} diff --git a/library/cpp/actors/interconnect/interconnect_tcp_input_session.cpp b/library/cpp/actors/interconnect/interconnect_tcp_input_session.cpp new file mode 100644 index 0000000000..0abe9fe659 --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_tcp_input_session.cpp @@ -0,0 +1,476 @@ +#include "interconnect_tcp_session.h" +#include "interconnect_tcp_proxy.h" +#include <library/cpp/actors/core/probes.h> +#include <library/cpp/actors/util/datetime.h> + +namespace NActors { + LWTRACE_USING(ACTORLIB_PROVIDER); + + TInputSessionTCP::TInputSessionTCP(const TActorId& sessionId, TIntrusivePtr<NInterconnect::TStreamSocket> socket, + TIntrusivePtr<TReceiveContext> context, TInterconnectProxyCommon::TPtr common, + std::shared_ptr<IInterconnectMetrics> metrics, ui32 nodeId, ui64 lastConfirmed, + TDuration deadPeerTimeout, TSessionParams params) + : SessionId(sessionId) + , Socket(std::move(socket)) + , Context(std::move(context)) + , Common(std::move(common)) + , NodeId(nodeId) + , Params(std::move(params)) + , ConfirmedByInput(lastConfirmed) + , Metrics(std::move(metrics)) + , DeadPeerTimeout(deadPeerTimeout) + { + Y_VERIFY(Context); + Y_VERIFY(Socket); + Y_VERIFY(SessionId); + + AtomicSet(Context->PacketsReadFromSocket, 0); + + Metrics->SetClockSkewMicrosec(0); + + Context->UpdateState = EUpdateState::NONE; + + // ensure that we do not spawn new session while the previous one is still alive + TAtomicBase sessions = AtomicIncrement(Context->NumInputSessions); + Y_VERIFY(sessions == 1, "sessions# %" PRIu64, ui64(sessions)); + } + + void TInputSessionTCP::Bootstrap() { + SetPrefix(Sprintf("InputSession %s [node %" PRIu32 "]", SelfId().ToString().data(), NodeId)); + Become(&TThis::WorkingState, DeadPeerTimeout, new TEvCheckDeadPeer); + LOG_DEBUG_IC_SESSION("ICIS01", 
"InputSession created"); + LastReceiveTimestamp = TActivationContext::Now(); + ReceiveData(); + } + + void TInputSessionTCP::CloseInputSession() { + CloseInputSessionRequested = true; + ReceiveData(); + } + + void TInputSessionTCP::Handle(TEvPollerReady::TPtr ev) { + if (Context->ReadPending) { + Metrics->IncUsefulReadWakeups(); + } else if (!ev->Cookie) { + Metrics->IncSpuriousReadWakeups(); + } + Context->ReadPending = false; + ReceiveData(); + if (Params.Encryption && Context->WriteBlockedByFullSendBuffer && !ev->Cookie) { + Send(SessionId, ev->Release().Release(), 0, 1); + } + } + + void TInputSessionTCP::Handle(TEvPollerRegisterResult::TPtr ev) { + PollerToken = std::move(ev->Get()->PollerToken); + ReceiveData(); + } + + void TInputSessionTCP::HandleResumeReceiveData() { + ReceiveData(); + } + + void TInputSessionTCP::ReceiveData() { + TTimeLimit limit(GetMaxCyclesPerEvent()); + ui64 numDataBytes = 0; + const size_t headerLen = Params.UseModernFrame ? sizeof(TTcpPacketHeader_v2) : sizeof(TTcpPacketHeader_v1); + + LOG_DEBUG_IC_SESSION("ICIS02", "ReceiveData called"); + + for (int iteration = 0; Socket; ++iteration) { + if (iteration && limit.CheckExceeded()) { + // we have hit processing time limit for this message, send notification to resume processing a bit later + Send(SelfId(), new TEvResumeReceiveData); + break; + } + + switch (State) { + case EState::HEADER: + if (IncomingData.GetSize() < headerLen) { + break; + } else { + ProcessHeader(headerLen); + } + continue; + + case EState::PAYLOAD: + if (!IncomingData) { + break; + } else { + ProcessPayload(numDataBytes); + } + continue; + } + + // if we have reached this point, it means that we do not have enough data in read buffer; try to obtain some + if (!ReadMore()) { + // we have no data from socket, so we have some free time to spend -- preallocate buffers using this time + PreallocateBuffers(); + break; + } + } + + // calculate ping time + auto it = std::min_element(PingQ.begin(), PingQ.end()); + const 
TDuration ping = it != PingQ.end() ? *it : TDuration::Zero(); + + // send update to main session actor if something valuable has changed + if (!UpdateFromInputSession) { + UpdateFromInputSession = MakeHolder<TEvUpdateFromInputSession>(ConfirmedByInput, numDataBytes, ping); + } else { + Y_VERIFY(ConfirmedByInput >= UpdateFromInputSession->ConfirmedByInput); + UpdateFromInputSession->ConfirmedByInput = ConfirmedByInput; + UpdateFromInputSession->NumDataBytes += numDataBytes; + UpdateFromInputSession->Ping = Min(UpdateFromInputSession->Ping, ping); + } + + for (;;) { + EUpdateState state = Context->UpdateState; + EUpdateState next; + + // calculate next state + switch (state) { + case EUpdateState::NONE: + case EUpdateState::CONFIRMING: + // we have no inflight messages to session actor, we will issue one a bit later + next = EUpdateState::INFLIGHT; + break; + + case EUpdateState::INFLIGHT: + case EUpdateState::INFLIGHT_AND_PENDING: + // we already have inflight message, so we will keep pending message and session actor will issue + // TEvConfirmUpdate to kick processing + next = EUpdateState::INFLIGHT_AND_PENDING; + break; + } + + if (Context->UpdateState.compare_exchange_weak(state, next)) { + switch (next) { + case EUpdateState::INFLIGHT: + Send(SessionId, UpdateFromInputSession.Release()); + break; + + case EUpdateState::INFLIGHT_AND_PENDING: + Y_VERIFY(UpdateFromInputSession); + break; + + default: + Y_FAIL("unexpected state"); + } + break; + } + } + } + + void TInputSessionTCP::ProcessHeader(size_t headerLen) { + const bool success = IncomingData.ExtractFrontPlain(Header.Data, headerLen); + Y_VERIFY(success); + if (Params.UseModernFrame) { + PayloadSize = Header.v2.PayloadLength; + HeaderSerial = Header.v2.Serial; + HeaderConfirm = Header.v2.Confirm; + if (!Params.Encryption) { + ChecksumExpected = std::exchange(Header.v2.Checksum, 0); + Checksum = Crc32cExtendMSanCompatible(0, &Header.v2, sizeof(Header.v2)); // start calculating checksum now + if (!PayloadSize 
&& Checksum != ChecksumExpected) { + LOG_ERROR_IC_SESSION("ICIS10", "payload checksum error"); + return ReestablishConnection(TDisconnectReason::ChecksumError()); + } + } + } else if (!Header.v1.Check()) { + LOG_ERROR_IC_SESSION("ICIS03", "header checksum error"); + return ReestablishConnection(TDisconnectReason::ChecksumError()); + } else { + PayloadSize = Header.v1.DataSize; + HeaderSerial = Header.v1.Serial; + HeaderConfirm = Header.v1.Confirm; + ChecksumExpected = Header.v1.PayloadCRC32; + Checksum = 0; + } + if (PayloadSize >= 65536) { + LOG_CRIT_IC_SESSION("ICIS07", "payload is way too big"); + return DestroySession(TDisconnectReason::FormatError()); + } + if (ConfirmedByInput < HeaderConfirm) { + ConfirmedByInput = HeaderConfirm; + if (AtomicGet(Context->ControlPacketId) <= HeaderConfirm && !NewPingProtocol) { + ui64 sendTime = AtomicGet(Context->ControlPacketSendTimer); + TDuration duration = CyclesToDuration(GetCycleCountFast() - sendTime); + const auto durationUs = duration.MicroSeconds(); + Metrics->UpdateLegacyPingTimeHist(durationUs); + PingQ.push_back(duration); + if (PingQ.size() > 16) { + PingQ.pop_front(); + } + AtomicSet(Context->ControlPacketId, 0ULL); + } + } + if (PayloadSize) { + const ui64 expected = Context->GetLastProcessedPacketSerial() + 1; + if (HeaderSerial == 0 || HeaderSerial > expected) { + LOG_CRIT_IC_SESSION("ICIS06", "packet serial %" PRIu64 ", but %" PRIu64 " expected", HeaderSerial, expected); + return DestroySession(TDisconnectReason::FormatError()); + } + IgnorePayload = HeaderSerial != expected; + State = EState::PAYLOAD; + } else if (HeaderSerial & TTcpPacketBuf::PingRequestMask) { + Send(SessionId, new TEvProcessPingRequest(HeaderSerial & ~TTcpPacketBuf::PingRequestMask)); + } else if (HeaderSerial & TTcpPacketBuf::PingResponseMask) { + const ui64 sent = HeaderSerial & ~TTcpPacketBuf::PingResponseMask; + const ui64 received = GetCycleCountFast(); + HandlePingResponse(CyclesToDuration(received - sent)); + } else if 
(HeaderSerial & TTcpPacketBuf::ClockMask) { + HandleClock(TInstant::MicroSeconds(HeaderSerial & ~TTcpPacketBuf::ClockMask)); + } + } + + void TInputSessionTCP::ProcessPayload(ui64& numDataBytes) { + const size_t numBytes = Min(PayloadSize, IncomingData.GetSize()); + IncomingData.ExtractFront(numBytes, &Payload); + numDataBytes += numBytes; + PayloadSize -= numBytes; + if (PayloadSize) { + return; // there is still some data to receive in the Payload rope + } + State = EState::HEADER; // we'll continue with header next time + if (!Params.UseModernFrame || !Params.Encryption) { // see if we are checksumming packet body + for (const auto&& [data, size] : Payload) { + Checksum = Crc32cExtendMSanCompatible(Checksum, data, size); + } + if (Checksum != ChecksumExpected) { // validate payload checksum + LOG_ERROR_IC_SESSION("ICIS04", "payload checksum error"); + return ReestablishConnection(TDisconnectReason::ChecksumError()); + } + } + if (Y_UNLIKELY(IgnorePayload)) { + return; + } + if (!Context->AdvanceLastProcessedPacketSerial()) { + return DestroySession(TDisconnectReason::NewSession()); + } + + while (Payload && Socket) { + // extract channel part header from the payload stream + TChannelPart part; + if (!Payload.ExtractFrontPlain(&part, sizeof(part))) { + LOG_CRIT_IC_SESSION("ICIS14", "missing TChannelPart header in payload"); + return DestroySession(TDisconnectReason::FormatError()); + } + if (!part.Size) { // bogus frame + continue; + } else if (Payload.GetSize() < part.Size) { + LOG_CRIT_IC_SESSION("ICIS08", "payload format error ChannelPart# %s", part.ToString().data()); + return DestroySession(TDisconnectReason::FormatError()); + } + + const ui16 channel = part.Channel & ~TChannelPart::LastPartFlag; + TRope *eventData = channel < Context->ChannelArray.size() + ? 
&Context->ChannelArray[channel] + : &Context->ChannelMap[channel]; + + Metrics->AddInputChannelsIncomingTraffic(channel, sizeof(part) + part.Size); + + TEventDescr descr; + if (~part.Channel & TChannelPart::LastPartFlag) { + Payload.ExtractFront(part.Size, eventData); + } else if (part.Size != sizeof(descr)) { + LOG_CRIT_IC_SESSION("ICIS11", "incorrect last part of an event"); + return DestroySession(TDisconnectReason::FormatError()); + } else if (Payload.ExtractFrontPlain(&descr, sizeof(descr))) { + Metrics->IncInputChannelsIncomingEvents(channel); + ProcessEvent(*eventData, descr); + *eventData = TRope(); + } else { + Y_FAIL(); + } + } + } + + void TInputSessionTCP::ProcessEvent(TRope& data, TEventDescr& descr) { + if (!Params.UseModernFrame || descr.Checksum) { + ui32 checksum = 0; + for (const auto&& [data, size] : data) { + checksum = Crc32cExtendMSanCompatible(checksum, data, size); + } + if (checksum != descr.Checksum) { + LOG_CRIT_IC_SESSION("ICIS05", "event checksum error"); + return ReestablishConnection(TDisconnectReason::ChecksumError()); + } + } + auto ev = std::make_unique<IEventHandle>(SessionId, + descr.Type, + descr.Flags & ~IEventHandle::FlagExtendedFormat, + descr.Recipient, + descr.Sender, + MakeIntrusive<TEventSerializedData>(std::move(data), bool(descr.Flags & IEventHandle::FlagExtendedFormat)), + descr.Cookie, + Params.PeerScopeId, + NWilson::TTraceId(descr.TraceId)); + if (Common->EventFilter && !Common->EventFilter->CheckIncomingEvent(*ev, Common->LocalScopeId)) { + LOG_CRIT_IC_SESSION("ICIC03", "Event dropped due to scope error LocalScopeId# %s PeerScopeId# %s Type# 0x%08" PRIx32, + ScopeIdToString(Common->LocalScopeId).data(), ScopeIdToString(Params.PeerScopeId).data(), descr.Type); + ev.reset(); + } + if (ev) { + TActivationContext::Send(ev.release()); + } + } + + void TInputSessionTCP::HandleConfirmUpdate() { + for (;;) { + switch (EUpdateState state = Context->UpdateState) { + case EUpdateState::NONE: + case EUpdateState::INFLIGHT: + 
case EUpdateState::INFLIGHT_AND_PENDING: + // here we may have a race + return; + + case EUpdateState::CONFIRMING: + Y_VERIFY(UpdateFromInputSession); + if (Context->UpdateState.compare_exchange_weak(state, EUpdateState::INFLIGHT)) { + Send(SessionId, UpdateFromInputSession.Release()); + return; + } + } + } + } + + bool TInputSessionTCP::ReadMore() { + PreallocateBuffers(); + + TStackVec<TIoVec, NumPreallocatedBuffers> buffs; + for (const auto& item : Buffers) { + TIoVec iov{item->GetBuffer(), item->GetCapacity()}; + buffs.push_back(iov); + if (Params.Encryption) { + break; // do not put more than one buffer in queue to prevent using ReadV + } + } + + const struct iovec* iovec = reinterpret_cast<const struct iovec*>(buffs.data()); + int iovcnt = buffs.size(); + + ssize_t recvres = 0; + TString err; + LWPROBE_IF_TOO_LONG(SlowICReadFromSocket, ms) { + do { +#ifndef _win_ + recvres = iovcnt == 1 ? Socket->Recv(iovec->iov_base, iovec->iov_len, &err) : Socket->ReadV(iovec, iovcnt); +#else + recvres = Socket->Recv(iovec[0].iov_base, iovec[0].iov_len, &err); +#endif + Metrics->IncRecvSyscalls(); + } while (recvres == -EINTR); + } + + LOG_DEBUG_IC_SESSION("ICIS12", "ReadMore recvres# %zd iovcnt# %d err# %s", recvres, iovcnt, err.data()); + + if (recvres <= 0 || CloseInputSessionRequested) { + if ((-recvres != EAGAIN && -recvres != EWOULDBLOCK) || CloseInputSessionRequested) { + TString message = CloseInputSessionRequested ? "connection closed by debug command" + : recvres == 0 ? "connection closed by peer" + : err ? err + : Sprintf("socket: %s", strerror(-recvres)); + LOG_NOTICE_NET(NodeId, "%s", message.data()); + ReestablishConnection(CloseInputSessionRequested ? TDisconnectReason::Debug() : + recvres == 0 ? 
TDisconnectReason::EndOfStream() : TDisconnectReason::FromErrno(-recvres)); + } else if (PollerToken && !std::exchange(Context->ReadPending, true)) { + if (Params.Encryption) { + auto *secure = static_cast<NInterconnect::TSecureSocket*>(Socket.Get()); + const bool wantRead = secure->WantRead(), wantWrite = secure->WantWrite(); + Y_VERIFY_DEBUG(wantRead || wantWrite); + PollerToken->Request(wantRead, wantWrite); + } else { + PollerToken->Request(true, false); + } + } + return false; + } + + Y_VERIFY(recvres > 0); + Metrics->AddTotalBytesRead(recvres); + TDeque<TIntrusivePtr<TRopeAlignedBuffer>>::iterator it; + for (it = Buffers.begin(); recvres; ++it) { + Y_VERIFY(it != Buffers.end()); + const size_t bytesFromFrontBuffer = Min<size_t>(recvres, (*it)->GetCapacity()); + (*it)->AdjustSize(bytesFromFrontBuffer); + IncomingData.Insert(IncomingData.End(), TRope(std::move(*it))); + recvres -= bytesFromFrontBuffer; + } + Buffers.erase(Buffers.begin(), it); + + LastReceiveTimestamp = TActivationContext::Now(); + + return true; + } + + void TInputSessionTCP::PreallocateBuffers() { + // ensure that we have exactly "numBuffers" in queue + LWPROBE_IF_TOO_LONG(SlowICReadLoopAdjustSize, ms) { + const ui32 target = Params.Encryption ? 
1 : NumPreallocatedBuffers; + while (Buffers.size() < target) { + Buffers.emplace_back(TRopeAlignedBuffer::Allocate(sizeof(TTcpPacketBuf))); + } + } + } + + void TInputSessionTCP::ReestablishConnection(TDisconnectReason reason) { + LOG_DEBUG_IC_SESSION("ICIS09", "ReestablishConnection, reason# %s", reason.ToString().data()); + AtomicDecrement(Context->NumInputSessions); + Send(SessionId, new TEvSocketDisconnect(std::move(reason))); + PassAway(); + Socket.Reset(); + } + + void TInputSessionTCP::DestroySession(TDisconnectReason reason) { + LOG_DEBUG_IC_SESSION("ICIS13", "DestroySession, reason# %s", reason.ToString().data()); + AtomicDecrement(Context->NumInputSessions); + Send(SessionId, TInterconnectSessionTCP::NewEvTerminate(std::move(reason))); + PassAway(); + Socket.Reset(); + } + + void TInputSessionTCP::HandleCheckDeadPeer() { + const TInstant now = TActivationContext::Now(); + if (now >= LastReceiveTimestamp + DeadPeerTimeout) { + ReceiveData(); + if (Socket && now >= LastReceiveTimestamp + DeadPeerTimeout) { + // nothing has changed, terminate session + DestroySession(TDisconnectReason::DeadPeer()); + } + } + Schedule(LastReceiveTimestamp + DeadPeerTimeout - now, new TEvCheckDeadPeer); + } + + void TInputSessionTCP::HandlePingResponse(TDuration passed) { + PingQ.push_back(passed); + if (PingQ.size() > 16) { + PingQ.pop_front(); + } + const TDuration ping = *std::min_element(PingQ.begin(), PingQ.end()); + const auto pingUs = ping.MicroSeconds(); + Context->PingRTT_us = pingUs; + NewPingProtocol = true; + Metrics->UpdateLegacyPingTimeHist(pingUs); + } + + void TInputSessionTCP::HandleClock(TInstant clock) { + const TInstant here = TInstant::Now(); // wall clock + const TInstant remote = clock + TDuration::MicroSeconds(Context->PingRTT_us / 2); + i64 skew = remote.MicroSeconds() - here.MicroSeconds(); + SkewQ.push_back(skew); + if (SkewQ.size() > 16) { + SkewQ.pop_front(); + } + i64 clockSkew = SkewQ.front(); + for (i64 skew : SkewQ) { + if (abs(skew) < 
abs(clockSkew)) { + clockSkew = skew; + } + } + Context->ClockSkew_us = clockSkew; + Metrics->SetClockSkewMicrosec(clockSkew); + } + + +} diff --git a/library/cpp/actors/interconnect/interconnect_tcp_proxy.cpp b/library/cpp/actors/interconnect/interconnect_tcp_proxy.cpp new file mode 100644 index 0000000000..7e2d8ccb94 --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_tcp_proxy.cpp @@ -0,0 +1,936 @@ +#include "interconnect_tcp_proxy.h" +#include "interconnect_handshake.h" +#include "interconnect_tcp_session.h" +#include <library/cpp/actors/core/log.h> +#include <library/cpp/actors/protos/services_common.pb.h> +#include <library/cpp/monlib/service/pages/templates.h> +#include <util/system/getpid.h> + +namespace NActors { + static constexpr TDuration GetNodeRequestTimeout = TDuration::Seconds(5); + + static constexpr TDuration FirstErrorSleep = TDuration::MilliSeconds(10); + static constexpr TDuration MaxErrorSleep = TDuration::Seconds(10); + static constexpr ui32 SleepRetryMultiplier = 4; + + static TString PeerNameForHuman(ui32 nodeNum, const TString& longName, ui16 port) { + TStringBuf token; + TStringBuf(longName).NextTok('.', token); + return ToString<ui32>(nodeNum) + ":" + (token.size() > 0 ? 
TString(token) : longName) + ":" + ToString<ui16>(port); + } + + TInterconnectProxyTCP::TInterconnectProxyTCP(const ui32 node, TInterconnectProxyCommon::TPtr common, + IActor **dynamicPtr) + : TActor(&TThis::StateInit) + , PeerNodeId(node) + , DynamicPtr(dynamicPtr) + , Common(std::move(common)) + , SecureContext(new NInterconnect::TSecureSocketContext(Common->Settings.Certificate, Common->Settings.PrivateKey, + Common->Settings.CaFilePath, Common->Settings.CipherList)) + { + Y_VERIFY(Common); + Y_VERIFY(Common->NameserviceId); + if (DynamicPtr) { + Y_VERIFY(!*DynamicPtr); + *DynamicPtr = this; + } + } + + void TInterconnectProxyTCP::Bootstrap() { + SetPrefix(Sprintf("Proxy %s [node %" PRIu32 "]", SelfId().ToString().data(), PeerNodeId)); + + SwitchToInitialState(); + PassAwayTimestamp = TActivationContext::Now() + TDuration::Seconds(15); + + LOG_INFO_IC("ICP01", "ready to work"); + } + + void TInterconnectProxyTCP::Registered(TActorSystem* sys, const TActorId& owner) { + if (!DynamicPtr) { + // perform usual bootstrap for static nodes + sys->Send(new IEventHandle(TEvents::TSystem::Bootstrap, 0, SelfId(), owner, nullptr, 0)); + } + if (const auto& mon = Common->RegisterMonPage) { + TString path = Sprintf("peer%04" PRIu32, PeerNodeId); + TString title = Sprintf("Peer #%04" PRIu32, PeerNodeId); + mon(path, title, sys, SelfId()); + } + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // PendingActivation + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + void TInterconnectProxyTCP::RequestNodeInfo(STATEFN_SIG) { + ICPROXY_PROFILED; + + Y_VERIFY(!IncomingHandshakeActor && !OutgoingHandshakeActor && !PendingIncomingHandshakeEvents && !PendingSessionEvents); + EnqueueSessionEvent(ev); + StartConfiguring(); + } + + void TInterconnectProxyTCP::RequestNodeInfoForIncomingHandshake(STATEFN_SIG) { + ICPROXY_PROFILED; + + if 
(!Terminated) { + Y_VERIFY(!IncomingHandshakeActor && !OutgoingHandshakeActor && !PendingIncomingHandshakeEvents && !PendingSessionEvents); + EnqueueIncomingHandshakeEvent(ev); + StartConfiguring(); + } + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // PendingNodeInfo + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + void TInterconnectProxyTCP::StartConfiguring() { + ICPROXY_PROFILED; + + Y_VERIFY(!IncomingHandshakeActor && !OutgoingHandshakeActor); + + // issue node info request + Send(Common->NameserviceId, new TEvInterconnect::TEvGetNode(PeerNodeId)); + + // arm configure timer; store pointer to event to ensure that we will handle correct one if there were any other + // wakeup events in flight + SwitchToState(__LINE__, "PendingNodeInfo", &TThis::PendingNodeInfo, GetNodeRequestTimeout, + ConfigureTimeoutCookie = new TEvents::TEvWakeup); + } + + void TInterconnectProxyTCP::Configure(TEvInterconnect::TEvNodeInfo::TPtr& ev) { + ICPROXY_PROFILED; + + Y_VERIFY(!IncomingHandshakeActor && !OutgoingHandshakeActor && !Session); + + if (!ev->Get()->Node) { + TransitToErrorState("cannot get node info"); + } else { + auto& info = *ev->Get()->Node; + TString name = PeerNameForHuman(PeerNodeId, info.Host, info.Port); + TechnicalPeerHostName = info.Host; + if (!Metrics) { + Metrics = Common->Metrics ? 
CreateInterconnectMetrics(Common) : CreateInterconnectCounters(Common); + } + Metrics->SetPeerInfo(name, info.Location.GetDataCenterId()); + + LOG_DEBUG_IC("ICP02", "configured for host %s", name.data()); + + ProcessConfigured(); + } + } + + void TInterconnectProxyTCP::ConfigureTimeout(TEvents::TEvWakeup::TPtr& ev) { + ICPROXY_PROFILED; + + if (ev->Get() == ConfigureTimeoutCookie) { + TransitToErrorState("timed out while waiting for node info"); + } + } + + void TInterconnectProxyTCP::ProcessConfigured() { + ICPROXY_PROFILED; + + // if the request was initiated by some activity involving Interconnect, then we are expected to start handshake + if (PendingSessionEvents) { + StartInitialHandshake(); + } + + // process incoming handshake requests; all failures were ejected from the queue along with the matching initiation requests + for (THolder<IEventHandle>& ev : PendingIncomingHandshakeEvents) { + TAutoPtr<IEventHandle> x(ev.Release()); + IncomingHandshake(x); + } + PendingIncomingHandshakeEvents.clear(); + + // possible situation -- incoming handshake arrives, but actually it is not satisfied and rejected; in this case + // we are going to return to initial state as we have nothing to do + if (!IncomingHandshakeActor && !OutgoingHandshakeActor) { + SwitchToInitialState(); + } + } + + void TInterconnectProxyTCP::StartInitialHandshake() { + ICPROXY_PROFILED; + + // since we are starting initial handshake for some reason, we'll drop any existing handshakes, if any + DropHandshakes(); + + // create and register handshake actor + OutgoingHandshakeActor = Register(CreateOutgoingHandshakeActor(Common, GenerateSessionVirtualId(), + TActorId(), PeerNodeId, 0, TechnicalPeerHostName, TSessionParams()), TMailboxType::ReadAsFilled); + OutgoingHandshakeActorCreated = TActivationContext::Now(); + + // prepare for new handshake + PrepareNewSessionHandshake(); + } + + void TInterconnectProxyTCP::StartResumeHandshake(ui64 inputCounter) { + ICPROXY_PROFILED; + + // drop outgoing 
handshake if we have one; keep incoming handshakes as they may be useful + DropOutgoingHandshake(); + + // ensure that we have session + Y_VERIFY(Session); + + // ensure that we have both virtual ids + Y_VERIFY(SessionVirtualId); + Y_VERIFY(RemoteSessionVirtualId); + + // create and register handshake actor + OutgoingHandshakeActor = Register(CreateOutgoingHandshakeActor(Common, SessionVirtualId, + RemoteSessionVirtualId, PeerNodeId, inputCounter, TechnicalPeerHostName, Session->Params), + TMailboxType::ReadAsFilled); + OutgoingHandshakeActorCreated = TActivationContext::Now(); + } + + void TInterconnectProxyTCP::IssueIncomingHandshakeReply(const TActorId& handshakeId, ui64 peerLocalId, + THolder<IEventBase> event) { + ICPROXY_PROFILED; + + Y_VERIFY(!IncomingHandshakeActor); + IncomingHandshakeActor = handshakeId; + IncomingHandshakeActorFilledIn = TActivationContext::Now(); + Y_VERIFY(!LastSerialFromIncomingHandshake || *LastSerialFromIncomingHandshake <= peerLocalId); + LastSerialFromIncomingHandshake = peerLocalId; + + if (OutgoingHandshakeActor && SelfId().NodeId() < PeerNodeId) { + // Both outgoing and incoming handshake are in progress. To prevent race condition during semultanous handshake + // incoming handshake must be held till outgoing handshake is complete or failed + LOG_DEBUG_IC("ICP06", "reply for incoming handshake (actor %s) is held", IncomingHandshakeActor.ToString().data()); + HeldHandshakeReply = std::move(event); + + // Check that we are in one of acceptable states that would properly handle handshake statuses. + const auto state = CurrentStateFunc(); + Y_VERIFY(state == &TThis::PendingConnection || state == &TThis::StateWork, "invalid handshake request in state# %s", State); + } else { + LOG_DEBUG_IC("ICP07", "issued incoming handshake reply"); + + // No race, so we can send reply immediately. 
+ Y_VERIFY(!HeldHandshakeReply); + Send(IncomingHandshakeActor, event.Release()); + + // Start waiting for handshake reply, if not yet started; also, if session is already created, then we don't + // switch from working state. + if (!Session) { + LOG_INFO_IC("ICP08", "No active sessions, becoming PendingConnection"); + SwitchToState(__LINE__, "PendingConnection", &TThis::PendingConnection); + } else { + Y_VERIFY(CurrentStateFunc() == &TThis::StateWork); + } + } + } + + void TInterconnectProxyTCP::IncomingHandshake(TEvHandshakeAsk::TPtr& ev) { + ICPROXY_PROFILED; + + TEvHandshakeAsk *msg = ev->Get(); + + // TEvHandshakeAsk is only applicable for continuation requests + LOG_DEBUG_IC("ICP09", "(actor %s) from: %s for: %s", ev->Sender.ToString().data(), + ev->Get()->Self.ToString().data(), ev->Get()->Peer.ToString().data()); + + if (!Session) { + // if there is no open session, report error -- continuation request works only with open sessions + LOG_NOTICE_IC("ICP12", "(actor %s) peer tries to resume nonexistent session Self# %s Peer# %s", + ev->Sender.ToString().data(), msg->Self.ToString().data(), msg->Peer.ToString().data()); + } else if (SessionVirtualId != ev->Get()->Peer || RemoteSessionVirtualId != ev->Get()->Self) { + // check session virtual ids for continuation + LOG_NOTICE_IC("ICP13", "(actor %s) virtual id mismatch with existing session (Peer: %s Self: %s" + " SessionVirtualId: %s RemoteSessionVirtualId: %s)", ev->Sender.ToString().data(), + ev->Get()->Peer.ToString().data(), ev->Get()->Self.ToString().data(), SessionVirtualId.ToString().data(), + RemoteSessionVirtualId.ToString().data()); + } else { + // if we already have incoming handshake, then terminate existing one + DropIncomingHandshake(); + + // issue reply to the sender, possibly holding it while outgoing handshake is at race + THolder<IEventBase> reply = IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::ProcessHandshakeRequest, ev); + return IssueIncomingHandshakeReply(ev->Sender, 
RemoteSessionVirtualId.LocalId(), std::move(reply)); + } + + // error case -- report error to the handshake actor + Send(ev->Sender, new TEvHandshakeNak); + } + + void TInterconnectProxyTCP::IncomingHandshake(TEvHandshakeRequest::TPtr& ev) { + ICPROXY_PROFILED; + + LOG_DEBUG_IC("ICP17", "incoming handshake (actor %s)", ev->Sender.ToString().data()); + + const auto& record = ev->Get()->Record; + ui64 remotePID = record.GetProgramPID(); + ui64 remoteStartTime = record.GetProgramStartTime(); + ui64 remoteSerial = record.GetSerial(); + + if (RemoteProgramInfo && remotePID == RemoteProgramInfo->PID && remoteStartTime == RemoteProgramInfo->StartTime) { + if (remoteSerial < RemoteProgramInfo->Serial) { + LOG_INFO_IC("ICP18", "handshake (actor %s) is too old", ev->Sender.ToString().data()); + Send(ev->Sender, new TEvents::TEvPoisonPill); + return; + } else { + RemoteProgramInfo->Serial = remoteSerial; + } + } else { + const auto ptr = new TProgramInfo; + ptr->PID = remotePID; + ptr->StartTime = remoteStartTime; + ptr->Serial = remoteSerial; + RemoteProgramInfo.Reset(ptr); + } + + /* Let's check peer technical hostname */ + if (record.HasSenderHostName() && TechnicalPeerHostName != record.GetSenderHostName()) { + Send(ev->Sender, new TEvHandshakeReplyError("host name mismatch")); + return; + } + + // check sender actor id and check if it is not very old + if (LastSerialFromIncomingHandshake) { + const ui64 serial = record.GetSerial(); + if (serial < *LastSerialFromIncomingHandshake) { + LOG_NOTICE_IC("ICP15", "Handshake# %s has duplicate serial# %" PRIu64 + " LastSerialFromIncomingHandshake# %" PRIu64, ev->Sender.ToString().data(), + serial, *LastSerialFromIncomingHandshake); + Send(ev->Sender, new TEvHandshakeReplyError("duplicate serial")); + return; + } else if (serial == *LastSerialFromIncomingHandshake) { + LOG_NOTICE_IC("ICP15", "Handshake# %s is obsolete, serial# %" PRIu64 + " LastSerialFromIncomingHandshake# %" PRIu64, ev->Sender.ToString().data(), + serial, 
*LastSerialFromIncomingHandshake); + Send(ev->Sender, new TEvents::TEvPoisonPill); + return; + } + } + + // drop incoming handshake as this is definitely more recent + DropIncomingHandshake(); + + // prepare for new session + PrepareNewSessionHandshake(); + + auto event = MakeHolder<TEvHandshakeReplyOK>(); + auto* pb = event->Record.MutableSuccess(); + const TActorId virtualId = GenerateSessionVirtualId(); + pb->SetProtocol(INTERCONNECT_PROTOCOL_VERSION); + pb->SetSenderActorId(virtualId.ToString()); + pb->SetProgramPID(GetPID()); + pb->SetProgramStartTime(Common->StartTime); + pb->SetSerial(virtualId.LocalId()); + + IssueIncomingHandshakeReply(ev->Sender, 0, std::move(event)); + } + + void TInterconnectProxyTCP::HandleHandshakeStatus(TEvHandshakeDone::TPtr& ev) { + ICPROXY_PROFILED; + + TEvHandshakeDone *msg = ev->Get(); + + // Terminate handshake actor working in opposite direction, if set up. + if (ev->Sender == IncomingHandshakeActor) { + LOG_INFO_IC("ICP19", "incoming handshake succeeded"); + DropIncomingHandshake(false); + DropOutgoingHandshake(); + } else if (ev->Sender == OutgoingHandshakeActor) { + LOG_INFO_IC("ICP20", "outgoing handshake succeeded"); + DropIncomingHandshake(); + DropOutgoingHandshake(false); + } else { + /* It seems to be an old handshake. */ + return; + } + + Y_VERIFY(!IncomingHandshakeActor && !OutgoingHandshakeActor); + SwitchToState(__LINE__, "StateWork", &TThis::StateWork); + + if (Session) { + // this is continuation request, check that virtual ids match + Y_VERIFY(SessionVirtualId == msg->Self && RemoteSessionVirtualId == msg->Peer); + } else { + // this is initial request, check that we have virtual ids not filled in + Y_VERIFY(!SessionVirtualId && !RemoteSessionVirtualId); + } + + auto error = [&](const char* description) { + TransitToErrorState(description); + }; + + // If session is not created, then create new one. 
+ if (!Session) { + RemoteProgramInfo = std::move(msg->ProgramInfo); + if (!RemoteProgramInfo) { + // we have received resume handshake, but session was closed concurrently while handshaking + return error("Session continuation race"); + } + + // Create new session actor. + SessionID = RegisterWithSameMailbox(Session = new TInterconnectSessionTCP(this, msg->Params)); + IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::Init); + SessionVirtualId = msg->Self; + RemoteSessionVirtualId = msg->Peer; + LOG_INFO_IC("ICP22", "created new session: %s", SessionID.ToString().data()); + } + + // ensure that we have session local/peer virtual ids + Y_VERIFY(Session && SessionVirtualId && RemoteSessionVirtualId); + + // Set up new connection for the session. + IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::SetNewConnection, ev); + + // Reset retry timer + HoldByErrorWakeupDuration = TDuration::Zero(); + + /* Forward all held events */ + ProcessPendingSessionEvents(); + } + + void TInterconnectProxyTCP::HandleHandshakeStatus(TEvHandshakeFail::TPtr& ev) { + ICPROXY_PROFILED; + + // update error state log; this fail is inconclusive unless this is the last pending handshake + const bool inconclusive = (ev->Sender != IncomingHandshakeActor && ev->Sender != OutgoingHandshakeActor) || + (IncomingHandshakeActor && OutgoingHandshakeActor); + LogHandshakeFail(ev, inconclusive); + + if (ev->Sender == IncomingHandshakeActor) { + LOG_NOTICE_IC("ICP24", "incoming handshake failed, temporary: %" PRIu32 " explanation: %s outgoing: %s", + ui32(ev->Get()->Temporary), ev->Get()->Explanation.data(), OutgoingHandshakeActor.ToString().data()); + DropIncomingHandshake(false); + } else if (ev->Sender == OutgoingHandshakeActor) { + LOG_NOTICE_IC("ICP25", "outgoing handshake failed, temporary: %" PRIu32 " explanation: %s incoming: %s held: %s", + ui32(ev->Get()->Temporary), ev->Get()->Explanation.data(), IncomingHandshakeActor.ToString().data(), + HeldHandshakeReply ? 
"yes" : "no"); + DropOutgoingHandshake(false); + + if (IEventBase* reply = HeldHandshakeReply.Release()) { + Y_VERIFY(IncomingHandshakeActor); + LOG_DEBUG_IC("ICP26", "sent held handshake reply to %s", IncomingHandshakeActor.ToString().data()); + Send(IncomingHandshakeActor, reply); + } + + // if we have no current session, then we have to drop all pending events as the outgoing handshake has failed + ProcessPendingSessionEvents(); + } else { + /* It seems to be an old fail, just ignore it */ + LOG_NOTICE_IC("ICP27", "obsolete handshake fail ignored"); + return; + } + + if (Metrics) { + Metrics->IncHandshakeFails(); + } + + if (IncomingHandshakeActor || OutgoingHandshakeActor) { + // one of handshakes is still going on + LOG_DEBUG_IC("ICP28", "other handshake is still going on"); + return; + } + + switch (ev->Get()->Temporary) { + case TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT: + if (!Session) { + if (PendingSessionEvents) { + // try to start outgoing handshake as we have some events enqueued + StartInitialHandshake(); + } else { + // return back to initial state as we have no session and no pending handshakes + SwitchToInitialState(); + } + } else if (Session->Socket) { + // try to reestablish connection -- meaning restart handshake from the last known position + IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::ReestablishConnectionWithHandshake, + TDisconnectReason::HandshakeFailTransient()); + } else { + // we have no active connection in that session, so just restart handshake from last known position + IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::StartHandshake); + } + break; + + case TEvHandshakeFail::HANDSHAKE_FAIL_SESSION_MISMATCH: + StartInitialHandshake(); + break; + + case TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT: + TString timeExplanation = " LastSessionDieTime# " + LastSessionDieTime.ToString(); + if (Session) { + InvokeOtherActor(*Session, &TInterconnectSessionTCP::Terminate, + 
TDisconnectReason::HandshakeFailPermanent()); + } + TransitToErrorState(ev->Get()->Explanation + timeExplanation, false); + break; + } + } + + void TInterconnectProxyTCP::LogHandshakeFail(TEvHandshakeFail::TPtr& ev, bool inconclusive) { + ICPROXY_PROFILED; + + TString kind = "unknown"; + switch (ev->Get()->Temporary) { + case TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT: + kind = Session ? "transient w/session" : "transient w/o session"; + break; + + case TEvHandshakeFail::HANDSHAKE_FAIL_SESSION_MISMATCH: + kind = "session_mismatch"; + break; + + case TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT: + kind = "permanent"; + break; + } + if (inconclusive) { + kind += " inconclusive"; + } + UpdateErrorStateLog(TActivationContext::Now(), kind, ev->Get()->Explanation); + } + + void TInterconnectProxyTCP::ProcessPendingSessionEvents() { + ICPROXY_PROFILED; + + while (PendingSessionEvents) { + TPendingSessionEvent ev = std::move(PendingSessionEvents.front()); + PendingSessionEventsSize -= ev.Size; + TAutoPtr<IEventHandle> event(ev.Event.Release()); + PendingSessionEvents.pop_front(); + + if (Session) { + ForwardSessionEventToSession(event); + } else { + DropSessionEvent(event); + } + } + } + + void TInterconnectProxyTCP::DropSessionEvent(STATEFN_SIG) { + ICPROXY_PROFILED; + + ValidateEvent(ev, "DropSessionEvent"); + switch (ev->GetTypeRewrite()) { + case TEvInterconnect::EvForward: + if (ev->Flags & IEventHandle::FlagSubscribeOnSession) { + Send(ev->Sender, new TEvInterconnect::TEvNodeDisconnected(PeerNodeId), 0, ev->Cookie); + } + TActivationContext::Send(ev->ForwardOnNondelivery(TEvents::TEvUndelivered::Disconnected)); + break; + + case TEvInterconnect::TEvConnectNode::EventType: + case TEvents::TEvSubscribe::EventType: + Send(ev->Sender, new TEvInterconnect::TEvNodeDisconnected(PeerNodeId), 0, ev->Cookie); + break; + + case TEvents::TEvUnsubscribe::EventType: + /* Do nothing */ + break; + + default: + Y_FAIL("Unexpected type of event in held event queue"); + } + } + + void 
TInterconnectProxyTCP::UnregisterSession(TInterconnectSessionTCP* session) { + ICPROXY_PROFILED; + + Y_VERIFY(Session && Session == session && SessionID); + + LOG_INFO_IC("ICP30", "unregister session Session# %s VirtualId# %s", SessionID.ToString().data(), + SessionVirtualId.ToString().data()); + + Session = nullptr; + SessionID = TActorId(); + + // drop all pending events as we are closed + ProcessPendingSessionEvents(); + + // reset virtual ids as this session is terminated + SessionVirtualId = TActorId(); + RemoteSessionVirtualId = TActorId(); + + if (Metrics) { + Metrics->IncSessionDeaths(); + } + LastSessionDieTime = TActivationContext::Now(); + + if (IncomingHandshakeActor || OutgoingHandshakeActor) { + PrepareNewSessionHandshake(); + } else { + SwitchToInitialState(); + } + } + + void TInterconnectProxyTCP::EnqueueSessionEvent(STATEFN_SIG) { + ICPROXY_PROFILED; + + ValidateEvent(ev, "EnqueueSessionEvent"); + const ui32 size = ev->GetSize(); + PendingSessionEventsSize += size; + PendingSessionEvents.emplace_back(TActivationContext::Now() + Common->Settings.MessagePendingTimeout, size, ev); + ScheduleCleanupEventQueue(); + CleanupEventQueue(); + } + + void TInterconnectProxyTCP::EnqueueIncomingHandshakeEvent(STATEFN_SIG) { + ICPROXY_PROFILED; + + // enqueue handshake request + Y_UNUSED(); + PendingIncomingHandshakeEvents.emplace_back(ev); + } + + void TInterconnectProxyTCP::EnqueueIncomingHandshakeEvent(TEvHandshakeDone::TPtr& /*ev*/) { + ICPROXY_PROFILED; + + // TEvHandshakeDone can't get into the queue, because we have to process handshake request first; this may be the + // race with the previous handshakes, so simply ignore it + } + + void TInterconnectProxyTCP::EnqueueIncomingHandshakeEvent(TEvHandshakeFail::TPtr& ev) { + ICPROXY_PROFILED; + + for (auto it = PendingIncomingHandshakeEvents.begin(); it != PendingIncomingHandshakeEvents.end(); ++it) { + THolder<IEventHandle>& pendingEvent = *it; + if (pendingEvent->Sender == ev->Sender) { + // we have found 
cancellation request for the pending handshake request; so simply remove it from the + // deque, as we are not interested in failure reason; must likely it happens because of handshake timeout + if (pendingEvent->GetTypeRewrite() == TEvHandshakeFail::EventType) { + TEvHandshakeFail::TPtr tmp(static_cast<TEventHandle<TEvHandshakeFail>*>(pendingEvent.Release())); + LogHandshakeFail(tmp, true); + } + PendingIncomingHandshakeEvents.erase(it); + break; + } + } + } + + void TInterconnectProxyTCP::ForwardSessionEventToSession(STATEFN_SIG) { + ICPROXY_PROFILED; + + Y_VERIFY(Session && SessionID); + ValidateEvent(ev, "ForwardSessionEventToSession"); + InvokeOtherActor(*Session, &TInterconnectSessionTCP::Receive, ev, TActivationContext::ActorContextFor(SessionID)); + } + + void TInterconnectProxyTCP::GenerateHttpInfo(NMon::TEvHttpInfo::TPtr& ev) { + ICPROXY_PROFILED; + + LOG_INFO_IC("ICP31", "proxy http called"); + + TStringStream str; + + HTML(str) { + DIV_CLASS("panel panel-info") { + DIV_CLASS("panel-heading") { + str << "Proxy"; + } + DIV_CLASS("panel-body") { + TABLE_CLASS("table") { + TABLEHEAD() { + TABLER() { + TABLEH() { + str << "Sensor"; + } + TABLEH() { + str << "Value"; + } + } + } +#define MON_VAR(NAME) \ + TABLER() { \ + TABLED() { \ + str << #NAME; \ + } \ + TABLED() { \ + str << NAME; \ + } \ + } + + TABLEBODY() { + MON_VAR(TActivationContext::Now()) + MON_VAR(SessionID) + MON_VAR(LastSessionDieTime) + MON_VAR(IncomingHandshakeActor) + MON_VAR(IncomingHandshakeActorFilledIn) + MON_VAR(IncomingHandshakeActorReset) + MON_VAR(OutgoingHandshakeActor) + MON_VAR(OutgoingHandshakeActorCreated) + MON_VAR(OutgoingHandshakeActorReset) + MON_VAR(State) + MON_VAR(StateSwitchTime) + } + } + } + } + + DIV_CLASS("panel panel-info") { + DIV_CLASS("panel-heading") { + str << "Error Log"; + } + DIV_CLASS("panel-body") { + TABLE_CLASS("table") { + TABLEHEAD() { + TABLER() { + TABLEH() { + str << "Timestamp"; + } + TABLEH() { + str << "Elapsed"; + } + TABLEH() { + str << 
"Kind"; + } + TABLEH() { + str << "Explanation"; + } + } + } + TABLEBODY() { + const TInstant now = TActivationContext::Now(); + const TInstant barrier = now - TDuration::Minutes(1); + for (auto it = ErrorStateLog.rbegin(); it != ErrorStateLog.rend(); ++it) { + auto wrapper = [&](const auto& lambda) { + if (std::get<0>(*it) > barrier) { + str << "<strong>"; + lambda(); + str << "</strong>"; + } else { + lambda(); + } + }; + TABLER() { + TABLED() { + wrapper([&] { + str << std::get<0>(*it); + }); + } + TABLED() { + wrapper([&] { + str << now - std::get<0>(*it); + }); + } + TABLED() { + wrapper([&] { + str << std::get<1>(*it); + }); + } + TABLED() { + wrapper([&] { + str << std::get<2>(*it); + }); + + ui32 rep = std::get<3>(*it); + if (rep != 1) { + str << " <strong>x" << rep << "</strong>"; + } + } + } + } + } + } + } + } + } + + if (Session != nullptr) { + Session->GenerateHttpInfo(str); + } + + Send(ev->Sender, new NMon::TEvHttpInfoRes(str.Str())); + } + + void TInterconnectProxyTCP::TransitToErrorState(TString explanation, bool updateErrorLog) { + ICPROXY_PROFILED; + + LOG_NOTICE_IC("ICP32", "transit to hold-by-error state Explanation# %s", explanation.data()); + LOG_INFO(*TlsActivationContext, NActorsServices::INTERCONNECT_STATUS, "[%u] error state: %s", PeerNodeId, explanation.data()); + + if (updateErrorLog) { + UpdateErrorStateLog(TActivationContext::Now(), "permanent conclusive", explanation); + } + + Y_VERIFY(Session == nullptr); + Y_VERIFY(!SessionID); + + // recalculate wakeup timeout -- if this is the first failure, then we sleep for default timeout; otherwise we + // sleep N times longer than the previous try, but not longer than desired number of seconds + HoldByErrorWakeupDuration = HoldByErrorWakeupDuration != TDuration::Zero() + ? 
Min(HoldByErrorWakeupDuration * SleepRetryMultiplier, MaxErrorSleep) + : FirstErrorSleep; + + // transit to required state and arm wakeup timer + if (Terminated) { + // switch to this state permanently + SwitchToState(__LINE__, "HoldByError", &TThis::HoldByError); + HoldByErrorWakeupCookie = nullptr; + } else { + SwitchToState(__LINE__, "HoldByError", &TThis::HoldByError, HoldByErrorWakeupDuration, + HoldByErrorWakeupCookie = new TEvents::TEvWakeup); + } + + /* Process all pending events. */ + ProcessPendingSessionEvents(); + + /* Terminate handshakes */ + DropHandshakes(); + + /* Terminate pending incoming handshake requests. */ + for (auto& ev : PendingIncomingHandshakeEvents) { + Send(ev->Sender, new TEvents::TEvPoisonPill); + if (ev->GetTypeRewrite() == TEvHandshakeFail::EventType) { + TEvHandshakeFail::TPtr tmp(static_cast<TEventHandle<TEvHandshakeFail>*>(ev.Release())); + LogHandshakeFail(tmp, true); + } + } + PendingIncomingHandshakeEvents.clear(); + } + + void TInterconnectProxyTCP::WakeupFromErrorState(TEvents::TEvWakeup::TPtr& ev) { + ICPROXY_PROFILED; + + LOG_INFO_IC("ICP33", "wake up from error state"); + + if (ev->Get() == HoldByErrorWakeupCookie) { + SwitchToInitialState(); + } + } + + void TInterconnectProxyTCP::Disconnect() { + ICPROXY_PROFILED; + + // terminate handshakes (if any) + DropHandshakes(); + + if (Session) { + IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::Terminate, TDisconnectReason::UserRequest()); + } else { + TransitToErrorState("forced disconnect"); + } + } + + void TInterconnectProxyTCP::ScheduleCleanupEventQueue() { + ICPROXY_PROFILED; + + if (!CleanupEventQueueScheduled && PendingSessionEvents) { + // apply batching at 50 ms granularity + Schedule(Max(TDuration::MilliSeconds(50), PendingSessionEvents.front().Deadline - TActivationContext::Now()), new TEvCleanupEventQueue); + CleanupEventQueueScheduled = true; + } + } + + void TInterconnectProxyTCP::HandleCleanupEventQueue() { + ICPROXY_PROFILED; + + 
Y_VERIFY(CleanupEventQueueScheduled); + CleanupEventQueueScheduled = false; + CleanupEventQueue(); + ScheduleCleanupEventQueue(); + } + + void TInterconnectProxyTCP::CleanupEventQueue() { + ICPROXY_PROFILED; + + const TInstant now = TActivationContext::Now(); + while (PendingSessionEvents) { + TPendingSessionEvent& ev = PendingSessionEvents.front(); + if (now >= ev.Deadline || PendingSessionEventsSize > Common->Settings.MessagePendingSize) { + TAutoPtr<IEventHandle> event(ev.Event.Release()); + PendingSessionEventsSize -= ev.Size; + DropSessionEvent(event); + PendingSessionEvents.pop_front(); + } else { + break; + } + } + } + + void TInterconnectProxyTCP::HandleClosePeerSocket() { + ICPROXY_PROFILED; + + if (Session && Session->Socket) { + LOG_INFO_IC("ICP34", "closed connection by debug command"); + Session->Socket->Shutdown(SHUT_RDWR); + } + } + + void TInterconnectProxyTCP::HandleCloseInputSession() { + ICPROXY_PROFILED; + + if (Session) { + IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::CloseInputSession); + } + } + + void TInterconnectProxyTCP::HandlePoisonSession() { + ICPROXY_PROFILED; + + if (Session) { + IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::Terminate, TDisconnectReason::Debug()); + } + } + + void TInterconnectProxyTCP::HandleSessionBufferSizeRequest(TEvSessionBufferSizeRequest::TPtr& ev) { + ICPROXY_PROFILED; + + ui64 bufSize = 0; + if (Session) { + bufSize = Session->TotalOutputQueueSize; + } + + Send(ev->Sender, new TEvSessionBufferSizeResponse(SessionID, bufSize)); + } + + void TInterconnectProxyTCP::Handle(TEvQueryStats::TPtr& ev) { + ICPROXY_PROFILED; + + TProxyStats stats; + stats.Path = Sprintf("peer%04" PRIu32, PeerNodeId); + stats.State = State; + stats.PeerScopeId = Session ? Session->Params.PeerScopeId : TScopeId(); + stats.LastSessionDieTime = LastSessionDieTime; + stats.TotalOutputQueueSize = Session ? Session->TotalOutputQueueSize : 0; + stats.Connected = Session ? 
(bool)Session->Socket : false; + stats.Host = TechnicalPeerHostName; + stats.Port = 0; + ui32 rep = 0; + std::tie(stats.LastErrorTimestamp, stats.LastErrorKind, stats.LastErrorExplanation, rep) = ErrorStateLog + ? ErrorStateLog.back() + : std::make_tuple(TInstant(), TString(), TString(), 1U); + if (rep != 1) { + stats.LastErrorExplanation += Sprintf(" x%" PRIu32, rep); + } + stats.Ping = Session ? Session->GetPingRTT() : TDuration::Zero(); + stats.ClockSkew = Session ? Session->GetClockSkew() : 0; + if (Session) { + if (auto *x = dynamic_cast<NInterconnect::TSecureSocket*>(Session->Socket.Get())) { + stats.Encryption = Sprintf("%s/%u", x->GetCipherName().data(), x->GetCipherBits()); + } else { + stats.Encryption = "none"; + } + } + + auto response = MakeHolder<TEvStats>(); + response->PeerNodeId = PeerNodeId; + response->ProxyStats = std::move(stats); + Send(ev->Sender, response.Release()); + } + + void TInterconnectProxyTCP::HandleTerminate() { + ICPROXY_PROFILED; + + if (Session) { + IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::Terminate, TDisconnectReason()); + } + Terminated = true; + TransitToErrorState("terminated"); + } + + void TInterconnectProxyTCP::PassAway() { + if (Session) { + IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::Terminate, TDisconnectReason()); + } + if (DynamicPtr) { + Y_VERIFY(*DynamicPtr == this); + *DynamicPtr = nullptr; + } + // TODO: unregister actor mon page + TActor::PassAway(); + } +} diff --git a/library/cpp/actors/interconnect/interconnect_tcp_proxy.h b/library/cpp/actors/interconnect/interconnect_tcp_proxy.h new file mode 100644 index 0000000000..023e5bd1ee --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_tcp_proxy.h @@ -0,0 +1,537 @@ +#pragma once + +#include <library/cpp/actors/core/actor_bootstrapped.h> +#include <library/cpp/actors/core/hfunc.h> +#include <library/cpp/actors/core/event_pb.h> +#include <library/cpp/actors/core/events.h> +#include 
<library/cpp/monlib/dynamic_counters/counters.h> + +#include "interconnect_common.h" +#include "interconnect_counters.h" +#include "interconnect_tcp_session.h" +#include "profiler.h" + +#define ICPROXY_PROFILED TFunction func(*this, __func__, __LINE__) + +namespace NActors { + + + /* WARNING: all proxy actors should be alive during actorsystem activity */ + class TInterconnectProxyTCP + : public TActor<TInterconnectProxyTCP> + , public TInterconnectLoggingBase + , public TProfiled + { + enum { + EvCleanupEventQueue = EventSpaceBegin(TEvents::ES_PRIVATE), + EvQueryStats, + EvStats, + EvPassAwayIfNeeded, + }; + + struct TEvCleanupEventQueue : TEventLocal<TEvCleanupEventQueue, EvCleanupEventQueue> {}; + + public: + struct TEvQueryStats : TEventLocal<TEvQueryStats, EvQueryStats> {}; + + struct TProxyStats { + TString Path; + TString State; + TScopeId PeerScopeId; + TInstant LastSessionDieTime; + ui64 TotalOutputQueueSize; + bool Connected; + TString Host; + ui16 Port; + TInstant LastErrorTimestamp; + TString LastErrorKind; + TString LastErrorExplanation; + TDuration Ping; + i64 ClockSkew; + TString Encryption; + }; + + struct TEvStats : TEventLocal<TEvStats, EvStats> { + ui32 PeerNodeId; + TProxyStats ProxyStats; + }; + + static constexpr EActivityType ActorActivityType() { + return INTERCONNECT_PROXY_TCP; + } + + TInterconnectProxyTCP(const ui32 node, TInterconnectProxyCommon::TPtr common, IActor **dynamicPtr = nullptr); + + STFUNC(StateInit) { + Bootstrap(); + if (ev->Type != TEvents::TSystem::Bootstrap) { // for dynamic nodes we do not receive Bootstrap event + Receive(ev, ctx); + } + } + + void Bootstrap(); + void Registered(TActorSystem* sys, const TActorId& owner) override; + + private: + friend class TInterconnectSessionTCP; + friend class TInterconnectSessionTCPv0; + friend class THandshake; + friend class TInputSessionTCP; + + void UnregisterSession(TInterconnectSessionTCP* session); + +#define SESSION_EVENTS(HANDLER) \ + fFunc(TEvInterconnect::EvForward, 
HANDLER) \ + fFunc(TEvInterconnect::TEvConnectNode::EventType, HANDLER) \ + fFunc(TEvents::TEvSubscribe::EventType, HANDLER) \ + fFunc(TEvents::TEvUnsubscribe::EventType, HANDLER) + +#define INCOMING_HANDSHAKE_EVENTS(HANDLER) \ + fFunc(TEvHandshakeAsk::EventType, HANDLER) \ + fFunc(TEvHandshakeRequest::EventType, HANDLER) + +#define HANDSHAKE_STATUS_EVENTS(HANDLER) \ + hFunc(TEvHandshakeDone, HANDLER) \ + hFunc(TEvHandshakeFail, HANDLER) + +#define PROXY_STFUNC(STATE, SESSION_HANDLER, INCOMING_HANDSHAKE_HANDLER, \ + HANDSHAKE_STATUS_HANDLER, DISCONNECT_HANDLER, \ + WAKEUP_HANDLER, NODE_INFO_HANDLER) \ + STATEFN(STATE) { \ + const ui32 type = ev->GetTypeRewrite(); \ + const bool profiled = type != TEvInterconnect::EvForward \ + && type != TEvInterconnect::EvConnectNode \ + && type != TEvents::TSystem::Subscribe \ + && type != TEvents::TSystem::Unsubscribe; \ + if (profiled) { \ + TProfiled::Start(); \ + } \ + { \ + TProfiled::TFunction func(*this, __func__, __LINE__); \ + switch (type) { \ + SESSION_EVENTS(SESSION_HANDLER) \ + INCOMING_HANDSHAKE_EVENTS(INCOMING_HANDSHAKE_HANDLER) \ + HANDSHAKE_STATUS_EVENTS(HANDSHAKE_STATUS_HANDLER) \ + cFunc(TEvInterconnect::EvDisconnect, DISCONNECT_HANDLER) \ + hFunc(TEvents::TEvWakeup, WAKEUP_HANDLER) \ + hFunc(TEvGetSecureSocket, Handle) \ + hFunc(NMon::TEvHttpInfo, GenerateHttpInfo) \ + cFunc(EvCleanupEventQueue, HandleCleanupEventQueue) \ + hFunc(TEvInterconnect::TEvNodeInfo, NODE_INFO_HANDLER) \ + cFunc(TEvInterconnect::EvClosePeerSocket, HandleClosePeerSocket) \ + cFunc(TEvInterconnect::EvCloseInputSession, HandleCloseInputSession) \ + cFunc(TEvInterconnect::EvPoisonSession, HandlePoisonSession) \ + hFunc(TEvSessionBufferSizeRequest, HandleSessionBufferSizeRequest) \ + hFunc(TEvQueryStats, Handle) \ + cFunc(TEvInterconnect::EvTerminate, HandleTerminate) \ + cFunc(EvPassAwayIfNeeded, HandlePassAwayIfNeeded) \ + default: \ + Y_FAIL("unexpected event Type# 0x%08" PRIx32, type); \ + } \ + } \ + if (profiled) { \ + if 
(TProfiled::Duration() >= TDuration::MilliSeconds(16)) { \ + const TString report = TProfiled::Format(); \ + LOG_ERROR_IC("ICP35", "event processing took too much time %s", report.data()); \ + } \ + TProfiled::Finish(); \ + } \ + } + + template <typename T> + void Ignore(T& /*ev*/) { + ICPROXY_PROFILED; + } + + void Ignore() { + ICPROXY_PROFILED; + } + + void Ignore(TEvHandshakeDone::TPtr& ev) { + ICPROXY_PROFILED; + + Y_VERIFY(ev->Sender != IncomingHandshakeActor); + Y_VERIFY(ev->Sender != OutgoingHandshakeActor); + } + + void Ignore(TEvHandshakeFail::TPtr& ev) { + ICPROXY_PROFILED; + + Y_VERIFY(ev->Sender != IncomingHandshakeActor); + Y_VERIFY(ev->Sender != OutgoingHandshakeActor); + LogHandshakeFail(ev, true); + } + + const char* State = nullptr; + TInstant StateSwitchTime; + + template <typename... TArgs> + void SwitchToState(int line, const char* name, TArgs&&... args) { + ICPROXY_PROFILED; + + LOG_DEBUG_IC("ICP77", "@%d %s -> %s", line, State, name); + State = name; + StateSwitchTime = TActivationContext::Now(); + Become(std::forward<TArgs>(args)...); + Y_VERIFY(!Terminated || CurrentStateFunc() == &TThis::HoldByError); // ensure we never escape this state + if (CurrentStateFunc() != &TThis::PendingActivation) { + PassAwayTimestamp = TInstant::Max(); + } + } + + TInstant PassAwayTimestamp; + bool PassAwayScheduled = false; + + void SwitchToInitialState() { + ICPROXY_PROFILED; + + Y_VERIFY(!PendingSessionEvents && !PendingIncomingHandshakeEvents, "%s PendingSessionEvents# %zu" + " PendingIncomingHandshakeEvents# %zu State# %s", LogPrefix.data(), PendingSessionEvents.size(), + PendingIncomingHandshakeEvents.size(), State); + SwitchToState(__LINE__, "PendingActivation", &TThis::PendingActivation); + if (DynamicPtr && !PassAwayScheduled && PassAwayTimestamp != TInstant::Max()) { + TActivationContext::Schedule(PassAwayTimestamp, new IEventHandle(EvPassAwayIfNeeded, 0, SelfId(), + {}, nullptr, 0)); + PassAwayScheduled = true; + } + } + + void 
HandlePassAwayIfNeeded() { + Y_VERIFY(PassAwayScheduled); + if (PassAwayTimestamp != TInstant::Max()) { + PassAway(); + } + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // PendingActivation + // + // In this state we are just waiting for some activities, which may include: + // * an external Session event + // * incoming handshake request + // + // Upon receiving such an event, we put it into the corresponding queue and initiate start up by calling IssueGetNodeRequest, + // which, as the name says, issues TEvGetNode to the nameservice and arms a timer to handle timeout (which should not + // occur, but we want to be sure we don't hang on this), and then switches to PendingNodeInfo state. + + PROXY_STFUNC(PendingActivation, + RequestNodeInfo, // Session events + RequestNodeInfoForIncomingHandshake, // Incoming handshake requests + Ignore, // Handshake status + Ignore, // Disconnect request + Ignore, // Wakeup + Ignore // Node info + ) + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // PendingNodeInfo + // + // This state is entered when we have asked the nameserver to provide a description for the peer node we are working with. All + // external Session events and incoming handshake requests are enqueued into their respective queues, TEvNodeInfo + // is the main event that triggers processing. On success, we try to initiate outgoing handshake if needed, or process + // incoming handshakes. On error, we enter HoldByError state. 
+ // + // NOTE: handshake status events are also enqueued as the handshake actor may have generated failure event due to + // timeout or some other reason without waiting for acknowledgement, and it must be processed correctly to prevent + // session hang + + PROXY_STFUNC(PendingNodeInfo, + EnqueueSessionEvent, // Session events + EnqueueIncomingHandshakeEvent, // Incoming handshake requests + EnqueueIncomingHandshakeEvent, // Handshake status + Disconnect, // Disconnect request + ConfigureTimeout, // Wakeup + Configure // Node info + ) + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // PendingConnection + // + // Here we have issued outgoing handshake or have accepted (or maybe both) incoming handshake and we are waiting for + // the status of the handshake. When one of the handshakes finishes, we use this status to establish connection (or to + // go to error state). When one handshake terminates with error while the other is running, we will still wait for the + // second one to finish. + + PROXY_STFUNC(PendingConnection, + EnqueueSessionEvent, // Session events + IncomingHandshake, // Incoming handshake requests + HandleHandshakeStatus, // Handshake status + Disconnect, // Disconnect request + Ignore, // Wakeup + Ignore // Node info + ) + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // StateWork + // + // We have accepted session and process any incoming messages with the session. Incoming handshakes are accepted + // concurrently and applied when finished. 
+ + PROXY_STFUNC(StateWork, + ForwardSessionEventToSession, // Session events + IncomingHandshake, // Incoming handshake requests + HandleHandshakeStatus, // Handshake status + Disconnect, // Disconnect request + Ignore, // Wakeup + Ignore // Node info + ) + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // HoldByError + // + // When something bad happens with the connection, we sleep in this state. After wake up we go back to + // PendingActivation. + + PROXY_STFUNC(HoldByError, + DropSessionEvent, // Session events + RequestNodeInfoForIncomingHandshake, // Incoming handshake requests + Ignore, // Handshake status + Ignore, // Disconnect request + WakeupFromErrorState, // Wakeup + Ignore // Node info + ) + +#undef SESSION_EVENTS +#undef INCOMING_HANDSHAKE_EVENTS +#undef HANDSHAKE_STATUS_EVENTS +#undef PROXY_STFUNC + + void ForwardSessionEventToSession(STATEFN_SIG); + void EnqueueSessionEvent(STATEFN_SIG); + + // Incoming handshake handlers, including special wrapper when the IncomingHandshake is used as fFunc + void IncomingHandshake(STATEFN_SIG) { + switch (ev->GetTypeRewrite()) { + hFunc(TEvHandshakeAsk, IncomingHandshake); + hFunc(TEvHandshakeRequest, IncomingHandshake); + default: + Y_FAIL(); + } + } + void IncomingHandshake(TEvHandshakeAsk::TPtr& ev); + void IncomingHandshake(TEvHandshakeRequest::TPtr& ev); + + void RequestNodeInfo(STATEFN_SIG); + void RequestNodeInfoForIncomingHandshake(STATEFN_SIG); + + void StartInitialHandshake(); + void StartResumeHandshake(ui64 inputCounter); + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Incoming handshake event queue processing + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + void EnqueueIncomingHandshakeEvent(STATEFN_SIG); + void EnqueueIncomingHandshakeEvent(TEvHandshakeDone::TPtr& ev); + void 
EnqueueIncomingHandshakeEvent(TEvHandshakeFail::TPtr& ev); + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // PendingNodeInfo + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + IEventBase* ConfigureTimeoutCookie; // pointer to the scheduled event used to match sent and received events + + void StartConfiguring(); + void Configure(TEvInterconnect::TEvNodeInfo::TPtr& ev); + void ConfigureTimeout(TEvents::TEvWakeup::TPtr& ev); + void ProcessConfigured(); + + void HandleHandshakeStatus(TEvHandshakeDone::TPtr& ev); + void HandleHandshakeStatus(TEvHandshakeFail::TPtr& ev); + + void TransitToErrorState(TString Explanation, bool updateErrorLog = true); + void WakeupFromErrorState(TEvents::TEvWakeup::TPtr& ev); + void Disconnect(); + + const ui32 PeerNodeId; + IActor **DynamicPtr; + + void ValidateEvent(TAutoPtr<IEventHandle>& ev, const char* func) { + if (SelfId().NodeId() == PeerNodeId) { + TString msg = Sprintf("Event Type# 0x%08" PRIx32 " TypeRewrite# 0x%08" PRIx32 + " from Sender# %s sent to the proxy for the node itself via Interconnect;" + " THIS IS NOT A BUG IN INTERCONNECT, check the event sender instead", + ev->Type, ev->GetTypeRewrite(), ev->Sender.ToString().data()); + LOG_ERROR_IC("ICP03", "%s", msg.data()); + Y_VERIFY_DEBUG(false, "%s", msg.data()); + } + + Y_VERIFY(ev->GetTypeRewrite() != TEvInterconnect::EvForward || ev->Recipient.NodeId() == PeerNodeId, + "Recipient/Proxy NodeId mismatch Recipient# %s Type# 0x%08" PRIx32 " PeerNodeId# %" PRIu32 " Func# %s", + ev->Recipient.ToString().data(), ev->Type, PeerNodeId, func); + } + + // Common with helpers + // All proxy actors share the same information in the object + // read only + TInterconnectProxyCommon::TPtr const Common; + + const TActorId& GetNameserviceId() const { + return Common->NameserviceId; + } + + TString TechnicalPeerHostName; + + 
std::shared_ptr<IInterconnectMetrics> Metrics; + + void HandleClosePeerSocket(); + void HandleCloseInputSession(); + void HandlePoisonSession(); + + void HandleSessionBufferSizeRequest(TEvSessionBufferSizeRequest::TPtr& ev); + + bool CleanupEventQueueScheduled = false; + void ScheduleCleanupEventQueue(); + void HandleCleanupEventQueue(); + void CleanupEventQueue(); + + // hold all events before connection is established + struct TPendingSessionEvent { + TInstant Deadline; + ui32 Size; + THolder<IEventHandle> Event; + + TPendingSessionEvent(TInstant deadline, ui32 size, TAutoPtr<IEventHandle> event) + : Deadline(deadline) + , Size(size) + , Event(event) + {} + }; + TDeque<TPendingSessionEvent> PendingSessionEvents; + ui64 PendingSessionEventsSize = 0; + void ProcessPendingSessionEvents(); + void DropSessionEvent(STATEFN_SIG); + + TInterconnectSessionTCP* Session = nullptr; + TActorId SessionID; + + // virtual ids used during handshake to check if it is the connection + // for the same session or to find out the latest handshake + // it's virtual because session actor appears after successful handshake + TActorId SessionVirtualId; + TActorId RemoteSessionVirtualId; + + // allocates one fresh local id from the actor system and wraps it into an actor id carrying this node's id + TActorId GenerateSessionVirtualId() { + ICPROXY_PROFILED; + + const ui64 localId = TlsActivationContext->ExecutorThread.ActorSystem->AllocateIDSpace(1); + return NActors::TActorId(SelfId().NodeId(), 0, localId, 0); + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + TActorId IncomingHandshakeActor; + TInstant IncomingHandshakeActorFilledIn; + TInstant IncomingHandshakeActorReset; + TMaybe<ui64> LastSerialFromIncomingHandshake; + THolder<IEventBase> HeldHandshakeReply; + + // forgets the current incoming handshake actor (if there is one), optionally sending it a poison pill, + // drops the serial and the held reply kept for it, and records the reset time + void DropIncomingHandshake(bool poison = true) { + ICPROXY_PROFILED; + + if (const TActorId& actorId = std::exchange(IncomingHandshakeActor, TActorId())) { + LOG_DEBUG_IC("ICP111", "dropped incoming handshake: %s poison: %s", actorId.ToString().data(), + 
poison ? "true" : "false"); + if (poison) { + Send(actorId, new TEvents::TEvPoisonPill); + } + LastSerialFromIncomingHandshake.Clear(); + HeldHandshakeReply.Reset(); + IncomingHandshakeActorReset = TActivationContext::Now(); + } + } + + void DropOutgoingHandshake(bool poison = true) { + ICPROXY_PROFILED; + + if (const TActorId& actorId = std::exchange(OutgoingHandshakeActor, TActorId())) { + LOG_DEBUG_IC("ICP112", "dropped outgoing handshake: %s poison: %s", actorId.ToString().data(), + poison ? "true" : "false"); + if (poison) { + Send(actorId, new TEvents::TEvPoisonPill); + } + OutgoingHandshakeActorReset = TActivationContext::Now(); + } + } + + void DropHandshakes() { + ICPROXY_PROFILED; + + DropIncomingHandshake(); + DropOutgoingHandshake(); + } + + void PrepareNewSessionHandshake() { + ICPROXY_PROFILED; + + // drop existing session if we have one + if (Session) { + LOG_INFO_IC("ICP04", "terminating current session as we are negotiating a new one"); + IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::Terminate, TDisconnectReason::NewSession()); + } + + // ensure we have no current session + Y_VERIFY(!Session); + + // switch to pending connection state -- we wait for handshakes, we want more handshakes! 
+ SwitchToState(__LINE__, "PendingConnection", &TThis::PendingConnection); + } + + void IssueIncomingHandshakeReply(const TActorId& handshakeId, ui64 peerLocalId, + THolder<IEventBase> event); + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + TActorId OutgoingHandshakeActor; + TInstant OutgoingHandshakeActorCreated; + TInstant OutgoingHandshakeActorReset; + + TInstant LastSessionDieTime; + + void GenerateHttpInfo(NMon::TEvHttpInfo::TPtr& ev); + + void Handle(TEvQueryStats::TPtr& ev); + + TDuration HoldByErrorWakeupDuration = TDuration::Zero(); + TEvents::TEvWakeup* HoldByErrorWakeupCookie; + + THolder<TProgramInfo> RemoteProgramInfo; + NInterconnect::TSecureSocketContext::TPtr SecureContext; + + void Handle(TEvGetSecureSocket::TPtr ev) { + auto socket = MakeIntrusive<NInterconnect::TSecureSocket>(*ev->Get()->Socket, SecureContext); + Send(ev->Sender, new TEvSecureSocket(std::move(socket))); + } + + TDeque<THolder<IEventHandle>> PendingIncomingHandshakeEvents; + + TDeque<std::tuple<TInstant, TString, TString, ui32>> ErrorStateLog; + + void UpdateErrorStateLog(TInstant now, TString kind, TString explanation) { + ICPROXY_PROFILED; + + if (ErrorStateLog) { + auto& back = ErrorStateLog.back(); + TString lastKind, lastExpl; + if (kind == std::get<1>(back) && explanation == std::get<2>(back)) { + std::get<0>(back) = now; + ++std::get<3>(back); + return; + } + } + + ErrorStateLog.emplace_back(now, std::move(kind), std::move(explanation), 1); + if (ErrorStateLog.size() > 20) { + ErrorStateLog.pop_front(); + } + } + + void LogHandshakeFail(TEvHandshakeFail::TPtr& ev, bool inconclusive); + + bool Terminated = false; + void HandleTerminate(); + + void PassAway() override; + }; + +} diff --git a/library/cpp/actors/interconnect/interconnect_tcp_server.cpp b/library/cpp/actors/interconnect/interconnect_tcp_server.cpp new file mode 100644 index 0000000000..b95c994598 --- /dev/null +++ 
b/library/cpp/actors/interconnect/interconnect_tcp_server.cpp @@ -0,0 +1,117 @@ +#include "interconnect_tcp_server.h" +#include "interconnect_handshake.h" + +#include <library/cpp/actors/core/log.h> +#include <library/cpp/actors/protos/services_common.pb.h> + +#include "interconnect_common.h" + +namespace NActors { + TInterconnectListenerTCP::TInterconnectListenerTCP(const TString& address, ui16 port, TInterconnectProxyCommon::TPtr common, const TMaybe<SOCKET>& socket) + : TActor(&TThis::Initial) + , TInterconnectLoggingBase(Sprintf("ICListener: %s", SelfId().ToString().data())) + , Address(address.c_str(), port) + , Listener( + socket + ? new NInterconnect::TStreamSocket(*socket) + : nullptr) + , ExternalSocket(!!Listener) + , ProxyCommonCtx(std::move(common)) + { + if (ExternalSocket) { + SetNonBlock(*Listener); + } + } + + TAutoPtr<IEventHandle> TInterconnectListenerTCP::AfterRegister(const TActorId& self, const TActorId& parentId) { + return new IEventHandle(self, parentId, new TEvents::TEvBootstrap, 0); + } + + void TInterconnectListenerTCP::Die(const TActorContext& ctx) { + LOG_DEBUG_IC("ICL08", "Dying"); + TActor::Die(ctx); + } + + int TInterconnectListenerTCP::Bind() { + NInterconnect::TAddress addr = Address; + + if (ProxyCommonCtx->Settings.BindOnAllAddresses) { + switch (addr.GetFamily()) { + case AF_INET: { + auto *sa = reinterpret_cast<sockaddr_in*>(addr.SockAddr()); + sa->sin_addr = {INADDR_ANY}; + break; + } + + case AF_INET6: { + auto *sa = reinterpret_cast<sockaddr_in6*>(addr.SockAddr()); + sa->sin6_addr = in6addr_any; + break; + } + + default: + Y_FAIL("Unsupported address family"); + } + } + + Listener = NInterconnect::TStreamSocket::Make(addr.GetFamily()); + if (*Listener == -1) { + return errno; + } + SetNonBlock(*Listener); + Listener->SetSendBufferSize(ProxyCommonCtx->Settings.GetSendBufferSize()); // TODO(alexvru): WTF? 
+ SetSockOpt(*Listener, SOL_SOCKET, SO_REUSEADDR, 1); + if (const auto e = -Listener->Bind(addr)) { + return e; + } else if (const auto e = -Listener->Listen(SOMAXCONN)) { + return e; + } else { + return 0; + } + } + + void TInterconnectListenerTCP::Bootstrap(const TActorContext& ctx) { + if (!Listener) { + if (const int err = Bind()) { + LOG_ERROR_IC("ICL01", "Bind failed: %s (%s)", strerror(err), Address.ToString().data()); + Listener.Reset(); + Become(&TThis::Initial, TDuration::Seconds(1), new TEvents::TEvBootstrap); + return; + } + } + if (const auto& callback = ProxyCommonCtx->InitWhiteboard) { + callback(Address.GetPort(), TlsActivationContext->ExecutorThread.ActorSystem); + } + const bool success = ctx.Send(MakePollerActorId(), new TEvPollerRegister(Listener, SelfId(), {})); + Y_VERIFY(success); + Become(&TThis::Listen); + } + + void TInterconnectListenerTCP::Handle(TEvPollerRegisterResult::TPtr ev, const TActorContext& ctx) { + PollerToken = std::move(ev->Get()->PollerToken); + Process(ctx); + } + + void TInterconnectListenerTCP::Process(const TActorContext& ctx) { + for (;;) { + NInterconnect::TAddress address; + const int r = Listener->Accept(address); + if (r >= 0) { + LOG_DEBUG_IC("ICL04", "Accepted from: %s", address.ToString().data()); + auto socket = MakeIntrusive<NInterconnect::TStreamSocket>(static_cast<SOCKET>(r)); + ctx.Register(CreateIncomingHandshakeActor(ProxyCommonCtx, std::move(socket))); + continue; + } else if (-r != EAGAIN && -r != EWOULDBLOCK) { + Y_VERIFY(-r != ENFILE && -r != EMFILE && !ExternalSocket); + LOG_ERROR_IC("ICL06", "Listen failed: %s (%s)", strerror(-r), Address.ToString().data()); + Listener.Reset(); + PollerToken.Reset(); + Become(&TThis::Initial, TDuration::Seconds(1), new TEvents::TEvBootstrap); + } else if (PollerToken) { + PollerToken->Request(true, false); + } + break; + } + } + +} diff --git a/library/cpp/actors/interconnect/interconnect_tcp_server.h b/library/cpp/actors/interconnect/interconnect_tcp_server.h new 
file mode 100644 index 0000000000..fc71073c2d --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_tcp_server.h @@ -0,0 +1,57 @@ +#pragma once + +#include <library/cpp/actors/core/hfunc.h> +#include <library/cpp/actors/core/event_pb.h> +#include <library/cpp/actors/core/events.h> + +#include "interconnect_common.h" +#include "poller_actor.h" +#include "events_local.h" + +namespace NActors { + class TInterconnectListenerTCP: public TActor<TInterconnectListenerTCP>, public TInterconnectLoggingBase { + public: + static constexpr EActivityType ActorActivityType() { + return INTERCONNECT_COMMON; + } + + TInterconnectListenerTCP(const TString& address, ui16 port, TInterconnectProxyCommon::TPtr common, const TMaybe<SOCKET>& socket = Nothing()); + int Bind(); + + private: + STFUNC(Initial) { + switch (ev->GetTypeRewrite()) { + CFunc(TEvents::TEvBootstrap::EventType, Bootstrap); + CFunc(TEvents::TEvPoisonPill::EventType, Die); + } + } + + STFUNC(Listen) { + switch (ev->GetTypeRewrite()) { + CFunc(TEvents::TEvPoisonPill::EventType, Die); + HFunc(TEvPollerRegisterResult, Handle); + CFunc(TEvPollerReady::EventType, Process); + } + } + + TAutoPtr<IEventHandle> AfterRegister(const TActorId& self, const TActorId& parentId) override; + + void Die(const TActorContext& ctx) override; + + void Bootstrap(const TActorContext& ctx); + void Handle(TEvPollerRegisterResult::TPtr ev, const TActorContext& ctx); + + void Process(const TActorContext& ctx); + + const NInterconnect::TAddress Address; + TIntrusivePtr<NInterconnect::TStreamSocket> Listener; + const bool ExternalSocket; + TPollerToken::TPtr PollerToken; + TInterconnectProxyCommon::TPtr const ProxyCommonCtx; + }; + + static inline TActorId MakeInterconnectListenerActorId(bool dynamic) { + char x[12] = {'I', 'C', 'L', 'i', 's', 't', 'e', 'n', 'e', 'r', '/', dynamic ? 
'D' : 'S'}; + return TActorId(0, TStringBuf(x, 12)); + } +} diff --git a/library/cpp/actors/interconnect/interconnect_tcp_session.cpp b/library/cpp/actors/interconnect/interconnect_tcp_session.cpp new file mode 100644 index 0000000000..2ded7f9f53 --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_tcp_session.cpp @@ -0,0 +1,1228 @@ +#include "interconnect_tcp_proxy.h" +#include "interconnect_tcp_session.h" +#include "interconnect_handshake.h" + +#include <library/cpp/actors/core/probes.h> +#include <library/cpp/actors/core/log.h> +#include <library/cpp/actors/core/interconnect.h> +#include <library/cpp/actors/util/datetime.h> +#include <library/cpp/actors/protos/services_common.pb.h> +#include <library/cpp/monlib/service/pages/templates.h> + +namespace NActors { + LWTRACE_USING(ACTORLIB_PROVIDER); + + DECLARE_WILSON_EVENT(OutputQueuePush, (ui32, QueueSizeInEvents), (ui64, QueueSizeInBytes)); + + template<typename T> + T Coalesce(T&& x) { + return x; + } + + template<typename T, typename T2, typename... TRest> + typename std::common_type<T, T2, TRest...>::type Coalesce(T&& first, T2&& mid, TRest&&... 
rest) { + if (first != typename std::remove_reference<T>::type()) { + return first; + } else { + return Coalesce(std::forward<T2>(mid), std::forward<TRest>(rest)...); + } + } + + TInterconnectSessionTCP::TInterconnectSessionTCP(TInterconnectProxyTCP* const proxy, TSessionParams params) + : TActor(&TInterconnectSessionTCP::StateFunc) + , Created(TInstant::Now()) + , Proxy(proxy) + , CloseOnIdleWatchdog(GetCloseOnIdleTimeout(), std::bind(&TThis::OnCloseOnIdleTimerHit, this)) + , LostConnectionWatchdog(GetLostConnectionTimeout(), std::bind(&TThis::OnLostConnectionTimerHit, this)) + , Params(std::move(params)) + , TotalOutputQueueSize(0) + , OutputStuckFlag(false) + , OutputQueueUtilization(16) + , OutputCounter(0ULL) + { + Proxy->Metrics->SetConnected(0); + ReceiveContext.Reset(new TReceiveContext); + } + + TInterconnectSessionTCP::~TInterconnectSessionTCP() { + // close socket ASAP when actor system is being shut down + if (Socket) { + Socket->Shutdown(SHUT_RDWR); + } + } + + void TInterconnectSessionTCP::Init() { + auto destroyCallback = [as = TlsActivationContext->ExecutorThread.ActorSystem, id = Proxy->Common->DestructorId](THolder<IEventBase> event) { + as->Send(id, event.Release()); + }; + Pool.ConstructInPlace(Proxy->Common, std::move(destroyCallback)); + ChannelScheduler.ConstructInPlace(Proxy->PeerNodeId, Proxy->Common->ChannelsConfig, Proxy->Metrics, *Pool, + Proxy->Common->Settings.MaxSerializedEventSize, Params); + + LOG_INFO(*TlsActivationContext, NActorsServices::INTERCONNECT_STATUS, "[%u] session created", Proxy->PeerNodeId); + SetPrefix(Sprintf("Session %s [node %" PRIu32 "]", SelfId().ToString().data(), Proxy->PeerNodeId)); + SendUpdateToWhiteboard(); + } + + void TInterconnectSessionTCP::CloseInputSession() { + Send(ReceiverId, new TEvInterconnect::TEvCloseInputSession); + } + + void TInterconnectSessionTCP::Handle(TEvTerminate::TPtr& ev) { + Terminate(ev->Get()->Reason); + } + + void TInterconnectSessionTCP::HandlePoison() { + 
Terminate(TDisconnectReason()); + } + + void TInterconnectSessionTCP::Terminate(TDisconnectReason reason) { + LOG_INFO_IC_SESSION("ICS01", "socket: %" PRIi64, (Socket ? i64(*Socket) : -1)); + + IActor::InvokeOtherActor(*Proxy, &TInterconnectProxyTCP::UnregisterSession, this); + ShutdownSocket(std::move(reason)); + + for (const auto& kv : Subscribers) { + Send(kv.first, new TEvInterconnect::TEvNodeDisconnected(Proxy->PeerNodeId), 0, kv.second); + } + Proxy->Metrics->SubSubscribersCount(Subscribers.size()); + Subscribers.clear(); + + ChannelScheduler->ForEach([&](TEventOutputChannel& channel) { + channel.NotifyUndelivered(); + }); + + if (ReceiverId) { + Send(ReceiverId, new TEvents::TEvPoisonPill); + } + + SendUpdateToWhiteboard(false); + + Proxy->Metrics->SubOutputBuffersTotalSize(TotalOutputQueueSize); + Proxy->Metrics->SubInflightDataAmount(InflightDataAmount); + + LOG_INFO(*TlsActivationContext, NActorsServices::INTERCONNECT_STATUS, "[%u] session destroyed", Proxy->PeerNodeId); + + if (!Subscribers.empty()) { + Proxy->Metrics->SubSubscribersCount(Subscribers.size()); + } + + TActor::PassAway(); + } + + void TInterconnectSessionTCP::PassAway() { + Y_FAIL("TInterconnectSessionTCP::PassAway() can't be called directly"); + } + + void TInterconnectSessionTCP::Forward(STATEFN_SIG) { + Proxy->ValidateEvent(ev, "Forward"); + + LOG_DEBUG_IC_SESSION("ICS02", "send event from: %s to: %s", ev->Sender.ToString().data(), ev->Recipient.ToString().data()); + ++MessagesGot; + + if (ev->Flags & IEventHandle::FlagSubscribeOnSession) { + Subscribe(ev); + } + + ui16 evChannel = ev->GetChannel(); + auto& oChannel = ChannelScheduler->GetOutputChannel(evChannel); + const bool wasWorking = oChannel.IsWorking(); + + const auto [dataSize, event] = oChannel.Push(*ev); + LWTRACK(ForwardEvent, event->Orbit, Proxy->PeerNodeId, event->Descr.Type, event->Descr.Flags, LWACTORID(event->Descr.Recipient), LWACTORID(event->Descr.Sender), event->Descr.Cookie, event->EventSerializedSize); + + 
TotalOutputQueueSize += dataSize; + Proxy->Metrics->AddOutputBuffersTotalSize(dataSize); + if (!wasWorking) { + // this channel has returned to work -- it was empty and this we have just put first event in the queue + ChannelScheduler->AddToHeap(oChannel, EqualizeCounter); + } + + SetOutputStuckFlag(true); + ++NumEventsInReadyChannels; + + LWTRACK(EnqueueEvent, event->Orbit, Proxy->PeerNodeId, NumEventsInReadyChannels, GetWriteBlockedTotal(), evChannel, oChannel.GetQueueSize(), oChannel.GetBufferedAmountOfData()); + WILSON_TRACE(*TlsActivationContext, &ev->TraceId, OutputQueuePush, + QueueSizeInEvents = oChannel.GetQueueSize(), + QueueSizeInBytes = oChannel.GetBufferedAmountOfData()); + + // check for overloaded queues + ui64 sendBufferDieLimit = Proxy->Common->Settings.SendBufferDieLimitInMB * ui64(1 << 20); + if (sendBufferDieLimit != 0 && TotalOutputQueueSize > sendBufferDieLimit) { + LOG_ERROR_IC_SESSION("ICS03", "socket: %" PRIi64 " output queue is overloaded, actual %" PRIu64 " bytes, limit is %" PRIu64, + Socket ? 
i64(*Socket) : -1, TotalOutputQueueSize, sendBufferDieLimit); + return Terminate(TDisconnectReason::QueueOverload()); + } + + ui64 outputBuffersTotalSizeLimit = Proxy->Common->Settings.OutputBuffersTotalSizeLimitInMB * ui64(1 << 20); + if (outputBuffersTotalSizeLimit != 0 && static_cast<ui64>(Proxy->Metrics->GetOutputBuffersTotalSize()) > outputBuffersTotalSizeLimit) { + LOG_ERROR_IC_SESSION("ICS77", "Exceeded total limit on output buffers size"); + if (AtomicTryLock(&Proxy->Common->StartedSessionKiller)) { + CreateSessionKillingActor(Proxy->Common); + } + } + + if (RamInQueue && !RamInQueue->Batching) { + // we have pending TEvRam, so GenerateTraffic will be called no matter what + } else if (InflightDataAmount >= GetTotalInflightAmountOfData() || !Socket || ReceiveContext->WriteBlockedByFullSendBuffer) { + // we can't issue more traffic now; GenerateTraffic will be called upon unblocking + } else if (TotalOutputQueueSize >= 64 * 1024) { + // output queue size is quite big to issue some traffic + GenerateTraffic(); + } else if (!RamInQueue) { + Y_VERIFY_DEBUG(NumEventsInReadyChannels == 1); + RamInQueue = new TEvRam(true); + auto *ev = new IEventHandle(SelfId(), {}, RamInQueue); + const TDuration batchPeriod = Proxy->Common->Settings.BatchPeriod; + if (batchPeriod != TDuration()) { + TActivationContext::Schedule(batchPeriod, ev); + } else { + TActivationContext::Send(ev); + } + LWPROBE(StartBatching, Proxy->PeerNodeId, batchPeriod.MillisecondsFloat()); + LOG_DEBUG_IC_SESSION("ICS17", "batching started"); + } + } + + void TInterconnectSessionTCP::Subscribe(STATEFN_SIG) { + LOG_DEBUG_IC_SESSION("ICS04", "subscribe for session state for %s", ev->Sender.ToString().data()); + const auto [it, inserted] = Subscribers.emplace(ev->Sender, ev->Cookie); + if (inserted) { + Proxy->Metrics->IncSubscribersCount(); + } else { + it->second = ev->Cookie; + } + Send(ev->Sender, new TEvInterconnect::TEvNodeConnected(Proxy->PeerNodeId), 0, ev->Cookie); + } + + void 
TInterconnectSessionTCP::Unsubscribe(STATEFN_SIG) {
        // Removes the sender from the subscriber set; erase() yields the number of removed entries, which is
        // exactly what gets subtracted from the subscriber metric.
        LOG_DEBUG_IC_SESSION("ICS05", "unsubscribe for session state for %s", ev->Sender.ToString().data());
        Proxy->Metrics->SubSubscribersCount( Subscribers.erase(ev->Sender));
    }

    // Handles an incoming handshake request: drops the current connection (without starting our own handshake),
    // locks the serial of the last processed input packet and returns it to the peer in TEvHandshakeAck.
    // The lock taken here is released in SetNewConnection().
    THolder<TEvHandshakeAck> TInterconnectSessionTCP::ProcessHandshakeRequest(TEvHandshakeAsk::TPtr& ev) {
        TEvHandshakeAsk *msg = ev->Get();

        // close existing input session, if any, and do nothing upon its destruction
        ReestablishConnection({}, false, TDisconnectReason::NewSession());
        const ui64 lastInputSerial = ReceiveContext->LockLastProcessedPacketSerial();

        LOG_INFO_IC_SESSION("ICS08", "incoming handshake Self# %s Peer# %s Counter# %" PRIu64 " LastInputSerial# %" PRIu64,
            msg->Self.ToString().data(), msg->Peer.ToString().data(), msg->Counter, lastInputSerial);

        return MakeHolder<TEvHandshakeAck>(msg->Peer, lastInputSerial, Params);
    }

    // Installs the socket obtained from a finished handshake: creates the input session actor, registers the
    // socket in the poller, rewinds the send queue so that unconfirmed data packets are resent, and kicks
    // traffic generation. If an input session actor still exists, the event is handed to
    // ReestablishConnection() instead and nothing else is done here.
    void TInterconnectSessionTCP::SetNewConnection(TEvHandshakeDone::TPtr& ev) {
        if (ReceiverId) {
            // upon destruction of input session actor invoke this callback again
            ReestablishConnection(std::move(ev), false, TDisconnectReason::NewSession());
            return;
        }

        LOG_INFO_IC_SESSION("ICS09", "handshake done sender: %s self: %s peer: %s socket: %" PRIi64,
            ev->Sender.ToString().data(), ev->Get()->Self.ToString().data(), ev->Get()->Peer.ToString().data(),
            i64(*ev->Get()->Socket));

        NewConnectionSet = TActivationContext::Now();
        PacketsWrittenToSocket = 0;

        // read the buffer size before the socket is moved out of the event
        SendBufferSize = ev->Get()->Socket->GetSendBufferSize();
        Socket = std::move(ev->Get()->Socket);

        // there may be a race, so never go below what we have already seen confirmed
        const ui64 nextPacket = Max(LastConfirmed, ev->Get()->NextPacket);

        // arm watchdogs
        CloseOnIdleWatchdog.Arm(SelfId());

        // reset activity timestamps
        LastInputActivityTimestamp = LastPayloadActivityTimestamp = TActivationContext::Now();

        LOG_INFO_IC_SESSION("ICS10", "traffic start");

        // create input session actor; with encryption it shares our mailbox, otherwise it gets its own
        auto actor = MakeHolder<TInputSessionTCP>(SelfId(), Socket, ReceiveContext, Proxy->Common,
            Proxy->Metrics, Proxy->PeerNodeId, nextPacket, GetDeadPeerTimeout(), Params);
        ReceiveContext->UnlockLastProcessedPacketSerial();
        ReceiverId = Params.Encryption ? RegisterWithSameMailbox(actor.Release()) : Register(actor.Release(), TMailboxType::ReadAsFilled);

        // register our socket in poller actor
        LOG_DEBUG_IC_SESSION("ICS11", "registering socket in PollerActor");
        const bool success = Send(MakePollerActorId(), new TEvPollerRegister(Socket, ReceiverId, SelfId()));
        Y_VERIFY(success);
        ReceiveContext->WriteBlockedByFullSendBuffer = false;

        LostConnectionWatchdog.Disarm();
        Proxy->Metrics->SetConnected(1);
        LOG_INFO(*TlsActivationContext, NActorsServices::INTERCONNECT_STATUS, "[%u] connected", Proxy->PeerNodeId);

        // arm pinger timer
        ResetFlushLogic();

        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        // REINITIALIZE SEND QUEUE
        //
        // scan through the send queue and leave only those packets that carry data -- we will simply resend them;
        // drop all other (auxiliary) packets; also reset packet buffers to start sending from the beginning
        // and reset SendQueuePos

        // drop confirmed packets first as we do not need unwanted retransmissions; SendQueuePos is moved to end()
        // beforehand so that DropConfirmed() may walk the whole queue
        SendQueuePos = SendQueue.end();
        DropConfirmed(nextPacket);

        // `next` is taken before the splice because splicing moves `it` into another list
        for (TSendQueue::iterator it = SendQueue.begin(); it != SendQueue.end(); ) {
            const TSendQueue::iterator next = std::next(it);
            if (it->IsEmpty()) {
                SendQueueCache.splice(SendQueueCache.begin(), SendQueue, it);
            } else {
                it->ResetBufs();
            }
            it = next;
        }
        TrimSendQueueCache();
        SendQueuePos = SendQueue.begin();

        // NOTE(review): there is no `break` in this loop, so `s` ends up holding the serial of the LAST non-empty
        // packet, while the log line below labels it "SendQueuePos.Serial" -- confirm which one is intended.
        TMaybe<ui64> s;
        for (auto it = SendQueuePos; it != SendQueue.end(); ++it) {
            if (!it->IsEmpty()) {
                s = it->GetSerial();
            }
        }
        // an empty queue yields Max<ui64>(), which trivially satisfies the check below
        const ui64 serial = s.GetOrElse(Max<ui64>());

        Y_VERIFY(serial > LastConfirmed, "%s serial# %" PRIu64 " LastConfirmed# %" PRIu64, LogPrefix.data(), serial, LastConfirmed);
        LOG_DEBUG_IC_SESSION("ICS06", "rewind SendQueue size# %zu LastConfirmed# %" PRIu64 " SendQueuePos.Serial# %" PRIu64 "\n",
            SendQueue.size(), LastConfirmed, serial);

        // everything left in the queue is going to be written again: header (frame-version dependent) plus payload
        BytesUnwritten = 0;
        for (const auto& packet : SendQueue) {
            BytesUnwritten += (Params.UseModernFrame ? sizeof(TTcpPacketHeader_v2) : sizeof(TTcpPacketHeader_v1)) +
                packet.GetDataSize();
        }

        SwitchStuckPeriod();

        LastHandshakeDone = TActivationContext::Now();

        // forget any pending TEvRam so that GenerateTraffic() below is not skipped
        RamInQueue = nullptr;
        GenerateTraffic();
    }

    // Processes an update from the input session actor (ignored unless it comes from the current ReceiverId):
    // refreshes ping and activity timestamps, accounts received bytes to decide whether a confirmation has to be
    // forced, drops confirmed packets, possibly generates more traffic, and finally advances the shared
    // UpdateState state machine.
    void TInterconnectSessionTCP::Handle(TEvUpdateFromInputSession::TPtr& ev) {
        if (ev->Sender == ReceiverId) {
            TEvUpdateFromInputSession& msg = *ev->Get();

            // update ping time
            Ping = msg.Ping;
            LWPROBE(UpdateFromInputSession, Proxy->PeerNodeId, Ping.MillisecondsFloat());

            bool needConfirm = false;

            // update activity timer for dead peer checker
            LastInputActivityTimestamp = TActivationContext::Now();

            if (msg.NumDataBytes) {
                // confirm at once when a quarter of the inflight window is unconfirmed, otherwise by timer
                UnconfirmedBytes += msg.NumDataBytes;
                if (UnconfirmedBytes >= GetTotalInflightAmountOfData() / 4) {
                    needConfirm = true;
                } else {
                    SetForcePacketTimestamp(Proxy->Common->Settings.ForceConfirmPeriod);
                }

                // reset payload watchdog that controls close-on-idle behaviour
                LastPayloadActivityTimestamp = TActivationContext::Now();
                CloseOnIdleWatchdog.Reset();
            }

            bool unblockedSomething = false;
            LWPROBE_IF_TOO_LONG(SlowICDropConfirmed, Proxy->PeerNodeId, ms) {
                unblockedSomething = DropConfirmed(msg.ConfirmedByInput);
            }

            // generate more traffic if we have unblocked state now
            if (unblockedSomething) {
                LWPROBE(UnblockByDropConfirmed, Proxy->PeerNodeId, NHPTimer::GetSeconds(GetCycleCountFast() - ev->SendTime) * 1000.0);
                GenerateTraffic();
            }

            // if we haven't generated any packets, then make a lone Flush packet without any data
            if (needConfirm && Socket) {
                ++ConfirmPacketsForcedBySize;
                MakePacket(false);
            }

            // retry loop: compare_exchange_weak may fail (spuriously or because the state has changed), in which
            // case the state is re-read and the switch is evaluated again
            for (;;) {
                switch (EUpdateState state = ReceiveContext->UpdateState) {
                    case EUpdateState::NONE:
                    case EUpdateState::CONFIRMING:
                        Y_FAIL("unexpected state");

                    case EUpdateState::INFLIGHT:
                        // this message we are processing was the only one in flight, so we can reset state to NONE here
                        if (ReceiveContext->UpdateState.compare_exchange_weak(state, EUpdateState::NONE)) {
                            return;
                        }
                        break;

                    case EUpdateState::INFLIGHT_AND_PENDING:
                        // there are more messages pending from the input session actor, so we have to inform it to
                        // release that message
                        if (ReceiveContext->UpdateState.compare_exchange_weak(state, EUpdateState::CONFIRMING)) {
                            Send(ev->Sender, new TEvConfirmUpdate);
                            return;
                        }
                        break;
                }
            }
        }
    }

    // Handles TEvRam; only the event currently tracked in RamInQueue triggers traffic generation, any other
    // (stale) TEvRam is ignored.
    void TInterconnectSessionTCP::HandleRam(TEvRam::TPtr& ev) {
        if (ev->Get() == RamInQueue) {
            LWPROBE(FinishRam, Proxy->PeerNodeId, NHPTimer::GetSeconds(GetCycleCountFast() - ev->SendTime) * 1000.0);
            RamInQueue = nullptr;
            GenerateTraffic();
        }
    }

    // Builds data packets from the ready channels and writes them to the socket. Generation stops when there is
    // no socket, no events, the inflight limit is reached, the socket is write-blocked, or generateLimit bytes
    // have been produced (in which case a non-batching TEvRam is sent to self to continue later).
    // Terminates the session if an event turns out to be too large to serialize.
    void TInterconnectSessionTCP::GenerateTraffic() {
        // generate ping request, if needed
        IssuePingRequest();

        if (RamInQueue && !RamInQueue->Batching) {
            LWPROBE(SkipGenerateTraffic, Proxy->PeerNodeId, NHPTimer::GetSeconds(GetCycleCountFast() - RamStartedCycles) * 1000.0);
            return; // we'll do it a bit later
        } else {
            // either nothing is pending or it is a batching TEvRam, which becomes stale from now on
            RamInQueue = nullptr;
        }

        LOG_DEBUG_IC_SESSION("ICS19", "GenerateTraffic");

        // There is a tradeoff between fairness and efficiency.
        // The less traffic is generated here, the less buffering is after fair scheduler,
        // the more fair system is, the less latency is present.
        // The more traffic is generated here, the less syscalls and actor-system overhead occurs,
        // the less cpu is consumed.
        static const ui64 generateLimit = 64 * 1024;

        const ui64 sizeBefore = TotalOutputQueueSize;
        ui32 generatedPackets = 0;
        ui64 generatedBytes = 0;
        ui64 generateStarted = GetCycleCountFast();

        // apply traffic changes
        auto accountTraffic = [&] { ChannelScheduler->ForEach([](TEventOutputChannel& channel) { channel.AccountTraffic(); }); };

        // first, we create as many data packets as we can generate under certain conditions; they include presence
        // of events in channels queues and in flight fitting into requested limit; after we hit one of these conditions
        // we exit cycle
        while (Socket && NumEventsInReadyChannels && InflightDataAmount < GetTotalInflightAmountOfData() && !ReceiveContext->WriteBlockedByFullSendBuffer) {
            if (generatedBytes >= generateLimit) {
                // resume later but ensure that we have issued at least one packet
                RamInQueue = new TEvRam(false);
                Send(SelfId(), RamInQueue);
                RamStartedCycles = GetCycleCountFast();
                LWPROBE(StartRam, Proxy->PeerNodeId);
                break;
            }

            try {
                generatedBytes += MakePacket(true);
                ++generatedPackets;
            } catch (const TExSerializedEventTooLarge& ex) {
                // terminate session if the event can't be serialized properly
                accountTraffic();
                LOG_CRIT_IC("ICS31", "serialized event Type# 0x%08" PRIx32 " is too large", ex.Type);
                return Terminate(TDisconnectReason::EventTooLarge());
            }
        }

        // the socket may have been dropped while making packets, so re-check before writing
        if (Socket) {
            WriteData();
        }

        LWPROBE(GenerateTraffic, Proxy->PeerNodeId, NHPTimer::GetSeconds(GetCycleCountFast() - generateStarted) * 1000.0, sizeBefore - TotalOutputQueueSize, generatedPackets, generatedBytes);

        accountTraffic();
        EqualizeCounter += ChannelScheduler->Equalize();
    }

    // Asks the proxy to start a resume handshake, passing it the (now locked) serial of the last processed
    // input packet.
    void TInterconnectSessionTCP::StartHandshake() {
        LOG_INFO_IC_SESSION("ICS15", "start handshake");
        IActor::InvokeOtherActor(*Proxy, &TInterconnectProxyTCP::StartResumeHandshake, ReceiveContext->LockLastProcessedPacketSerial());
    }

    void
TInterconnectSessionTCP::ReestablishConnectionWithHandshake(TDisconnectReason reason) { + ReestablishConnection({}, true, std::move(reason)); + } + + void TInterconnectSessionTCP::ReestablishConnection(TEvHandshakeDone::TPtr&& ev, bool startHandshakeOnSessionClose, + TDisconnectReason reason) { + if (Socket) { + LOG_INFO_IC_SESSION("ICS13", "reestablish connection"); + ShutdownSocket(std::move(reason)); // stop sending/receiving on socket + PendingHandshakeDoneEvent = std::move(ev); + StartHandshakeOnSessionClose = startHandshakeOnSessionClose; + if (!ReceiverId) { + ReestablishConnectionExecute(); + } + } + } + + void TInterconnectSessionTCP::OnDisconnect(TEvSocketDisconnect::TPtr& ev) { + if (ev->Sender == ReceiverId) { + const bool wasConnected(Socket); + LOG_INFO_IC_SESSION("ICS07", "socket disconnect %" PRIi64 " reason# %s", Socket ? i64(*Socket) : -1, ev->Get()->Reason.ToString().data()); + ReceiverId = TActorId(); // reset receiver actor id as we have no more receiver yet + if (wasConnected) { + // we were sucessfully connected and did not expect failure, so it arrived from the input side; we should + // restart handshake process, closing our part of socket first + ShutdownSocket(ev->Get()->Reason); + StartHandshake(); + } else { + ReestablishConnectionExecute(); + } + } + } + + void TInterconnectSessionTCP::ShutdownSocket(TDisconnectReason reason) { + if (Socket) { + if (const TString& s = reason.ToString()) { + Proxy->Metrics->IncDisconnectByReason(s); + } + + LOG_INFO_IC_SESSION("ICS25", "shutdown socket, reason# %s", reason.ToString().data()); + Proxy->UpdateErrorStateLog(TActivationContext::Now(), "close_socket", reason.ToString().data()); + Socket->Shutdown(SHUT_RDWR); + Socket.Reset(); + Proxy->Metrics->IncDisconnections(); + CloseOnIdleWatchdog.Disarm(); + LostConnectionWatchdog.Arm(SelfId()); + Proxy->Metrics->SetConnected(0); + LOG_INFO(*TlsActivationContext, NActorsServices::INTERCONNECT_STATUS, "[%u] disconnected", Proxy->PeerNodeId); + } + } + + 
void TInterconnectSessionTCP::ReestablishConnectionExecute() { + bool startHandshakeOnSessionClose = std::exchange(StartHandshakeOnSessionClose, false); + TEvHandshakeDone::TPtr ev = std::move(PendingHandshakeDoneEvent); + + if (startHandshakeOnSessionClose) { + StartHandshake(); + } else if (ev) { + SetNewConnection(ev); + } + } + + void TInterconnectSessionTCP::Handle(TEvPollerReady::TPtr& ev) { + LOG_DEBUG_IC_SESSION("ICS29", "HandleReadyWrite WriteBlockedByFullSendBuffer# %s", + ReceiveContext->WriteBlockedByFullSendBuffer ? "true" : "false"); + if (std::exchange(ReceiveContext->WriteBlockedByFullSendBuffer, false)) { + Proxy->Metrics->IncUsefulWriteWakeups(); + ui64 nowCycles = GetCycleCountFast(); + double blockedUs = NHPTimer::GetSeconds(nowCycles - WriteBlockedCycles) * 1000000.0; + LWPROBE(ReadyWrite, Proxy->PeerNodeId, NHPTimer::GetSeconds(nowCycles - ev->SendTime) * 1000.0, blockedUs / 1000.0); + WriteBlockedTotal += TDuration::MicroSeconds(blockedUs); + GenerateTraffic(); + } else if (!ev->Cookie) { + Proxy->Metrics->IncSpuriousWriteWakeups(); + } + if (Params.Encryption && ReceiveContext->ReadPending && !ev->Cookie) { + Send(ReceiverId, ev->Release().Release(), 0, 1); + } + } + + void TInterconnectSessionTCP::Handle(TEvPollerRegisterResult::TPtr ev) { + PollerToken = std::move(ev->Get()->PollerToken); + if (ReceiveContext->WriteBlockedByFullSendBuffer) { + if (Params.Encryption) { + auto *secure = static_cast<NInterconnect::TSecureSocket*>(Socket.Get()); + PollerToken->Request(secure->WantRead(), secure->WantWrite()); + } else { + PollerToken->Request(false, true); + } + } + } + + void TInterconnectSessionTCP::WriteData() { + ui64 written = 0; + + Y_VERIFY(Socket); // ensure that socket wasn't closed + + LWPROBE_IF_TOO_LONG(SlowICWriteData, Proxy->PeerNodeId, ms) { + constexpr ui32 iovLimit = 256; +#ifdef _linux_ + ui32 maxElementsInIOV = Min<ui32>(iovLimit, sysconf(_SC_IOV_MAX)); +#else + ui32 maxElementsInIOV = 64; +#endif + if (Params.Encryption) { 
+ maxElementsInIOV = 1; + } + + // vector of write buffers with preallocated stack space + TStackVec<TConstIoVec, iovLimit> wbuffers; + + LOG_DEBUG_IC_SESSION("ICS30", "WriteData WriteBlockedByFullSendBuffer# %s SendQueue.size# %zu", + ReceiveContext->WriteBlockedByFullSendBuffer ? "true" : "false", SendQueue.size()); + + // update last confirmed packet number if it has changed + if (SendQueuePos != SendQueue.end()) { + SendQueuePos->UpdateConfirmIfPossible(ReceiveContext->GetLastProcessedPacketSerial()); + } + + while (SendQueuePos != SendQueue.end() && !ReceiveContext->WriteBlockedByFullSendBuffer) { + for (auto it = SendQueuePos; it != SendQueue.end() && wbuffers.size() < maxElementsInIOV; ++it) { + it->AppendToIoVector(wbuffers, maxElementsInIOV); + } + + const struct iovec* iovec = reinterpret_cast<const struct iovec*>(wbuffers.data()); + int iovcnt = wbuffers.size(); + + Y_VERIFY(iovcnt > 0); + Y_VERIFY(iovec->iov_len > 0); + + TString err; + ssize_t r = 0; + do { +#ifndef _win_ + r = iovcnt == 1 ? 
Socket->Send(iovec[0].iov_base, iovec[0].iov_len, &err) : Socket->WriteV(iovec, iovcnt); +#else + r = Socket->Send(iovec[0].iov_base, iovec[0].iov_len, &err); +#endif + Proxy->Metrics->IncSendSyscalls(); + } while (r == -EINTR); + + LOG_DEBUG_IC_SESSION("ICS16", "written# %zd iovcnt# %d err# %s", r, iovcnt, err.data()); + + wbuffers.clear(); + + if (r > 0) { + Y_VERIFY(static_cast<size_t>(r) <= BytesUnwritten); + BytesUnwritten -= r; + written += r; + ui64 packets = 0; + + // advance SendQueuePos to eat all processed items + for (size_t amount = r; amount && SendQueuePos->DropBufs(amount); ++SendQueuePos) { + if (!SendQueuePos->IsEmpty()) { + LastSentSerial = Max(LastSentSerial, SendQueuePos->GetSerial()); + } + ++PacketsWrittenToSocket; + ++packets; + LWTRACK(PacketWrittenToSocket, SendQueuePos->Orbit, Proxy->PeerNodeId, PacketsWrittenToSocket, SendQueuePos->TriedWriting, SendQueuePos->GetDataSize(), BytesUnwritten, GetWriteBlockedTotal(), (SOCKET)*Socket); + } + + LWPROBE(WriteToSocket, Proxy->PeerNodeId, r, packets, PacketsWrittenToSocket, BytesUnwritten, GetWriteBlockedTotal(), (SOCKET)*Socket); + } else if (-r != EAGAIN && -r != EWOULDBLOCK) { + const TString message = r == 0 ? "connection closed by peer" + : err ? err + : Sprintf("socket: %s", strerror(-r)); + LOG_NOTICE_NET(Proxy->PeerNodeId, "%s", message.data()); + if (written) { + Proxy->Metrics->AddTotalBytesWritten(written); + } + return ReestablishConnectionWithHandshake(r == 0 ? 
TDisconnectReason::EndOfStream() : TDisconnectReason::FromErrno(-r)); + } else { + // we have to do some hack for secure socket -- mark the packet as 'tried writing' + if (Params.Encryption) { + Y_VERIFY(SendQueuePos != SendQueue.end()); + SendQueuePos->MarkTriedWriting(); // do not try to replace buffer under SSL + } + + // we have received EAGAIN error code, this means that we can't issue more data until we have received + // TEvPollerReadyWrite event from poller; set up flag meaning this and wait for that event + Y_VERIFY(!ReceiveContext->WriteBlockedByFullSendBuffer); + ReceiveContext->WriteBlockedByFullSendBuffer = true; + WriteBlockedCycles = GetCycleCountFast(); + LWPROBE(BlockedWrite, Proxy->PeerNodeId, SendQueue.size(), written); + LOG_DEBUG_IC_SESSION("ICS18", "hit send buffer limit"); + + if (PollerToken) { + if (Params.Encryption) { + auto *secure = static_cast<NInterconnect::TSecureSocket*>(Socket.Get()); + PollerToken->Request(secure->WantRead(), secure->WantWrite()); + } else { + PollerToken->Request(false, true); + } + } + } + } + } + if (written) { + Proxy->Metrics->AddTotalBytesWritten(written); + } + } + + void TInterconnectSessionTCP::SetForcePacketTimestamp(TDuration period) { + if (period != TDuration::Max()) { + const TInstant when = TActivationContext::Now() + period; + if (when < ForcePacketTimestamp) { + ForcePacketTimestamp = when; + ScheduleFlush(); + } + } + } + + void TInterconnectSessionTCP::ScheduleFlush() { + if (FlushSchedule.empty() || ForcePacketTimestamp < FlushSchedule.top()) { + Schedule(ForcePacketTimestamp - TActivationContext::Now(), new TEvFlush); + FlushSchedule.push(ForcePacketTimestamp); + MaxFlushSchedule = Max(MaxFlushSchedule, FlushSchedule.size()); + ++FlushEventsScheduled; + } + } + + void TInterconnectSessionTCP::HandleFlush() { + const TInstant now = TActivationContext::Now(); + while (FlushSchedule && now >= FlushSchedule.top()) { + FlushSchedule.pop(); + } + IssuePingRequest(); + if (Socket) { + if (now >= 
ForcePacketTimestamp) { + ++ConfirmPacketsForcedByTimeout; + ++FlushEventsProcessed; + MakePacket(false); // just generate confirmation packet if we have preconditions for this + } else if (ForcePacketTimestamp != TInstant::Max()) { + ScheduleFlush(); + } + } + } + + void TInterconnectSessionTCP::ResetFlushLogic() { + ForcePacketTimestamp = TInstant::Max(); + UnconfirmedBytes = 0; + const TDuration ping = Proxy->Common->Settings.PingPeriod; + if (ping != TDuration::Zero() && !NumEventsInReadyChannels) { + SetForcePacketTimestamp(ping); + } + } + + void TInterconnectSessionTCP::TrimSendQueueCache() { + static constexpr size_t maxItems = 32; + static constexpr size_t trimThreshold = maxItems * 2; + if (SendQueueCache.size() >= trimThreshold) { + auto it = SendQueueCache.end(); + for (size_t n = SendQueueCache.size() - maxItems; n; --n) { + --it; + } + + auto ev = std::make_unique<TEvFreeItems>(); + ev->Items.splice(ev->Items.end(), SendQueueCache, it, SendQueueCache.end()); + ev->NumBytes = ev->Items.size() * sizeof(TTcpPacketOutTask); + if (ev->GetInLineForDestruction(Proxy->Common)) { + Send(Proxy->Common->DestructorId, ev.release()); + } + } + } + + ui64 TInterconnectSessionTCP::MakePacket(bool data, TMaybe<ui64> pingMask) { + Y_VERIFY(Socket); + + TSendQueue::iterator packet; + if (SendQueueCache) { + // we have entries in cache, take one and move it to the end of SendQueue + packet = SendQueueCache.begin(); + SendQueue.splice(SendQueue.end(), SendQueueCache, packet); + packet->Reuse(); // reset packet to initial state + } else { + // we have to allocate new packet, so just do it + LWPROBE_IF_TOO_LONG(SlowICAllocPacketBuffer, Proxy->PeerNodeId, ms) { + packet = SendQueue.emplace(SendQueue.end(), Params); + } + } + + // update send queue position + if (SendQueuePos == SendQueue.end()) { + SendQueuePos = packet; // start sending this packet if we are not sending anything for now + } + + ui64 serial = 0; + + if (data) { + // generate serial for this data packet + 
serial = ++OutputCounter; + + // fill the data packet + Y_VERIFY(NumEventsInReadyChannels); + LWPROBE_IF_TOO_LONG(SlowICFillSendingBuffer, Proxy->PeerNodeId, ms) { + FillSendingBuffer(*packet, serial); + } + Y_VERIFY(!packet->IsEmpty()); + + InflightDataAmount += packet->GetDataSize(); + Proxy->Metrics->AddInflightDataAmount(packet->GetDataSize()); + if (InflightDataAmount > GetTotalInflightAmountOfData()) { + Proxy->Metrics->IncInflyLimitReach(); + } + + if (AtomicGet(ReceiveContext->ControlPacketId) == 0) { + AtomicSet(ReceiveContext->ControlPacketSendTimer, GetCycleCountFast()); + AtomicSet(ReceiveContext->ControlPacketId, OutputCounter); + } + + // update payload activity timer + LastPayloadActivityTimestamp = TActivationContext::Now(); + } else if (pingMask) { + serial = *pingMask; + + // make this packet a priority one + if (SendQueuePos != packet) { + Y_VERIFY(SendQueuePos != SendQueue.end()); + if (SendQueuePos->IsAtBegin()) { + // insert this packet just before the next being sent and step back + SendQueue.splice(SendQueuePos, SendQueue, packet); + --SendQueuePos; + Y_VERIFY(SendQueuePos == packet); + } else { + // current packet is already being sent, so move new packet just after it + SendQueue.splice(std::next(SendQueuePos), SendQueue, packet); + } + } + } + + const ui64 lastInputSerial = ReceiveContext->GetLastProcessedPacketSerial(); + packet->SetMetadata(serial, lastInputSerial); + packet->Sign(); + + // count number of bytes pending for write + ui64 packetSize = (Params.UseModernFrame ? 
sizeof(TTcpPacketHeader_v2) : sizeof(TTcpPacketHeader_v1)) + packet->GetDataSize(); + BytesUnwritten += packetSize; + + LOG_DEBUG_IC_SESSION("ICS22", "outgoing packet Serial# %" PRIu64 " Confirm# %" PRIu64 " DataSize# %zu" + " InflightDataAmount# %" PRIu64 " BytesUnwritten# %" PRIu64, serial, lastInputSerial, packet->GetDataSize(), + InflightDataAmount, BytesUnwritten); + + // reset forced packet sending timestamp as we have confirmed all received data + ResetFlushLogic(); + + ++PacketsGenerated; + LWTRACK(PacketGenerated, packet->Orbit, Proxy->PeerNodeId, BytesUnwritten, InflightDataAmount, PacketsGenerated, packetSize); + + if (!data) { + WriteData(); + } + + return packetSize; + } + + bool TInterconnectSessionTCP::DropConfirmed(ui64 confirm) { + LOG_DEBUG_IC_SESSION("ICS23", "confirm count: %" PRIu64, confirm); + + Y_VERIFY(LastConfirmed <= confirm && confirm <= LastSentSerial && LastSentSerial <= OutputCounter, + "%s confirm# %" PRIu64 " LastConfirmed# %" PRIu64 " OutputCounter# %" PRIu64 " LastSentSerial# %" PRIu64, + LogPrefix.data(), confirm, LastConfirmed, OutputCounter, LastSentSerial); + LastConfirmed = confirm; + + ui64 droppedDataAmount = 0; + ui32 numDropped = 0; + + // drop confirmed packets; this also includes any auxiliary packets as their serial is set to zero, effectively + // making Serial <= confirm true + TSendQueue::iterator it; + ui64 lastDroppedSerial = 0; + for (it = SendQueue.begin(); it != SendQueuePos && it->Confirmed(confirm); ++it) { + if (!it->IsEmpty()) { + lastDroppedSerial = it->GetSerial(); + } + droppedDataAmount += it->GetDataSize(); + ++numDropped; + } + SendQueueCache.splice(SendQueueCache.begin(), SendQueue, SendQueue.begin(), it); + TrimSendQueueCache(); + ChannelScheduler->ForEach([&](TEventOutputChannel& channel) { + channel.DropConfirmed(lastDroppedSerial); + }); + + const ui64 current = InflightDataAmount; + const ui64 limit = GetTotalInflightAmountOfData(); + const bool unblockedSomething = current >= limit && current < 
limit + droppedDataAmount; + + PacketsConfirmed += numDropped; + InflightDataAmount -= droppedDataAmount; + Proxy->Metrics->SubInflightDataAmount(droppedDataAmount); + LWPROBE(DropConfirmed, Proxy->PeerNodeId, droppedDataAmount, InflightDataAmount); + + LOG_DEBUG_IC_SESSION("ICS24", "exit InflightDataAmount: %" PRIu64 " bytes droppedDataAmount: %" PRIu64 " bytes" + " dropped %" PRIu32 " packets", InflightDataAmount, droppedDataAmount, numDropped); + + Pool->Trim(); // send any unsent free requests + + return unblockedSomething; + } + + void TInterconnectSessionTCP::FillSendingBuffer(TTcpPacketOutTask& task, ui64 serial) { + ui32 bytesGenerated = 0; + + Y_VERIFY(NumEventsInReadyChannels); + while (NumEventsInReadyChannels) { + TEventOutputChannel *channel = ChannelScheduler->PickChannelWithLeastConsumedWeight(); + Y_VERIFY_DEBUG(!channel->IsEmpty()); + + // generate some data within this channel + const ui64 netBefore = channel->GetBufferedAmountOfData(); + ui64 gross = 0; + const bool eventDone = channel->FeedBuf(task, serial, &gross); + channel->UnaccountedTraffic += gross; + const ui64 netAfter = channel->GetBufferedAmountOfData(); + Y_VERIFY_DEBUG(netAfter <= netBefore); // net amount should shrink + const ui64 net = netBefore - netAfter; // number of net bytes serialized + + // adjust metrics for local and global queue size + TotalOutputQueueSize -= net; + Proxy->Metrics->SubOutputBuffersTotalSize(net); + bytesGenerated += gross; + Y_VERIFY_DEBUG(!!net == !!gross && gross >= net, "net# %" PRIu64 " gross# %" PRIu64, net, gross); + + // return it back to queue or delete, depending on whether this channel is still working or not + ChannelScheduler->FinishPick(gross, EqualizeCounter); + + // update some stats if the packet was fully serialized + if (eventDone) { + ++MessagesWrittenToBuffer; + + Y_VERIFY(NumEventsInReadyChannels); + --NumEventsInReadyChannels; + + if (!NumEventsInReadyChannels) { + SetOutputStuckFlag(false); + } + } + + if (!gross) { // no progress 
-- almost full packet buffer + break; + } + } + + LWTRACK(FillSendingBuffer, task.Orbit, Proxy->PeerNodeId, bytesGenerated, NumEventsInReadyChannels, WriteBlockedTotal); + Y_VERIFY(bytesGenerated); // ensure we are not stalled in serialization + } + + ui32 TInterconnectSessionTCP::CalculateQueueUtilization() { + SwitchStuckPeriod(); + ui64 sumBusy = 0, sumPeriod = 0; + for (auto iter = OutputQueueUtilization.begin(); iter != OutputQueueUtilization.end() - 1; ++iter) { + sumBusy += iter->first; + sumPeriod += iter->second; + } + return sumBusy * 1000000 / sumPeriod; + } + + void TInterconnectSessionTCP::SendUpdateToWhiteboard(bool connected) { + const ui32 utilization = Socket ? CalculateQueueUtilization() : 0; + + if (const auto& callback = Proxy->Common->UpdateWhiteboard) { + enum class EFlag { + GREEN, + YELLOW, + ORANGE, + RED, + }; + EFlag flagState = EFlag::RED; + + if (Socket) { + flagState = EFlag::GREEN; + + do { + auto lastInputDelay = TActivationContext::Now() - LastInputActivityTimestamp; + if (lastInputDelay * 4 >= GetDeadPeerTimeout() * 3) { + flagState = EFlag::ORANGE; + break; + } else if (lastInputDelay * 2 >= GetDeadPeerTimeout()) { + flagState = EFlag::YELLOW; + } + + // check utilization + if (utilization > 875000) { // 7/8 + flagState = EFlag::ORANGE; + break; + } else if (utilization > 500000) { // 1/2 + flagState = EFlag::YELLOW; + } + } while (false); + } + + callback(Proxy->Metrics->GetHumanFriendlyPeerHostName(), + connected, + flagState == EFlag::GREEN, + flagState == EFlag::YELLOW, + flagState == EFlag::ORANGE, + flagState == EFlag::RED, + TlsActivationContext->ExecutorThread.ActorSystem); + } + + if (connected) { + Schedule(TDuration::Seconds(1), new TEvents::TEvWakeup); + } + } + + void TInterconnectSessionTCP::SetOutputStuckFlag(bool state) { + if (OutputStuckFlag == state) + return; + + if (OutputQueueUtilization.Size() == 0) + return; + + auto& lastpair = OutputQueueUtilization.Last(); + if (state) + lastpair.first -= 
GetCycleCountFast(); + else + lastpair.first += GetCycleCountFast(); + + OutputStuckFlag = state; + } + + void TInterconnectSessionTCP::SwitchStuckPeriod() { + auto now = GetCycleCountFast(); + if (OutputQueueUtilization.Size() != 0) { + auto& lastpair = OutputQueueUtilization.Last(); + lastpair.second = now - lastpair.second; + if (OutputStuckFlag) + lastpair.first += now; + } + + OutputQueueUtilization.Push(std::pair<ui64, ui64>(0, now)); + if (OutputStuckFlag) + OutputQueueUtilization.Last().first -= now; + } + + TDuration TInterconnectSessionTCP::GetDeadPeerTimeout() const { + return Coalesce(Proxy->Common->Settings.DeadPeer, DEFAULT_DEADPEER_TIMEOUT); + } + + TDuration TInterconnectSessionTCP::GetCloseOnIdleTimeout() const { + return Proxy->Common->Settings.CloseOnIdle; + } + + TDuration TInterconnectSessionTCP::GetLostConnectionTimeout() const { + return Coalesce(Proxy->Common->Settings.LostConnection, DEFAULT_LOST_CONNECTION_TIMEOUT); + } + + ui32 TInterconnectSessionTCP::GetTotalInflightAmountOfData() const { + return Coalesce(Proxy->Common->Settings.TotalInflightAmountOfData, DEFAULT_TOTAL_INFLIGHT_DATA); + } + + ui64 TInterconnectSessionTCP::GetMaxCyclesPerEvent() const { + return DurationToCycles(TDuration::MicroSeconds(50)); + } + + void TInterconnectSessionTCP::IssuePingRequest() { + const TInstant now = TActivationContext::Now(); + if (now >= LastPingTimestamp + PingPeriodicity) { + LOG_DEBUG_IC_SESSION("ICS22", "Issuing ping request"); + if (Socket) { + MakePacket(false, GetCycleCountFast() | TTcpPacketBuf::PingRequestMask); + } + if (Socket) { + MakePacket(false, TInstant::Now().MicroSeconds() | TTcpPacketBuf::ClockMask); + } + LastPingTimestamp = now; + } + } + + void TInterconnectSessionTCP::Handle(TEvProcessPingRequest::TPtr ev) { + if (Socket) { + MakePacket(false, ev->Get()->Payload | TTcpPacketBuf::PingResponseMask); + } + } + + void TInterconnectSessionTCP::GenerateHttpInfo(TStringStream& str) { + HTML(str) { + DIV_CLASS("panel panel-info") { 
+ DIV_CLASS("panel-heading") { + str << "Session"; + } + DIV_CLASS("panel-body") { + TABLE_CLASS("table") { + TABLEHEAD() { + TABLER() { + TABLEH() { + str << "Sensor"; + } + TABLEH() { + str << "Value"; + } + } + } + TABLEBODY() { + TABLER() { + TABLED() { + str << "Encryption"; + } + TABLED() { + str << (Params.Encryption ? "<font color=green>Enabled</font>" : "<font color=red>Disabled</font>"); + } + } + if (auto *x = dynamic_cast<NInterconnect::TSecureSocket*>(Socket.Get())) { + TABLER() { + TABLED() { + str << "Cipher name"; + } + TABLED() { + str << x->GetCipherName(); + } + } + TABLER() { + TABLED() { + str << "Cipher bits"; + } + TABLED() { + str << x->GetCipherBits(); + } + } + TABLER() { + TABLED() { + str << "Protocol"; + } + TABLED() { + str << x->GetProtocolName(); + } + } + TABLER() { + TABLED() { + str << "Peer CN"; + } + TABLED() { + str << x->GetPeerCommonName(); + } + } + } + TABLER() { + TABLED() { str << "AuthOnly CN"; } + TABLED() { str << Params.AuthCN; } + } + TABLER() { + TABLED() { + str << "Local scope id"; + } + TABLED() { + str << ScopeIdToString(Proxy->Common->LocalScopeId); + } + } + TABLER() { + TABLED() { + str << "Peer scope id"; + } + TABLED() { + str << ScopeIdToString(Params.PeerScopeId); + } + } + TABLER() { + TABLED() { + str << "This page generated at"; + } + TABLED() { + str << TActivationContext::Now() << " / " << Now(); + } + } + TABLER() { + TABLED() { + str << "SelfID"; + } + TABLED() { + str << SelfId().ToString(); + } + } + TABLER() { + TABLED() { str << "Frame version/Checksum"; } + TABLED() { str << (!Params.UseModernFrame ? "v1/crc32c" : Params.Encryption ? 
"v2/none" : "v2/crc32c"); } + } +#define MON_VAR(NAME) \ + TABLER() { \ + TABLED() { \ + str << #NAME; \ + } \ + TABLED() { \ + str << NAME; \ + } \ + } + + MON_VAR(Created) + MON_VAR(NewConnectionSet) + MON_VAR(ReceiverId) + MON_VAR(MessagesGot) + MON_VAR(MessagesWrittenToBuffer) + MON_VAR(PacketsGenerated) + MON_VAR(PacketsWrittenToSocket) + MON_VAR(PacketsConfirmed) + MON_VAR(AtomicGet(ReceiveContext->PacketsReadFromSocket)) + MON_VAR(ConfirmPacketsForcedBySize) + MON_VAR(ConfirmPacketsForcedByTimeout) + + TABLER() { + TABLED() { + str << "Virtual self ID"; + } + TABLED() { + str << Proxy->SessionVirtualId.ToString(); + } + } + TABLER() { + TABLED() { + str << "Virtual peer ID"; + } + TABLED() { + str << Proxy->RemoteSessionVirtualId.ToString(); + } + } + TABLER() { + TABLED() { + str << "Socket"; + } + TABLED() { + str << (Socket ? i64(*Socket) : -1); + } + } + + ui32 unsentQueueSize = Socket ? Socket->GetUnsentQueueSize() : 0; + + MON_VAR(OutputStuckFlag) + MON_VAR(SendQueue.size()) + MON_VAR(SendQueueCache.size()) + MON_VAR(NumEventsInReadyChannels) + MON_VAR(TotalOutputQueueSize) + MON_VAR(BytesUnwritten) + MON_VAR(InflightDataAmount) + MON_VAR(unsentQueueSize) + MON_VAR(SendBufferSize) + MON_VAR(LastInputActivityTimestamp) + MON_VAR(LastPayloadActivityTimestamp) + MON_VAR(LastHandshakeDone) + MON_VAR(OutputCounter) + MON_VAR(LastSentSerial) + MON_VAR(ReceiveContext->GetLastProcessedPacketSerial()) + MON_VAR(LastConfirmed) + MON_VAR(FlushSchedule.size()) + MON_VAR(MaxFlushSchedule) + MON_VAR(FlushEventsScheduled) + MON_VAR(FlushEventsProcessed) + + TString clockSkew; + i64 x = GetClockSkew(); + if (x < 0) { + clockSkew = Sprintf("-%s", TDuration::MicroSeconds(-x).ToString().data()); + } else { + clockSkew = Sprintf("+%s", TDuration::MicroSeconds(x).ToString().data()); + } + + MON_VAR(LastPingTimestamp) + MON_VAR(GetPingRTT()) + MON_VAR(clockSkew) + + MON_VAR(GetDeadPeerTimeout()) + MON_VAR(GetTotalInflightAmountOfData()) + MON_VAR(GetCloseOnIdleTimeout()) + 
MON_VAR(Subscribers.size()) + } + } + } + } + } + } + + void CreateSessionKillingActor(TInterconnectProxyCommon::TPtr common) { + TlsActivationContext->ExecutorThread.ActorSystem->Register(new TInterconnectSessionKiller(common)); + } +} diff --git a/library/cpp/actors/interconnect/interconnect_tcp_session.h b/library/cpp/actors/interconnect/interconnect_tcp_session.h new file mode 100644 index 0000000000..7fc00dbcc5 --- /dev/null +++ b/library/cpp/actors/interconnect/interconnect_tcp_session.h @@ -0,0 +1,565 @@ +#pragma once + +#include <library/cpp/actors/core/hfunc.h> +#include <library/cpp/actors/core/event_pb.h> +#include <library/cpp/actors/core/events.h> +#include <library/cpp/actors/core/log.h> +#include <library/cpp/actors/helpers/mon_histogram_helper.h> +#include <library/cpp/actors/protos/services_common.pb.h> +#include <library/cpp/actors/util/datetime.h> +#include <library/cpp/actors/util/rope.h> +#include <library/cpp/actors/util/funnel_queue.h> +#include <library/cpp/actors/util/recentwnd.h> +#include <library/cpp/monlib/dynamic_counters/counters.h> +#include <library/cpp/actors/core/actor_bootstrapped.h> + +#include <util/generic/queue.h> +#include <util/generic/deque.h> +#include <util/datetime/cputimer.h> + +#include "interconnect_impl.h" +#include "poller_tcp.h" +#include "poller_actor.h" +#include "interconnect_channel.h" +#include "logging.h" +#include "watchdog_timer.h" +#include "event_holder_pool.h" +#include "channel_scheduler.h" + +#include <unordered_set> +#include <unordered_map> + +namespace NActors { + class TSlowPathChecker { + using TTraceCallback = std::function<void(double)>; + TTraceCallback Callback; + const NHPTimer::STime Start; + + public: + TSlowPathChecker(TTraceCallback&& callback) + : Callback(std::move(callback)) + , Start(GetCycleCountFast()) + { + } + + ~TSlowPathChecker() { + const NHPTimer::STime end = GetCycleCountFast(); + const NHPTimer::STime elapsed = end - Start; + if (elapsed > 1000000) { + 
Callback(NHPTimer::GetSeconds(elapsed) * 1000); + } + } + + operator bool() const { + return false; + } + }; + +#define LWPROBE_IF_TOO_LONG(...) \ + if (auto __x = TSlowPathChecker{[&](double ms) { LWPROBE(__VA_ARGS__); }}) \ + ; \ + else + + class TTimeLimit { + public: + TTimeLimit(ui64 limitInCycles) + : UpperLimit(limitInCycles == 0 ? 0 : GetCycleCountFast() + limitInCycles) + { + } + + TTimeLimit(ui64 startTS, ui64 limitInCycles) + : UpperLimit(limitInCycles == 0 ? 0 : startTS + limitInCycles) + { + } + + bool CheckExceeded() { + return UpperLimit != 0 && GetCycleCountFast() > UpperLimit; + } + + const ui64 UpperLimit; + }; + + static constexpr TDuration DEFAULT_DEADPEER_TIMEOUT = TDuration::Seconds(10); + static constexpr TDuration DEFAULT_LOST_CONNECTION_TIMEOUT = TDuration::Seconds(10); + static constexpr ui32 DEFAULT_MAX_INFLIGHT_DATA = 10240 * 1024; + static constexpr ui32 DEFAULT_TOTAL_INFLIGHT_DATA = 4 * 10240 * 1024; + + class TInterconnectProxyTCP; + + enum class EUpdateState : ui8 { + NONE, // no updates generated by input session yet + INFLIGHT, // one update is inflight, and no more pending + INFLIGHT_AND_PENDING, // one update is inflight, and one is pending + CONFIRMING, // confirmation inflight + }; + + struct TReceiveContext: public TAtomicRefCount<TReceiveContext> { + /* All invokations to these fields should be thread-safe */ + + ui64 ControlPacketSendTimer = 0; + ui64 ControlPacketId = 0; + + // number of packets received by input session + TAtomic PacketsReadFromSocket = 0; + TAtomic DataPacketsReadFromSocket = 0; + + // last processed packet by input session + std::atomic_uint64_t LastProcessedPacketSerial = 0; + static constexpr uint64_t LastProcessedPacketSerialLockBit = uint64_t(1) << 63; + + // for hardened checks + TAtomic NumInputSessions = 0; + + NHPTimer::STime StartTime; + + std::atomic<ui64> PingRTT_us = 0; + std::atomic<i64> ClockSkew_us = 0; + + std::atomic<EUpdateState> UpdateState; + 
static_assert(std::atomic<EUpdateState>::is_always_lock_free); + + bool WriteBlockedByFullSendBuffer = false; + bool ReadPending = false; + + std::array<TRope, 16> ChannelArray; + std::unordered_map<ui16, TRope> ChannelMap; + + TReceiveContext() { + GetTimeFast(&StartTime); + } + + // returns false if sessions needs to be terminated and packet not to be processed + bool AdvanceLastProcessedPacketSerial() { + for (;;) { + uint64_t value = LastProcessedPacketSerial.load(); + if (value & LastProcessedPacketSerialLockBit) { + return false; + } + if (LastProcessedPacketSerial.compare_exchange_weak(value, value + 1)) { + return true; + } + } + } + + ui64 LockLastProcessedPacketSerial() { + for (;;) { + uint64_t value = LastProcessedPacketSerial.load(); + if (value & LastProcessedPacketSerialLockBit) { + return value & ~LastProcessedPacketSerialLockBit; + } + if (LastProcessedPacketSerial.compare_exchange_strong(value, value | LastProcessedPacketSerialLockBit)) { + return value; + } + } + } + + void UnlockLastProcessedPacketSerial() { + LastProcessedPacketSerial = LastProcessedPacketSerial.load() & ~LastProcessedPacketSerialLockBit; + } + + ui64 GetLastProcessedPacketSerial() { + return LastProcessedPacketSerial.load() & ~LastProcessedPacketSerialLockBit; + } + }; + + class TInputSessionTCP + : public TActorBootstrapped<TInputSessionTCP> + , public TInterconnectLoggingBase + { + enum { + EvCheckDeadPeer = EventSpaceBegin(TEvents::ES_PRIVATE), + EvResumeReceiveData, + }; + + struct TEvCheckDeadPeer : TEventLocal<TEvCheckDeadPeer, EvCheckDeadPeer> {}; + struct TEvResumeReceiveData : TEventLocal<TEvResumeReceiveData, EvResumeReceiveData> {}; + + public: + static constexpr EActivityType ActorActivityType() { + return INTERCONNECT_SESSION_TCP; + } + + TInputSessionTCP(const TActorId& sessionId, + TIntrusivePtr<NInterconnect::TStreamSocket> socket, + TIntrusivePtr<TReceiveContext> context, + TInterconnectProxyCommon::TPtr common, + std::shared_ptr<IInterconnectMetrics> metrics, 
+ ui32 nodeId, + ui64 lastConfirmed, + TDuration deadPeerTimeout, + TSessionParams params); + + private: + friend class TActorBootstrapped<TInputSessionTCP>; + + void Bootstrap(); + + STRICT_STFUNC(WorkingState, + cFunc(TEvents::TSystem::PoisonPill, PassAway) + hFunc(TEvPollerReady, Handle) + hFunc(TEvPollerRegisterResult, Handle) + cFunc(EvResumeReceiveData, HandleResumeReceiveData) + cFunc(TEvInterconnect::TEvCloseInputSession::EventType, CloseInputSession) + cFunc(EvCheckDeadPeer, HandleCheckDeadPeer) + cFunc(TEvConfirmUpdate::EventType, HandleConfirmUpdate) + ) + + private: + TRope IncomingData; + + const TActorId SessionId; + TIntrusivePtr<NInterconnect::TStreamSocket> Socket; + TPollerToken::TPtr PollerToken; + TIntrusivePtr<TReceiveContext> Context; + TInterconnectProxyCommon::TPtr Common; + const ui32 NodeId; + const TSessionParams Params; + + // header we are currently processing (parsed from the stream) + union { + TTcpPacketHeader_v1 v1; + TTcpPacketHeader_v2 v2; + char Data[1]; + } Header; + ui64 HeaderConfirm, HeaderSerial; + + size_t PayloadSize; + ui32 ChecksumExpected, Checksum; + bool IgnorePayload; + TRope Payload; + enum class EState { + HEADER, + PAYLOAD, + }; + EState State = EState::HEADER; + + THolder<TEvUpdateFromInputSession> UpdateFromInputSession; + + ui64 ConfirmedByInput; + + std::shared_ptr<IInterconnectMetrics> Metrics; + + bool CloseInputSessionRequested = false; + + void CloseInputSession(); + + void Handle(TEvPollerReady::TPtr ev); + void Handle(TEvPollerRegisterResult::TPtr ev); + void HandleResumeReceiveData(); + void HandleConfirmUpdate(); + void ReceiveData(); + void ProcessHeader(size_t headerLen); + void ProcessPayload(ui64& numDataBytes); + void ProcessEvent(TRope& data, TEventDescr& descr); + bool ReadMore(); + + void ReestablishConnection(TDisconnectReason reason); + void DestroySession(TDisconnectReason reason); + + TDeque<TIntrusivePtr<TRopeAlignedBuffer>> Buffers; + + static constexpr size_t NumPreallocatedBuffers = 16; 
+ void PreallocateBuffers(); + + inline ui64 GetMaxCyclesPerEvent() const { + return DurationToCycles(TDuration::MicroSeconds(500)); + } + + const TDuration DeadPeerTimeout; + TInstant LastReceiveTimestamp; + void HandleCheckDeadPeer(); + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // pinger logic + + bool NewPingProtocol = false; + TDeque<TDuration> PingQ; // last N ping samples + TDeque<i64> SkewQ; // last N calculated clock skew samples + + void HandlePingResponse(TDuration passed); + void HandleClock(TInstant clock); + }; + + class TInterconnectSessionTCP + : public TActor<TInterconnectSessionTCP> + , public TInterconnectLoggingBase + { + enum { + EvCheckCloseOnIdle = EventSpaceBegin(TEvents::ES_PRIVATE), + EvCheckLostConnection, + EvRam, + EvTerminate, + EvFreeItems, + }; + + struct TEvCheckCloseOnIdle : TEventLocal<TEvCheckCloseOnIdle, EvCheckCloseOnIdle> {}; + struct TEvCheckLostConnection : TEventLocal<TEvCheckLostConnection, EvCheckLostConnection> {}; + + struct TEvRam : TEventLocal<TEvRam, EvRam> { + const bool Batching; + TEvRam(bool batching) : Batching(batching) {} + }; + + struct TEvTerminate : TEventLocal<TEvTerminate, EvTerminate> { + TDisconnectReason Reason; + + TEvTerminate(TDisconnectReason reason) + : Reason(std::move(reason)) + {} + }; + + const TInstant Created; + TInstant NewConnectionSet; + ui64 MessagesGot = 0; + ui64 MessagesWrittenToBuffer = 0; + ui64 PacketsGenerated = 0; + ui64 PacketsWrittenToSocket = 0; + ui64 PacketsConfirmed = 0; + + public: + static constexpr EActivityType ActorActivityType() { + return INTERCONNECT_SESSION_TCP; + } + + TInterconnectSessionTCP(TInterconnectProxyTCP* const proxy, TSessionParams params); + ~TInterconnectSessionTCP(); + + void Init(); + void CloseInputSession(); + + static TEvTerminate* NewEvTerminate(TDisconnectReason reason) { + return new TEvTerminate(std::move(reason)); + } + + TDuration GetPingRTT() const { + return 
TDuration::MicroSeconds(ReceiveContext->PingRTT_us); + } + + i64 GetClockSkew() const { + return ReceiveContext->ClockSkew_us; + } + + private: + friend class TInterconnectProxyTCP; + + void Handle(TEvTerminate::TPtr& ev); + void HandlePoison(); + void Terminate(TDisconnectReason reason); + void PassAway() override; + + void Forward(STATEFN_SIG); + void Subscribe(STATEFN_SIG); + void Unsubscribe(STATEFN_SIG); + + STRICT_STFUNC(StateFunc, + fFunc(TEvInterconnect::EvForward, Forward) + cFunc(TEvents::TEvPoisonPill::EventType, HandlePoison) + fFunc(TEvInterconnect::TEvConnectNode::EventType, Subscribe) + fFunc(TEvents::TEvSubscribe::EventType, Subscribe) + fFunc(TEvents::TEvUnsubscribe::EventType, Unsubscribe) + cFunc(TEvFlush::EventType, HandleFlush) + hFunc(TEvPollerReady, Handle) + hFunc(TEvPollerRegisterResult, Handle) + hFunc(TEvUpdateFromInputSession, Handle) + hFunc(TEvRam, HandleRam) + hFunc(TEvCheckCloseOnIdle, CloseOnIdleWatchdog) + hFunc(TEvCheckLostConnection, LostConnectionWatchdog) + cFunc(TEvents::TSystem::Wakeup, SendUpdateToWhiteboard) + hFunc(TEvSocketDisconnect, OnDisconnect) + hFunc(TEvTerminate, Handle) + hFunc(TEvProcessPingRequest, Handle) + ) + + void Handle(TEvUpdateFromInputSession::TPtr& ev); + + void OnDisconnect(TEvSocketDisconnect::TPtr& ev); + + THolder<TEvHandshakeAck> ProcessHandshakeRequest(TEvHandshakeAsk::TPtr& ev); + void SetNewConnection(TEvHandshakeDone::TPtr& ev); + + TEvRam* RamInQueue = nullptr; + ui64 RamStartedCycles = 0; + void HandleRam(TEvRam::TPtr& ev); + void GenerateTraffic(); + + void SendUpdateToWhiteboard(bool connected = true); + ui32 CalculateQueueUtilization(); + + void Handle(TEvPollerReady::TPtr& ev); + void Handle(TEvPollerRegisterResult::TPtr ev); + void WriteData(); + + ui64 MakePacket(bool data, TMaybe<ui64> pingMask = {}); + void FillSendingBuffer(TTcpPacketOutTask& packet, ui64 serial); + bool DropConfirmed(ui64 confirm); + void ShutdownSocket(TDisconnectReason reason); + + void StartHandshake(); + void 
ReestablishConnection(TEvHandshakeDone::TPtr&& ev, bool startHandshakeOnSessionClose, + TDisconnectReason reason); + void ReestablishConnectionWithHandshake(TDisconnectReason reason); + void ReestablishConnectionExecute(); + + TInterconnectProxyTCP* const Proxy; + + // various connection settings access + TDuration GetDeadPeerTimeout() const; + TDuration GetCloseOnIdleTimeout() const; + TDuration GetLostConnectionTimeout() const; + ui32 GetTotalInflightAmountOfData() const; + ui64 GetMaxCyclesPerEvent() const; + + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // pinger + + TInstant LastPingTimestamp; + static constexpr TDuration PingPeriodicity = TDuration::Seconds(1); + void IssuePingRequest(); + void Handle(TEvProcessPingRequest::TPtr ev); + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + TInstant LastInputActivityTimestamp; + TInstant LastPayloadActivityTimestamp; + TWatchdogTimer<TEvCheckCloseOnIdle> CloseOnIdleWatchdog; + TWatchdogTimer<TEvCheckLostConnection> LostConnectionWatchdog; + + void OnCloseOnIdleTimerHit() { + LOG_INFO_IC("ICS27", "CloseOnIdle timer hit, session terminated"); + Terminate(TDisconnectReason::CloseOnIdle()); + } + + void OnLostConnectionTimerHit() { + LOG_ERROR_IC("ICS28", "LostConnection timer hit, session terminated"); + Terminate(TDisconnectReason::LostConnection()); + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + const TSessionParams Params; + TMaybe<TEventHolderPool> Pool; + TMaybe<TChannelScheduler> ChannelScheduler; + ui64 TotalOutputQueueSize; + bool OutputStuckFlag; + TRecentWnd<std::pair<ui64, ui64>> OutputQueueUtilization; + size_t NumEventsInReadyChannels = 0; + + void SetOutputStuckFlag(bool state); + void SwitchStuckPeriod(); + + using TSendQueue = TList<TTcpPacketOutTask>; + TSendQueue SendQueue; + 
TSendQueue SendQueueCache; + TSendQueue::iterator SendQueuePos; + ui64 WriteBlockedCycles = 0; // start of current block period + TDuration WriteBlockedTotal; // total incremental duration that session has been blocked + ui64 BytesUnwritten = 0; + + void TrimSendQueueCache(); + + TDuration GetWriteBlockedTotal() const { + if (ReceiveContext->WriteBlockedByFullSendBuffer) { + double blockedUs = NHPTimer::GetSeconds(GetCycleCountFast() - WriteBlockedCycles) * 1000000.0; + return WriteBlockedTotal + TDuration::MicroSeconds(blockedUs); // append current blocking period if any + } else { + return WriteBlockedTotal; + } + } + + ui64 OutputCounter; + ui64 LastSentSerial = 0; + + TInstant LastHandshakeDone; + + TIntrusivePtr<NInterconnect::TStreamSocket> Socket; + TPollerToken::TPtr PollerToken; + ui32 SendBufferSize; + ui64 InflightDataAmount = 0; + + std::unordered_map<TActorId, ui64, TActorId::THash> Subscribers; + + // time at which we want to send confirmation packet even if there was no outgoing data + ui64 UnconfirmedBytes = 0; + TInstant ForcePacketTimestamp = TInstant::Max(); + TPriorityQueue<TInstant, TVector<TInstant>, std::greater<TInstant>> FlushSchedule; + size_t MaxFlushSchedule = 0; + ui64 FlushEventsScheduled = 0; + ui64 FlushEventsProcessed = 0; + + void SetForcePacketTimestamp(TDuration period); + void ScheduleFlush(); + void HandleFlush(); + void ResetFlushLogic(); + + void GenerateHttpInfo(TStringStream& str); + + TIntrusivePtr<TReceiveContext> ReceiveContext; + TActorId ReceiverId; + TDuration Ping; + + ui64 ConfirmPacketsForcedBySize = 0; + ui64 ConfirmPacketsForcedByTimeout = 0; + + ui64 LastConfirmed = 0; + + TEvHandshakeDone::TPtr PendingHandshakeDoneEvent; + bool StartHandshakeOnSessionClose = false; + + ui64 EqualizeCounter = 0; + }; + + class TInterconnectSessionKiller + : public TActorBootstrapped<TInterconnectSessionKiller> { + ui32 RepliesReceived = 0; + ui32 RepliesNumber = 0; + TActorId LargestSession = TActorId(); + ui64 MaxBufferSize = 
0; + TInterconnectProxyCommon::TPtr Common; + + public: + static constexpr EActivityType ActorActivityType() { + return INTERCONNECT_SESSION_KILLER; + } + + TInterconnectSessionKiller(TInterconnectProxyCommon::TPtr common) + : Common(common) + { + } + + void Bootstrap() { + auto sender = SelfId(); + const auto eventFabric = [&sender](const TActorId& recp) -> IEventHandle* { + auto ev = new TEvSessionBufferSizeRequest(); + return new IEventHandle(recp, sender, ev, IEventHandle::FlagTrackDelivery); + }; + RepliesNumber = TlsActivationContext->ExecutorThread.ActorSystem->BroadcastToProxies(eventFabric); + Become(&TInterconnectSessionKiller::StateFunc); + } + + STRICT_STFUNC(StateFunc, + hFunc(TEvSessionBufferSizeResponse, ProcessResponse) + cFunc(TEvents::TEvUndelivered::EventType, ProcessUndelivered) + ) + + void ProcessResponse(TEvSessionBufferSizeResponse::TPtr& ev) { + RepliesReceived++; + if (MaxBufferSize < ev->Get()->BufferSize) { + MaxBufferSize = ev->Get()->BufferSize; + LargestSession = ev->Get()->SessionID; + } + if (RepliesReceived == RepliesNumber) { + Send(LargestSession, new TEvents::TEvPoisonPill); + AtomicUnlock(&Common->StartedSessionKiller); + PassAway(); + } + } + + void ProcessUndelivered() { + RepliesReceived++; + } + }; + + void CreateSessionKillingActor(TInterconnectProxyCommon::TPtr common); + +} diff --git a/library/cpp/actors/interconnect/load.cpp b/library/cpp/actors/interconnect/load.cpp new file mode 100644 index 0000000000..2a8443da71 --- /dev/null +++ b/library/cpp/actors/interconnect/load.cpp @@ -0,0 +1,405 @@ +#include "load.h" +#include "interconnect_common.h" +#include "events_local.h" +#include <library/cpp/actors/protos/services_common.pb.h> +#include <library/cpp/actors/core/log.h> +#include <library/cpp/actors/core/actor_bootstrapped.h> +#include <library/cpp/actors/core/events.h> +#include <library/cpp/actors/core/hfunc.h> +#include <util/generic/queue.h> + +namespace NInterconnect { + using namespace NActors; + + enum { + 
EvGenerateMessages = EventSpaceBegin(TEvents::ES_PRIVATE), + EvPublishResults, + EvQueryTrafficCounter, + EvTrafficCounter, + }; + + struct TEvQueryTrafficCounter : TEventLocal<TEvQueryTrafficCounter, EvQueryTrafficCounter> {}; + + struct TEvTrafficCounter : TEventLocal<TEvTrafficCounter, EvTrafficCounter> { + std::shared_ptr<std::atomic_uint64_t> Traffic; + + TEvTrafficCounter(std::shared_ptr<std::atomic_uint64_t> traffic) + : Traffic(std::move(traffic)) + {} + }; + + class TLoadResponderActor : public TActor<TLoadResponderActor> { + STRICT_STFUNC(StateFunc, + HFunc(TEvLoadMessage, Handle); + CFunc(TEvents::TSystem::PoisonPill, Die); + ) + + void Handle(TEvLoadMessage::TPtr& ev, const TActorContext& ctx) { + ui64 bytes = ev->Get()->CalculateSerializedSizeCached(); + auto& record = ev->Get()->Record; + auto *hops = record.MutableHops(); + while (!hops->empty() && !hops->begin()->HasNextHop()) { + record.ClearPayload(); + ev->Get()->StripPayload(); + hops->erase(hops->begin()); + } + if (!hops->empty()) { + // extract actor id of the next hop + const TActorId nextHopActorId = ActorIdFromProto(hops->begin()->GetNextHop()); + hops->erase(hops->begin()); + + // forward message to next hop; preserve flags and cookie + auto msg = MakeHolder<TEvLoadMessage>(); + record.Swap(&msg->Record); + bytes += msg->CalculateSerializedSizeCached(); + ctx.Send(nextHopActorId, msg.Release(), ev->Flags, ev->Cookie); + } + *Traffic += bytes; + } + + public: + TLoadResponderActor(std::shared_ptr<std::atomic_uint64_t> traffic) + : TActor(&TLoadResponderActor::StateFunc) + , Traffic(std::move(traffic)) + {} + + static constexpr IActor::EActivityType ActorActivityType() { + return IActor::INTERCONNECT_LOAD_RESPONDER; + } + + private: + std::shared_ptr<std::atomic_uint64_t> Traffic; + }; + + class TLoadResponderMasterActor : public TActorBootstrapped<TLoadResponderMasterActor> { + TVector<TActorId> Slaves; + ui32 SlaveIndex = 0; + + STRICT_STFUNC(StateFunc, + HFunc(TEvLoadMessage, Handle); + 
HFunc(TEvQueryTrafficCounter, Handle); + CFunc(TEvents::TSystem::PoisonPill, Die); + ) + + void Handle(TEvLoadMessage::TPtr& ev, const TActorContext& ctx) { + ctx.ExecutorThread.ActorSystem->Send(ev->Forward(Slaves[SlaveIndex])); + if (++SlaveIndex == Slaves.size()) { + SlaveIndex = 0; + } + } + + void Handle(TEvQueryTrafficCounter::TPtr ev, const TActorContext& ctx) { + ctx.Send(ev->Sender, new TEvTrafficCounter(Traffic)); + } + + void Die(const TActorContext& ctx) override { + for (const TActorId& actorId : Slaves) { + ctx.Send(actorId, new TEvents::TEvPoisonPill); + } + TActorBootstrapped::Die(ctx); + } + + public: + static constexpr IActor::EActivityType ActorActivityType() { + return IActor::INTERCONNECT_LOAD_RESPONDER; + } + + TLoadResponderMasterActor() + {} + + void Bootstrap(const TActorContext& ctx) { + Become(&TLoadResponderMasterActor::StateFunc); + while (Slaves.size() < 10) { + Slaves.push_back(ctx.Register(new TLoadResponderActor(Traffic))); + } + } + + private: + std::shared_ptr<std::atomic_uint64_t> Traffic = std::make_shared<std::atomic_uint64_t>(); + }; + + IActor* CreateLoadResponderActor() { + return new TLoadResponderMasterActor(); + } + + TActorId MakeLoadResponderActorId(ui32 nodeId) { + char x[12] = {'I', 'C', 'L', 'o', 'a', 'd', 'R', 'e', 's', 'p', 'A', 'c'}; + return TActorId(nodeId, TStringBuf(x, 12)); + } + + class TLoadActor: public TActorBootstrapped<TLoadActor> { + struct TEvGenerateMessages : TEventLocal<TEvGenerateMessages, EvGenerateMessages> {}; + struct TEvPublishResults : TEventLocal<TEvPublishResults, EvPublishResults> {}; + + struct TMessageInfo { + TInstant SendTimestamp; + + TMessageInfo(const TInstant& sendTimestamp) + : SendTimestamp(sendTimestamp) + { + } + }; + + const TLoadParams Params; + TInstant NextMessageTimestamp; + THashMap<TString, TMessageInfo> InFly; + ui64 NextId = 1; + TVector<TActorId> Hops; + TActorId FirstHop; + ui64 NumDropped = 0; + std::shared_ptr<std::atomic_uint64_t> Traffic; + + public: + static 
constexpr IActor::EActivityType ActorActivityType() { + return IActor::INTERCONNECT_LOAD_ACTOR; + } + + TLoadActor(const TLoadParams& params) + : Params(params) + {} + + void Bootstrap(const TActorContext& ctx) { + Become(&TLoadActor::QueryTrafficCounter); + ctx.Send(MakeLoadResponderActorId(SelfId().NodeId()), new TEvQueryTrafficCounter); + } + + void Handle(TEvTrafficCounter::TPtr ev, const TActorContext& ctx) { + Traffic = std::move(ev->Get()->Traffic); + + for (const ui32 nodeId : Params.NodeHops) { + const TActorId& actorId = nodeId ? MakeLoadResponderActorId(nodeId) : TActorId(); + if (!FirstHop) { + FirstHop = actorId; + } else { + Hops.push_back(actorId); + } + } + + Hops.push_back(ctx.SelfID); + + Become(&TLoadActor::StateFunc); + NextMessageTimestamp = ctx.Now(); + ResetThroughput(NextMessageTimestamp, *Traffic); + GenerateMessages(ctx); + ctx.Schedule(Params.Duration, new TEvents::TEvPoisonPill); + SchedulePublishResults(ctx); + } + + void GenerateMessages(const TActorContext& ctx) { + while (InFly.size() < Params.InFlyMax && ctx.Now() >= NextMessageTimestamp) { + // generate payload + const ui32 size = Params.SizeMin + RandomNumber(Params.SizeMax - Params.SizeMin + 1); + + // generate message id + const ui64 cookie = NextId++; + TString id = Sprintf("%" PRIu64, cookie); + + // create message and send it to the first hop + THolder<TEvLoadMessage> ev; + if (Params.UseProtobufWithPayload && size) { + auto buffer = TRopeAlignedBuffer::Allocate(size); + memset(buffer->GetBuffer(), '*', size); + ev.Reset(new TEvLoadMessage(Hops, id, TRope(buffer))); + } else { + TString payload; + if (size) { + payload = TString::Uninitialized(size); + memset(payload.Detach(), '*', size); + } + ev.Reset(new TEvLoadMessage(Hops, id, payload ? 
&payload : nullptr)); + } + UpdateThroughput(ev->CalculateSerializedSizeCached()); + ctx.Send(FirstHop, ev.Release(), IEventHandle::MakeFlags(Params.Channel, 0), cookie); + + // register in the map + InFly.emplace(id, TMessageInfo(ctx.Now())); + + // put item into timeout queue + PutTimeoutQueueItem(ctx, id); + + const TDuration duration = TDuration::MicroSeconds(Params.IntervalMin.GetValue() + + RandomNumber(Params.IntervalMax.GetValue() - Params.IntervalMin.GetValue() + 1)); + if (Params.SoftLoad) { + NextMessageTimestamp += duration; + } else { + NextMessageTimestamp = ctx.Now() + duration; + } + } + + // schedule next generate messages call + if (NextMessageTimestamp > ctx.Now() && InFly.size() < Params.InFlyMax) { + ctx.Schedule(NextMessageTimestamp - ctx.Now(), new TEvGenerateMessages); + } + } + + void Handle(TEvLoadMessage::TPtr& ev, const TActorContext& ctx) { + const auto& record = ev->Get()->Record; + auto it = InFly.find(record.GetId()); + if (it != InFly.end()) { + // record message rtt + const TDuration rtt = ctx.Now() - it->second.SendTimestamp; + UpdateHistogram(ctx.Now(), rtt); + + // update throughput + UpdateThroughput(ev->Get()->CalculateSerializedSizeCached()); + + // remove message from the in fly map + InFly.erase(it); + } else { + ++NumDropped; + } + GenerateMessages(ctx); + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // RTT HISTOGRAM + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + const TDuration AggregationPeriod = TDuration::Seconds(20); + TDeque<std::pair<TInstant, TDuration>> Histogram; + + void UpdateHistogram(TInstant when, TDuration rtt) { + Histogram.emplace_back(when, rtt); + + const TInstant barrier = when - AggregationPeriod; + while (Histogram && Histogram.front().first < barrier) { + Histogram.pop_front(); + } + } + + 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // THROUGHPUT + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + TInstant ThroughputFirstSample = TInstant::Zero(); + ui64 ThroughputSamples = 0; + ui64 ThroughputBytes = 0; + ui64 TrafficAtBegin = 0; + + void UpdateThroughput(ui64 bytes) { + ThroughputBytes += bytes; + ++ThroughputSamples; + } + + void ResetThroughput(TInstant when, ui64 traffic) { + ThroughputFirstSample = when; + ThroughputSamples = 0; + ThroughputBytes = 0; + TrafficAtBegin = traffic; + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // TIMEOUT QUEUE OPERATIONS + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + TQueue<std::pair<TInstant, TString>> TimeoutQueue; + + void PutTimeoutQueueItem(const TActorContext& ctx, TString id) { + TimeoutQueue.emplace(ctx.Now() + TDuration::Minutes(1), std::move(id)); + if (TimeoutQueue.size() == 1) { + ScheduleWakeup(ctx); + } + } + + void ScheduleWakeup(const TActorContext& ctx) { + ctx.Schedule(TimeoutQueue.front().first - ctx.Now(), new TEvents::TEvWakeup); + } + + void HandleWakeup(const TActorContext& ctx) { + ui32 numDropped = 0; + + while (TimeoutQueue && TimeoutQueue.front().first <= ctx.Now()) { + numDropped += InFly.erase(TimeoutQueue.front().second); + TimeoutQueue.pop(); + } + if (TimeoutQueue) { + // we still have some elements in timeout queue, so schedule next wake up to tidy up + ScheduleWakeup(ctx); + } + + GenerateMessages(ctx); + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // RESULT PUBLISHING + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + const TDuration ResultPublishPeriod = 
TDuration::Seconds(15); + + void SchedulePublishResults(const TActorContext& ctx) { + ctx.Schedule(ResultPublishPeriod, new TEvPublishResults); + } + + void PublishResults(const TActorContext& ctx, bool schedule = true) { + const TInstant now = ctx.Now(); + + TStringStream msg; + + msg << "Load# '" << Params.Name << "'"; + + msg << " Throughput# "; + const TDuration duration = now - ThroughputFirstSample; + const ui64 traffic = *Traffic; + msg << "{window# " << duration + << " bytes# " << ThroughputBytes + << " samples# " << ThroughputSamples + << " b/s# " << ui64(ThroughputBytes * 1000000 / duration.MicroSeconds()) + << " common# " << ui64((traffic - TrafficAtBegin) * 1000000 / duration.MicroSeconds()) + << "}"; + ResetThroughput(now, traffic); + + msg << " RTT# "; + if (Histogram) { + const TDuration duration = Histogram.back().first - Histogram.front().first; + msg << "{window# " << duration << " samples# " << Histogram.size(); + TVector<TDuration> v; + v.reserve(Histogram.size()); + for (const auto& item : Histogram) { + v.push_back(item.second); + } + std::sort(v.begin(), v.end()); + for (double q : {0.5, 0.9, 0.99, 0.999, 0.9999, 1.0}) { + const size_t pos = q * (v.size() - 1); + msg << Sprintf(" %.4f# %s", q, v[pos].ToString().data()); + } + msg << "}"; + } else { + msg << "<empty>"; + } + + msg << " NumDropped# " << NumDropped; + + if (!schedule) { + msg << " final"; + } + + LOG_NOTICE(ctx, NActorsServices::INTERCONNECT_SPEED_TEST, "%s", msg.Str().data()); + + if (schedule) { + SchedulePublishResults(ctx); + } + } + + STRICT_STFUNC(QueryTrafficCounter, + HFunc(TEvTrafficCounter, Handle); + ) + + STRICT_STFUNC(StateFunc, + CFunc(TEvents::TSystem::PoisonPill, Die); + CFunc(TEvents::TSystem::Wakeup, HandleWakeup); + CFunc(EvPublishResults, PublishResults); + CFunc(EvGenerateMessages, GenerateMessages); + HFunc(TEvLoadMessage, Handle); + ) + + void Die(const TActorContext& ctx) override { + PublishResults(ctx, false); + TActorBootstrapped::Die(ctx); + } + }; + 
+ IActor* CreateLoadActor(const TLoadParams& params) { + return new TLoadActor(params); + } + +} diff --git a/library/cpp/actors/interconnect/load.h b/library/cpp/actors/interconnect/load.h new file mode 100644 index 0000000000..0a01a0dc04 --- /dev/null +++ b/library/cpp/actors/interconnect/load.h @@ -0,0 +1,24 @@ +#pragma once + +#include <library/cpp/actors/core/actor.h> + +namespace NInterconnect { + // load responder -- lives on every node as a service actor + NActors::IActor* CreateLoadResponderActor(); + NActors::TActorId MakeLoadResponderActorId(ui32 node); + + // load actor -- generates load with specific parameters + struct TLoadParams { + TString Name; + ui32 Channel; + TVector<ui32> NodeHops; // node ids for the message route + ui32 SizeMin, SizeMax; // min and max size for payloads + ui32 InFlyMax; // maximum number of in fly messages + TDuration IntervalMin, IntervalMax; // min and max intervals between sending messages + bool SoftLoad; // is the load soft? + TDuration Duration; // test duration + bool UseProtobufWithPayload; // store payload separately + }; + NActors::IActor* CreateLoadActor(const TLoadParams& params); + +} diff --git a/library/cpp/actors/interconnect/logging.h b/library/cpp/actors/interconnect/logging.h new file mode 100644 index 0000000000..c429d1cade --- /dev/null +++ b/library/cpp/actors/interconnect/logging.h @@ -0,0 +1,68 @@ +#pragma once + +#include <library/cpp/actors/core/log.h> +#include <library/cpp/actors/protos/services_common.pb.h> + +#define LOG_LOG_IC_X(component, marker, priority, ...) \ + do { \ + LOG_LOG(this->GetActorContext(), (priority), (component), "%s " marker " %s", LogPrefix.data(), Sprintf(__VA_ARGS__).data()); \ + } while (false) + +#define LOG_LOG_NET_X(priority, NODE_ID, FMT, ...) 
// Logs a printf-style message to the INTERCONNECT_NETWORK component with the given priority.
// The message is prefixed with "[<id of the current node> <-> NODE_ID]"; the actor context is
// taken from the current activation context rather than from `this`.
// NOTE(review): FMT and __VA_ARGS__ are forwarded as `Sprintf(FMT, __VA_ARGS__)`, so it looks
// like at least one format argument must be supplied -- confirm against the toolchain in use.
#define LOG_LOG_NET(priority, NODE_ID, FMT, ...) \
    do { \
        const TActorContext& ctx = ::NActors::TActivationContext::AsActorContext(); \
        LOG_LOG(ctx, (priority), ::NActorsServices::INTERCONNECT_NETWORK, "[%" PRIu32 " <-> %" PRIu32 "] %s", \
            ctx.SelfID.NodeId(), (NODE_ID), Sprintf(FMT, __VA_ARGS__).data()); \
    } while (false)
LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_EMER, __VA_ARGS__) +#define LOG_ALERT_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_ALERT, __VA_ARGS__) +#define LOG_CRIT_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_CRIT, __VA_ARGS__) +#define LOG_ERROR_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_ERROR, __VA_ARGS__) +#define LOG_WARN_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_WARN, __VA_ARGS__) +#define LOG_NOTICE_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_NOTICE, __VA_ARGS__) +#define LOG_INFO_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_INFO, __VA_ARGS__) +#define LOG_DEBUG_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_DEBUG, __VA_ARGS__) + +#define LOG_NOTICE_NET(NODE_ID, FMT, ...) LOG_LOG_NET(::NActors::NLog::PRI_NOTICE, NODE_ID, FMT, __VA_ARGS__) +#define LOG_DEBUG_NET(NODE_ID, FMT, ...) 
LOG_LOG_NET(::NActors::NLog::PRI_DEBUG, NODE_ID, FMT, __VA_ARGS__) + +namespace NActors { + class TInterconnectLoggingBase { + protected: + const TString LogPrefix; + + public: + TInterconnectLoggingBase() = default; + + TInterconnectLoggingBase(const TString& prefix) + : LogPrefix(prefix) + { + } + + void SetPrefix(TString logPrefix) const { + logPrefix.swap(const_cast<TString&>(LogPrefix)); + } + }; +} diff --git a/library/cpp/actors/interconnect/mock/ic_mock.cpp b/library/cpp/actors/interconnect/mock/ic_mock.cpp new file mode 100644 index 0000000000..884503e602 --- /dev/null +++ b/library/cpp/actors/interconnect/mock/ic_mock.cpp @@ -0,0 +1,298 @@ +#include "ic_mock.h" +#include <library/cpp/actors/core/interconnect.h> +#include <util/system/yield.h> +#include <thread> + +namespace NActors { + + class TInterconnectMock::TImpl { + enum { + EvInject = EventSpaceBegin(TEvents::ES_PRIVATE), + EvCheckSession, + EvRam, + }; + + struct TEvInject : TEventLocal<TEvInject, EvInject> { + std::deque<std::unique_ptr<IEventHandle>> Messages; + const TScopeId OriginScopeId; + const ui64 SenderSessionId; + + TEvInject(std::deque<std::unique_ptr<IEventHandle>>&& messages, const TScopeId& originScopeId, ui64 senderSessionId) + : Messages(std::move(messages)) + , OriginScopeId(originScopeId) + , SenderSessionId(senderSessionId) + {} + }; + + class TProxyMockActor; + + class TConnectionState { + struct TPeerInfo { + TRWMutex Mutex; + TActorSystem *ActorSystem = nullptr; + TActorId ProxyId; + }; + + const ui64 Key; + TPeerInfo PeerInfo[2]; + std::atomic_uint64_t SessionId = 0; + + public: + TConnectionState(ui64 key) + : Key(key) + {} + + void Attach(ui32 nodeId, TActorSystem *as, const TActorId& actorId) { + TPeerInfo *peer = GetPeer(nodeId); + auto guard = TWriteGuard(peer->Mutex); + Y_VERIFY(!peer->ActorSystem); + peer->ActorSystem = as; + peer->ProxyId = actorId; + as->DeferPreStop([peer] { + auto guard = TWriteGuard(peer->Mutex); + peer->ActorSystem = nullptr; + }); + } + + void 
Inject(ui32 peerNodeId, std::deque<std::unique_ptr<IEventHandle>>&& messages, + const TScopeId& originScopeId, ui64 senderSessionId) { + TPeerInfo *peer = GetPeer(peerNodeId); + auto guard = TReadGuard(peer->Mutex); + if (peer->ActorSystem) { + peer->ActorSystem->Send(new IEventHandle(peer->ProxyId, TActorId(), new TEvInject(std::move(messages), + originScopeId, senderSessionId))); + } else { + for (auto&& ev : messages) { + TActivationContext::Send(ev->ForwardOnNondelivery(TEvents::TEvUndelivered::Disconnected)); + } + } + } + + ui64 GetValidSessionId() const { + return SessionId; + } + + void InvalidateSessionId(ui32 peerNodeId) { + ++SessionId; + TPeerInfo *peer = GetPeer(peerNodeId); + auto guard = TReadGuard(peer->Mutex); + if (peer->ActorSystem) { + peer->ActorSystem->Send(new IEventHandle(EvCheckSession, 0, peer->ProxyId, {}, nullptr, 0)); + } + } + + private: + TPeerInfo *GetPeer(ui32 nodeId) { + if (nodeId == ui32(Key)) { + return PeerInfo; + } else if (nodeId == ui32(Key >> 32)) { + return PeerInfo + 1; + } else { + Y_FAIL(); + } + } + }; + + class TProxyMockActor : public TActor<TProxyMockActor> { + class TSessionMockActor : public TActor<TSessionMockActor> { + std::map<TActorId, ui64> Subscribers; + TProxyMockActor* const Proxy; + std::deque<std::unique_ptr<IEventHandle>> Queue; + + public: + const ui64 SessionId; + + public: + TSessionMockActor(TProxyMockActor *proxy, ui64 sessionId) + : TActor(&TThis::StateFunc) + , Proxy(proxy) + , SessionId(sessionId) + {} + + void Terminate() { + for (auto&& ev : std::exchange(Queue, {})) { + TActivationContext::Send(ev->ForwardOnNondelivery(TEvents::TEvUndelivered::Disconnected)); + } + for (const auto& kv : Subscribers) { + Send(kv.first, new TEvInterconnect::TEvNodeDisconnected(Proxy->PeerNodeId), 0, kv.second); + } + Y_VERIFY(Proxy->Session == this); + Proxy->Session = nullptr; + PassAway(); + } + + void HandleForward(TAutoPtr<IEventHandle> ev) { + if (ev->Flags & IEventHandle::FlagSubscribeOnSession) { + 
Subscribe(ev->Sender, ev->Cookie); + } + if (Queue.empty()) { + TActivationContext::Send(new IEventHandle(EvRam, 0, SelfId(), {}, {}, 0)); + } + Queue.emplace_back(ev.Release()); + } + + void HandleRam() { + if (SessionId != Proxy->State.GetValidSessionId()) { + Terminate(); + } else { + Proxy->PeerInject(std::exchange(Queue, {})); + } + } + + void Handle(TEvInterconnect::TEvConnectNode::TPtr ev) { + Subscribe(ev->Sender, ev->Cookie); + } + + void Handle(TEvents::TEvSubscribe::TPtr ev) { + Subscribe(ev->Sender, ev->Cookie); + } + + void Handle(TEvents::TEvUnsubscribe::TPtr ev) { + Subscribers.erase(ev->Sender); + } + + void HandlePoison() { + Proxy->Disconnect(); + } + + STRICT_STFUNC(StateFunc, + fFunc(TEvInterconnect::EvForward, HandleForward) + hFunc(TEvInterconnect::TEvConnectNode, Handle) + hFunc(TEvents::TEvSubscribe, Handle) + hFunc(TEvents::TEvUnsubscribe, Handle) + cFunc(TEvents::TSystem::Poison, HandlePoison) + cFunc(EvRam, HandleRam) + ) + + private: + void Subscribe(const TActorId& actorId, ui64 cookie) { + Subscribers[actorId] = cookie; + Send(actorId, new TEvInterconnect::TEvNodeConnected(Proxy->PeerNodeId), 0, cookie); + } + }; + + friend class TSessionMockActor; + + const ui32 NodeId; + const ui32 PeerNodeId; + TConnectionState& State; + const TInterconnectProxyCommon::TPtr Common; + TSessionMockActor *Session = nullptr; + + public: + TProxyMockActor(ui32 nodeId, ui32 peerNodeId, TConnectionState& state, TInterconnectProxyCommon::TPtr common) + : TActor(&TThis::StateFunc) + , NodeId(nodeId) + , PeerNodeId(peerNodeId) + , State(state) + , Common(std::move(common)) + {} + + void Registered(TActorSystem *as, const TActorId& parent) override { + TActor::Registered(as, parent); + State.Attach(NodeId, as, SelfId()); + } + + void Handle(TEvInject::TPtr ev) { + auto *msg = ev->Get(); + if (Session && Session->SessionId != msg->SenderSessionId) { + return; // drop messages from other sessions + } + if (auto *session = GetSession()) { + for (auto&& ev : 
ev->Get()->Messages) { + auto fw = std::make_unique<IEventHandle>( + session->SelfId(), + ev->Type, + ev->Flags & ~IEventHandle::FlagForwardOnNondelivery, + ev->Recipient, + ev->Sender, + ev->ReleaseChainBuffer(), + ev->Cookie, + msg->OriginScopeId, + std::move(ev->TraceId) + ); + if (!Common->EventFilter || Common->EventFilter->CheckIncomingEvent(*fw, Common->LocalScopeId)) { + TActivationContext::Send(fw.release()); + } + } + } + } + + void PassAway() override { + Disconnect(); + TActor::PassAway(); + } + + TSessionMockActor *GetSession() { + CheckSession(); + if (!Session) { + Session = new TSessionMockActor(this, State.GetValidSessionId()); + RegisterWithSameMailbox(Session); + } + return Session; + } + + void HandleSessionEvent(TAutoPtr<IEventHandle> ev) { + auto *session = GetSession(); + InvokeOtherActor(*session, &TSessionMockActor::Receive, ev, + TActivationContext::ActorContextFor(session->SelfId())); + } + + void Disconnect() { + State.InvalidateSessionId(PeerNodeId); + if (Session) { + Session->Terminate(); + } + } + + void CheckSession() { + if (Session && Session->SessionId != State.GetValidSessionId()) { + Session->Terminate(); + } + } + + void PeerInject(std::deque<std::unique_ptr<IEventHandle>>&& messages) { + Y_VERIFY(Session); + return State.Inject(PeerNodeId, std::move(messages), Common->LocalScopeId, Session->SessionId); + } + + STRICT_STFUNC(StateFunc, + cFunc(TEvents::TSystem::Poison, PassAway) + fFunc(TEvInterconnect::EvForward, HandleSessionEvent) + fFunc(TEvInterconnect::EvConnectNode, HandleSessionEvent) + fFunc(TEvents::TSystem::Subscribe, HandleSessionEvent) + fFunc(TEvents::TSystem::Unsubscribe, HandleSessionEvent) + cFunc(TEvInterconnect::EvDisconnect, Disconnect) + IgnoreFunc(TEvInterconnect::TEvClosePeerSocket) + IgnoreFunc(TEvInterconnect::TEvCloseInputSession) + cFunc(TEvInterconnect::EvPoisonSession, Disconnect) + hFunc(TEvInject, Handle) + cFunc(EvCheckSession, CheckSession) + ) + }; + + std::unordered_map<ui64, 
TConnectionState> States; + + public: + IActor *CreateProxyMock(ui32 nodeId, ui32 peerNodeId, TInterconnectProxyCommon::TPtr common) { + Y_VERIFY(nodeId != peerNodeId); + Y_VERIFY(nodeId); + Y_VERIFY(peerNodeId); + const ui64 key = std::min(nodeId, peerNodeId) | ui64(std::max(nodeId, peerNodeId)) << 32; + auto it = States.try_emplace(key, key).first; + return new TProxyMockActor(nodeId, peerNodeId, it->second, std::move(common)); + } + }; + + TInterconnectMock::TInterconnectMock() + : Impl(std::make_unique<TImpl>()) + {} + + TInterconnectMock::~TInterconnectMock() + {} + + IActor *TInterconnectMock::CreateProxyMock(ui32 nodeId, ui32 peerNodeId, TInterconnectProxyCommon::TPtr common) { + return Impl->CreateProxyMock(nodeId, peerNodeId, std::move(common)); + } + +} // NActors diff --git a/library/cpp/actors/interconnect/mock/ic_mock.h b/library/cpp/actors/interconnect/mock/ic_mock.h new file mode 100644 index 0000000000..636bdc2b7f --- /dev/null +++ b/library/cpp/actors/interconnect/mock/ic_mock.h @@ -0,0 +1,19 @@ +#pragma once + +#include <library/cpp/actors/core/actor.h> + +#include <library/cpp/actors/interconnect/interconnect_common.h> + +namespace NActors { + + class TInterconnectMock { + class TImpl; + std::unique_ptr<TImpl> Impl; + + public: + TInterconnectMock(); + ~TInterconnectMock(); + IActor *CreateProxyMock(ui32 nodeId, ui32 peerNodeId, TInterconnectProxyCommon::TPtr common); + }; + +} // NActors diff --git a/library/cpp/actors/interconnect/mock/tsan.supp b/library/cpp/actors/interconnect/mock/tsan.supp new file mode 100644 index 0000000000..19fd059419 --- /dev/null +++ b/library/cpp/actors/interconnect/mock/tsan.supp @@ -0,0 +1 @@ +deadlock:Attach diff --git a/library/cpp/actors/interconnect/mock/ya.make b/library/cpp/actors/interconnect/mock/ya.make new file mode 100644 index 0000000000..19a2834162 --- /dev/null +++ b/library/cpp/actors/interconnect/mock/ya.make @@ -0,0 +1,16 @@ +LIBRARY() + +OWNER(alexvru) + +SRCS( + ic_mock.cpp + ic_mock.h +) + 
// Takes over the contents of `ev` and prepares this holder for sending.
// Copies the routing fields into Descr, serializes the trace id, remembers the
// forward-on-nondelivery recipient and resets Serial, EventActuallySerialized and the checksum.
// The payload is moved out of `ev`: an already serialized buffer is preferred, otherwise the
// event object itself is taken.
// Returns the serialized size of the payload (0 when the handle carries neither).
ui32 TEventHolder::Fill(IEventHandle& ev) {
    Serial = 0;
    Descr.Type = ev.Type;
    Descr.Flags = ev.Flags;
    Descr.Recipient = ev.Recipient;
    Descr.Sender = ev.Sender;
    Descr.Cookie = ev.Cookie;
    ev.TraceId.Serialize(&Descr.TraceId);
    ForwardRecipient = ev.GetForwardOnNondeliveryRecipient();
    EventActuallySerialized = 0;
    Descr.Checksum = 0;

    if (ev.HasBuffer()) {
        // event is already serialized -- take the buffer as is
        Buffer = ev.ReleaseChainBuffer();
        EventSerializedSize = Buffer->GetSize();
    } else if (ev.HasEvent()) {
        // keep the event object; only its size is calculated here
        Event.Reset(ev.ReleaseBase());
        EventSerializedSize = Event->CalculateSerializedSize();
    } else {
        EventSerializedSize = 0;
    }

    return EventSerializedSize;
}
// Extends `checksum` with CRC32C of [data, data + len).
// Under MemorySanitizer the range, widened outwards to 16-byte boundaries on both sides, is
// unpoisoned first.
// NOTE(review): presumably the CRC implementation reads whole aligned 16-byte blocks and would
// otherwise trigger MSan reports on the padding bytes -- confirm against Crc32cExtend.
Y_FORCE_INLINE ui32 Crc32cExtendMSanCompatible(ui32 checksum, const void *data, size_t len) {
    if constexpr (NSan::MSanIsOn()) {
        const char *begin = static_cast<const char*>(data);
        const char *end = begin + len;
        begin -= reinterpret_cast<uintptr_t>(begin) & 15; // round down to a 16-byte boundary
        end += -reinterpret_cast<uintptr_t>(end) & 15;    // round up to a 16-byte boundary
        NSan::Unpoison(begin, end - begin);
    }
    return Crc32cExtend(checksum, data, len);
}
// Rebuilds an event handle from the stored descriptor and payload and sends the nondelivery
// notification for it with reason Disconnected. `unsure` is passed through to
// IEventHandle::ForwardOnNondelivery unchanged.
// The payload is consumed: Event is released when present, otherwise Buffer is moved out.
void ForwardOnNondelivery(bool unsure) {
    TEventDescr& d = Descr;
    const TActorId& r = d.Recipient;
    const TActorId& s = d.Sender;
    // pass the forward recipient only when one was set
    const TActorId *f = ForwardRecipient ? &ForwardRecipient : nullptr;
    auto ev = Event
        ? std::make_unique<IEventHandle>(r, s, Event.Release(), d.Flags, d.Cookie, f, NWilson::TTraceId(d.TraceId))
        : std::make_unique<IEventHandle>(d.Type, d.Flags, r, s, std::move(Buffer), d.Cookie, f, NWilson::TTraceId(d.TraceId));
    NActors::TActivationContext::Send(ev->ForwardOnNondelivery(NActors::TEvents::TEvUndelivered::Disconnected, unsure));
}
Packet.v2.Data : Packet.v1.Data; + End = FreeArea + TTcpPacketBuf::PacketDataLen; + Orbit.Reset(); + } + + bool IsEmpty() const { + return !DataSize; + } + + void SetMetadata(ui64 serial, ui64 confirm) { + ApplyToHeader([&](auto& header) { + header.Serial = serial; + header.Confirm = confirm; + }); + } + + void UpdateConfirmIfPossible(ui64 confirm) { + // we don't want to recalculate whole packet checksum for single confirmation update on v2 + if (!Params.UseModernFrame && IsAtBegin() && confirm != Packet.v1.Header.Confirm) { + Packet.v1.Header.Confirm = confirm; + Packet.v1.Header.Sign(); + } + } + + size_t GetDataSize() const { return DataSize; } + + ui64 GetSerial() const { + return ApplyToHeader([](auto& header) { return header.Serial; }); + } + + bool Confirmed(ui64 confirm) const { + return ApplyToHeader([&](auto& header) { return IsEmpty() || header.Serial <= confirm; }); + } + + void *GetFreeArea() { + return FreeArea; + } + + size_t GetVirtualFreeAmount() const { + return TTcpPacketBuf::PacketDataLen - DataSize; + } + + void AppendBuf(const void *buf, size_t size) { + DataSize += size; + Y_VERIFY_DEBUG(DataSize <= TTcpPacketBuf::PacketDataLen, "DataSize# %zu AppendBuf buf# %p size# %zu" + " FreeArea# %p End# %p", DataSize, buf, size, FreeArea, End); + + if (Bufs && static_cast<const char*>(Bufs.back().Data) + Bufs.back().Size == buf) { + Bufs.back().Size += size; + } else { + Bufs.push_back({buf, size}); + } + + if (buf >= FreeArea && buf < End) { + Y_VERIFY_DEBUG(buf == FreeArea); + FreeArea = const_cast<char*>(static_cast<const char*>(buf)) + size; + Y_VERIFY_DEBUG(FreeArea <= End); + } + } + + void Undo(size_t size) { + Y_VERIFY(Bufs); + auto& buf = Bufs.back(); + Y_VERIFY(buf.Data == FreeArea - buf.Size); + buf.Size -= size; + if (!buf.Size) { + Bufs.pop_back(); + } + FreeArea -= size; + DataSize -= size; + } + + bool DropBufs(size_t& amount) { + while (BufferIndex != Bufs.size()) { + TConstIoVec& item = Bufs[BufferIndex]; + // calculate number of 
bytes to the end in current buffer + const size_t remain = item.Size - FirstBufferOffset; + if (amount >= remain) { + // vector item completely fits into the received amount, drop it out and switch to next buffer + amount -= remain; + ++BufferIndex; + FirstBufferOffset = 0; + } else { + // adjust first buffer by "amount" bytes forward and reset amount to zero + FirstBufferOffset += amount; + amount = 0; + // return false meaning that we have some more data to send + return false; + } + } + return true; + } + + void ResetBufs() { + BufferIndex = FirstBufferOffset = 0; + TriedWriting = false; + } + + template <typename TVectorType> + void AppendToIoVector(TVectorType& vector, size_t max) { + for (size_t k = BufferIndex, offset = FirstBufferOffset; k != Bufs.size() && vector.size() < max; ++k, offset = 0) { + TConstIoVec v = Bufs[k]; + v.Data = static_cast<const char*>(v.Data) + offset; + v.Size -= offset; + vector.push_back(v); + } + } + + void Sign() { + if (Params.UseModernFrame) { + Packet.v2.Header.Checksum = 0; + Packet.v2.Header.PayloadLength = DataSize; + if (!Params.Encryption) { + ui32 sum = 0; + for (const auto& item : Bufs) { + sum = Crc32cExtendMSanCompatible(sum, item.Data, item.Size); + } + Packet.v2.Header.Checksum = sum; + } + } else { + Y_VERIFY(!Bufs.empty()); + auto it = Bufs.begin(); + static constexpr size_t headerLen = sizeof(TTcpPacketHeader_v1); + Y_VERIFY(it->Data == &Packet.v1.Header && it->Size >= headerLen); + ui32 sum = Crc32cExtendMSanCompatible(0, Packet.v1.Data, it->Size - headerLen); + while (++it != Bufs.end()) { + sum = Crc32cExtendMSanCompatible(sum, it->Data, it->Size); + } + + Packet.v1.Header.PayloadCRC32 = sum; + Packet.v1.Header.DataSize = DataSize; + Packet.v1.Header.Sign(); + } + } +}; diff --git a/library/cpp/actors/interconnect/poller.h b/library/cpp/actors/interconnect/poller.h new file mode 100644 index 0000000000..ff7979369f --- /dev/null +++ b/library/cpp/actors/interconnect/poller.h @@ -0,0 +1,23 @@ +#pragma once + 
// Per-socket registration record shared between the poller thread and the poller token.
struct TSocketRecord : TThrRefBase {
    const TIntrusivePtr<TSharedDescriptor> Socket; // descriptor being polled
    const TActorId ReadActorId;                    // actor notified about read readiness
    const TActorId WriteActorId;                   // actor notified about write readiness
    std::atomic_uint32_t Flags = 0;                // NOTE(review): meaning is defined by the platform poller -- not visible here

    // Builds the record from a registration request; the socket pointer is moved out of `ev`.
    TSocketRecord(TEvPollerRegister& ev)
        : Socket(std::move(ev.Socket))
        , ReadActorId(ev.ReadActorId)
        , WriteActorId(ev.WriteActorId)
    {}
};
TPollerThreadBase : public ISimpleThread { + protected: + struct TPollerExitThread {}; // issued then we need to terminate the poller thread + + struct TPollerWakeup {}; + + struct TPollerUnregisterSocket { + TIntrusivePtr<TSharedDescriptor> Socket; + + TPollerUnregisterSocket(TIntrusivePtr<TSharedDescriptor> socket) + : Socket(std::move(socket)) + {} + }; + + using TPollerSyncOperation = std::variant<TPollerExitThread, TPollerWakeup, TPollerUnregisterSocket>; + + struct TPollerSyncOperationWrapper { + TPollerSyncOperation Operation; + TManualEvent Event; + + TPollerSyncOperationWrapper(TPollerSyncOperation&& operation) + : Operation(std::move(operation)) + {} + + void Wait() { + Event.WaitI(); + } + + void SignalDone() { + Event.Signal(); + } + }; + + TActorSystem *ActorSystem; + TPipeHandle ReadEnd, WriteEnd; // pipe for sync event processor + TFunnelQueue<TPollerSyncOperationWrapper*> SyncOperationsQ; // operation queue + + public: + TPollerThreadBase(TActorSystem *actorSystem) + : ActorSystem(actorSystem) + { + // create a pipe for notifications + try { + TPipeHandle::Pipe(ReadEnd, WriteEnd, CloseOnExec); + } catch (const TFileError& err) { + Y_FAIL("failed to create pipe"); + } + + // switch the read/write ends to nonblocking mode + SetNonBlock(ReadEnd); + SetNonBlock(WriteEnd); + } + + void UnregisterSocket(const TIntrusivePtr<TSocketRecord>& record) { + ExecuteSyncOperation(TPollerUnregisterSocket(record->Socket)); + } + + protected: + void Notify(TSocketRecord *record, bool read, bool write) { + auto issue = [&](const TActorId& recipient) { + ActorSystem->Send(new IEventHandle(recipient, {}, new TEvPollerReady(record->Socket, read, write))); + }; + if (read && record->ReadActorId) { + issue(record->ReadActorId); + if (write && record->WriteActorId && record->WriteActorId != record->ReadActorId) { + issue(record->WriteActorId); + } + } else if (write && record->WriteActorId) { + issue(record->WriteActorId); + } + } + + void Stop() { + // signal poller thread 
to stop and wait for the thread + ExecuteSyncOperation(TPollerExitThread()); + ISimpleThread::Join(); + } + + void ExecuteSyncOperation(TPollerSyncOperation&& op) { + TPollerSyncOperationWrapper wrapper(std::move(op)); + if (SyncOperationsQ.Push(&wrapper)) { + // this was the first entry, so we push notification through the pipe + for (;;) { + char buffer = '\x00'; + ssize_t nwritten = WriteEnd.Write(&buffer, sizeof(buffer)); + if (nwritten < 0) { + const int err = LastSocketError(); + if (err == EINTR) { + continue; + } else { + Y_FAIL("WriteEnd.Write() failed with %s", strerror(err)); + } + } else { + Y_VERIFY(nwritten); + break; + } + } + } + // wait for operation to complete + wrapper.Wait(); + } + + bool DrainReadEnd() { + size_t totalRead = 0; + char buffer[4096]; + for (;;) { + ssize_t n = ReadEnd.Read(buffer, sizeof(buffer)); + if (n < 0) { + const int error = LastSocketError(); + if (error == EINTR) { + continue; + } else if (error == EAGAIN || error == EWOULDBLOCK) { + break; + } else { + Y_FAIL("read() failed with %s", strerror(errno)); + } + } else { + Y_VERIFY(n); + totalRead += n; + } + } + return totalRead; + } + + bool ProcessSyncOpQueue() { + if (DrainReadEnd()) { + Y_VERIFY(!SyncOperationsQ.IsEmpty()); + do { + TPollerSyncOperationWrapper *op = SyncOperationsQ.Top(); + if (auto *unregister = std::get_if<TPollerUnregisterSocket>(&op->Operation)) { + static_cast<TDerived&>(*this).UnregisterSocketInLoop(unregister->Socket); + op->SignalDone(); + } else if (std::get_if<TPollerExitThread>(&op->Operation)) { + op->SignalDone(); + return false; // terminate the thread + } else if (std::get_if<TPollerWakeup>(&op->Operation)) { + op->SignalDone(); + } else { + Y_FAIL(); + } + } while (SyncOperationsQ.Pop()); + } + return true; + } + + void *ThreadProc() override { + SetCurrentThreadName("network poller"); + while (ProcessSyncOpQueue()) { + static_cast<TDerived&>(*this).ProcessEventsInLoop(); + } + return nullptr; + } + }; + +} // namespace NActors + +#if 
defined(_linux_) +# include "poller_actor_linux.h" +#elif defined(_darwin_) +# include "poller_actor_darwin.h" +#elif defined(_win_) +# include "poller_actor_win.h" +#else +# error "Unsupported platform" +#endif + +namespace NActors { + + class TPollerToken::TImpl { + std::weak_ptr<TPollerThread> Thread; + TIntrusivePtr<TSocketRecord> Record; // valid only when Thread is held locked + + public: + TImpl(std::shared_ptr<TPollerThread> thread, TIntrusivePtr<TSocketRecord> record) + : Thread(thread) + , Record(std::move(record)) + { + thread->RegisterSocket(Record); + } + + ~TImpl() { + if (auto thread = Thread.lock()) { + thread->UnregisterSocket(Record); + } + } + + void Request(bool read, bool write) { + if (auto thread = Thread.lock()) { + thread->Request(Record, read, write); + } + } + + const TIntrusivePtr<TSharedDescriptor>& Socket() const { + return Record->Socket; + } + }; + + class TPollerActor: public TActorBootstrapped<TPollerActor> { + // poller thread + std::shared_ptr<TPollerThread> PollerThread; + + public: + static constexpr IActor::EActivityType ActorActivityType() { + return IActor::INTERCONNECT_POLLER; + } + + void Bootstrap() { + PollerThread = std::make_shared<TPollerThread>(TlsActivationContext->ExecutorThread.ActorSystem); + Become(&TPollerActor::StateFunc); + } + + STRICT_STFUNC(StateFunc, + hFunc(TEvPollerRegister, Handle); + cFunc(TEvents::TSystem::Poison, PassAway); + ) + + void Handle(TEvPollerRegister::TPtr& ev) { + auto *msg = ev->Get(); + auto impl = std::make_unique<TPollerToken::TImpl>(PollerThread, MakeIntrusive<TSocketRecord>(*msg)); + auto socket = impl->Socket(); + TPollerToken::TPtr token(new TPollerToken(std::move(impl))); + if (msg->ReadActorId && msg->WriteActorId && msg->WriteActorId != msg->ReadActorId) { + Send(msg->ReadActorId, new TEvPollerRegisterResult(socket, token)); + Send(msg->WriteActorId, new TEvPollerRegisterResult(socket, std::move(token))); + } else if (msg->ReadActorId) { + Send(msg->ReadActorId, new 
TEvPollerRegisterResult(socket, std::move(token))); + } else if (msg->WriteActorId) { + Send(msg->WriteActorId, new TEvPollerRegisterResult(socket, std::move(token))); + } + } + }; + + TPollerToken::TPollerToken(std::unique_ptr<TImpl> impl) + : Impl(std::move(impl)) + {} + + TPollerToken::~TPollerToken() + {} + + void TPollerToken::Request(bool read, bool write) { + Impl->Request(read, write); + } + + IActor* CreatePollerActor() { + return new TPollerActor; + } + +} diff --git a/library/cpp/actors/interconnect/poller_actor.h b/library/cpp/actors/interconnect/poller_actor.h new file mode 100644 index 0000000000..f927b82089 --- /dev/null +++ b/library/cpp/actors/interconnect/poller_actor.h @@ -0,0 +1,63 @@ +#pragma once + +#include "events_local.h" +#include "poller.h" +#include <library/cpp/actors/core/actor.h> + +namespace NActors { + struct TEvPollerRegister : TEventLocal<TEvPollerRegister, ui32(ENetwork::EvPollerRegister)> { + const TIntrusivePtr<TSharedDescriptor> Socket; // socket to watch for + const TActorId ReadActorId; // actor id to notify about read availability + const TActorId WriteActorId; // actor id to notify about write availability; may be the same as the ReadActorId + + TEvPollerRegister(TIntrusivePtr<TSharedDescriptor> socket, const TActorId& readActorId, const TActorId& writeActorId) + : Socket(std::move(socket)) + , ReadActorId(readActorId) + , WriteActorId(writeActorId) + {} + }; + + // poller token is sent in response to TEvPollerRegister; it allows requesting poll when read/write returns EAGAIN + class TPollerToken : public TThrRefBase { + class TImpl; + std::unique_ptr<TImpl> Impl; + + friend class TPollerActor; + TPollerToken(std::unique_ptr<TImpl> impl); + + public: + ~TPollerToken(); + void Request(bool read, bool write); + + using TPtr = TIntrusivePtr<TPollerToken>; + }; + + struct TEvPollerRegisterResult : TEventLocal<TEvPollerRegisterResult, ui32(ENetwork::EvPollerRegisterResult)> { + TIntrusivePtr<TSharedDescriptor> Socket; + 
TPollerToken::TPtr PollerToken; + + TEvPollerRegisterResult(TIntrusivePtr<TSharedDescriptor> socket, TPollerToken::TPtr pollerToken) + : Socket(std::move(socket)) + , PollerToken(std::move(pollerToken)) + {} + }; + + struct TEvPollerReady : TEventLocal<TEvPollerReady, ui32(ENetwork::EvPollerReady)> { + TIntrusivePtr<TSharedDescriptor> Socket; + const bool Read, Write; + + TEvPollerReady(TIntrusivePtr<TSharedDescriptor> socket, bool read, bool write) + : Socket(std::move(socket)) + , Read(read) + , Write(write) + {} + }; + + IActor* CreatePollerActor(); + + inline TActorId MakePollerActorId() { + char x[12] = {'I', 'C', 'P', 'o', 'l', 'l', 'e', 'r', '\xDE', '\xAD', '\xBE', '\xEF'}; + return TActorId(0, TStringBuf(std::begin(x), std::end(x))); + } + +} diff --git a/library/cpp/actors/interconnect/poller_actor_darwin.h b/library/cpp/actors/interconnect/poller_actor_darwin.h new file mode 100644 index 0000000000..4cb0a58f8d --- /dev/null +++ b/library/cpp/actors/interconnect/poller_actor_darwin.h @@ -0,0 +1,95 @@ +#pragma once + +#include <sys/event.h> + +namespace NActors { + + class TKqueueThread : public TPollerThreadBase<TKqueueThread> { + // KQueue file descriptor + int KqDescriptor; + + void SafeKevent(const struct kevent* ev, int size) { + int rc; + do { + rc = kevent(KqDescriptor, ev, size, nullptr, 0, nullptr); + } while (rc == -1 && errno == EINTR); + Y_VERIFY(rc != -1, "kevent() failed with %s", strerror(errno)); + } + + public: + TKqueueThread(TActorSystem *actorSystem) + : TPollerThreadBase(actorSystem) + { + // create kqueue + KqDescriptor = kqueue(); + Y_VERIFY(KqDescriptor != -1, "kqueue() failed with %s", strerror(errno)); + + // set close-on-exit flag + { + int flags = fcntl(KqDescriptor, F_GETFD); + Y_VERIFY(flags >= 0, "fcntl(F_GETFD) failed with %s", strerror(errno)); + int rc = fcntl(KqDescriptor, F_SETFD, flags | FD_CLOEXEC); + Y_VERIFY(rc != -1, "fcntl(F_SETFD, +FD_CLOEXEC) failed with %s", strerror(errno)); + } + + // register pipe's read end 
in poller + struct kevent ev; + EV_SET(&ev, (int)ReadEnd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, nullptr); + SafeKevent(&ev, 1); + + ISimpleThread::Start(); // start poller thread + } + + ~TKqueueThread() { + Stop(); + close(KqDescriptor); + } + + void ProcessEventsInLoop() { + std::array<struct kevent, 256> events; + + int numReady = kevent(KqDescriptor, nullptr, 0, events.data(), events.size(), nullptr); + if (numReady == -1) { + if (errno == EINTR) { + return; + } else { + Y_FAIL("kevent() failed with %s", strerror(errno)); + } + } + + for (int i = 0; i < numReady; ++i) { + const struct kevent& ev = events[i]; + if (ev.udata) { + TSocketRecord *it = static_cast<TSocketRecord*>(ev.udata); + const bool error = ev.flags & (EV_EOF | EV_ERROR); + const bool read = error || ev.filter == EVFILT_READ; + const bool write = error || ev.filter == EVFILT_WRITE; + Notify(it, read, write); + } + } + } + + void UnregisterSocketInLoop(const TIntrusivePtr<TSharedDescriptor>& socket) { + struct kevent ev[2]; + const int fd = socket->GetDescriptor(); + EV_SET(&ev[0], fd, EVFILT_READ, EV_DELETE, 0, 0, nullptr); + EV_SET(&ev[1], fd, EVFILT_WRITE, EV_DELETE, 0, 0, nullptr); + SafeKevent(ev, 2); + } + + void RegisterSocket(const TIntrusivePtr<TSocketRecord>& record) { + int flags = EV_ADD | EV_CLEAR | EV_ENABLE; + struct kevent ev[2]; + const int fd = record->Socket->GetDescriptor(); + EV_SET(&ev[0], fd, EVFILT_READ, flags, 0, 0, record.Get()); + EV_SET(&ev[1], fd, EVFILT_WRITE, flags, 0, 0, record.Get()); + SafeKevent(ev, 2); + } + + void Request(const TIntrusivePtr<TSocketRecord>& /*socket*/, bool /*read*/, bool /*write*/) + {} // no special processing here as we use kqueue in edge-triggered mode + }; + + using TPollerThread = TKqueueThread; + +} diff --git a/library/cpp/actors/interconnect/poller_actor_linux.h b/library/cpp/actors/interconnect/poller_actor_linux.h new file mode 100644 index 0000000000..dd4f7c0124 --- /dev/null +++ 
b/library/cpp/actors/interconnect/poller_actor_linux.h @@ -0,0 +1,114 @@ +#pragma once + +#include <sys/epoll.h> + +namespace NActors { + + class TEpollThread : public TPollerThreadBase<TEpollThread> { + // epoll file descriptor + int EpollDescriptor; + + public: + TEpollThread(TActorSystem *actorSystem) + : TPollerThreadBase(actorSystem) + { + EpollDescriptor = epoll_create1(EPOLL_CLOEXEC); + Y_VERIFY(EpollDescriptor != -1, "epoll_create1() failed with %s", strerror(errno)); + + epoll_event event; + event.data.ptr = nullptr; + event.events = EPOLLIN; + if (epoll_ctl(EpollDescriptor, EPOLL_CTL_ADD, ReadEnd, &event) == -1) { + Y_FAIL("epoll_ctl(EPOLL_CTL_ADD) failed with %s", strerror(errno)); + } + + ISimpleThread::Start(); // start poller thread + } + + ~TEpollThread() { + Stop(); + close(EpollDescriptor); + } + + void ProcessEventsInLoop() { + // preallocated array for events + std::array<epoll_event, 256> events; + + // wait indefinitely for event to arrive + LWPROBE(EpollStartWaitIn); + int numReady = epoll_wait(EpollDescriptor, events.data(), events.size(), -1); + LWPROBE(EpollFinishWaitIn, numReady); + + // check return status for any errors + if (numReady == -1) { + if (errno == EINTR) { + return; // restart the call a bit later + } else { + Y_FAIL("epoll_wait() failed with %s", strerror(errno)); + } + } + + for (int i = 0; i < numReady; ++i) { + const epoll_event& ev = events[i]; + if (auto *record = static_cast<TSocketRecord*>(ev.data.ptr)) { + const bool read = ev.events & (EPOLLIN | EPOLLHUP | EPOLLRDHUP | EPOLLERR); + const bool write = ev.events & (EPOLLOUT | EPOLLERR); + + // remove hit flags from the bit set + ui32 flags = record->Flags; + const ui32 remove = (read ? EPOLLIN : 0) | (write ? 
EPOLLOUT : 0); + while (!record->Flags.compare_exchange_weak(flags, flags & ~remove)) + {} + flags &= ~remove; + + // rearm poller if some flags remain + if (flags) { + epoll_event event; + event.events = EPOLLONESHOT | EPOLLRDHUP | flags; + event.data.ptr = record; + if (epoll_ctl(EpollDescriptor, EPOLL_CTL_MOD, record->Socket->GetDescriptor(), &event) == -1) { + Y_FAIL("epoll_ctl(EPOLL_CTL_MOD) failed with %s", strerror(errno)); + } + } + + // issue notifications + Notify(record, read, write); + } + } + } + + void UnregisterSocketInLoop(const TIntrusivePtr<TSharedDescriptor>& socket) { + if (epoll_ctl(EpollDescriptor, EPOLL_CTL_DEL, socket->GetDescriptor(), nullptr) == -1) { + Y_FAIL("epoll_ctl(EPOLL_CTL_DEL) failed with %s", strerror(errno)); + } + } + + void RegisterSocket(const TIntrusivePtr<TSocketRecord>& record) { + epoll_event event; + event.events = EPOLLONESHOT | EPOLLRDHUP; + event.data.ptr = record.Get(); + if (epoll_ctl(EpollDescriptor, EPOLL_CTL_ADD, record->Socket->GetDescriptor(), &event) == -1) { + Y_FAIL("epoll_ctl(EPOLL_CTL_ADD) failed with %s", strerror(errno)); + } + } + + void Request(const TIntrusivePtr<TSocketRecord>& record, bool read, bool write) { + const ui32 add = (read ? EPOLLIN : 0) | (write ? 
EPOLLOUT : 0); + ui32 flags = record->Flags; + while (!record->Flags.compare_exchange_weak(flags, flags | add)) + {} + flags |= add; + if (flags) { + epoll_event event; + event.events = EPOLLONESHOT | EPOLLRDHUP | flags; + event.data.ptr = record.Get(); + if (epoll_ctl(EpollDescriptor, EPOLL_CTL_MOD, record->Socket->GetDescriptor(), &event) == -1) { + Y_FAIL("epoll_ctl(EPOLL_CTL_MOD) failed with %s", strerror(errno)); + } + } + } + }; + + using TPollerThread = TEpollThread; + +} // namespace NActors diff --git a/library/cpp/actors/interconnect/poller_actor_win.h b/library/cpp/actors/interconnect/poller_actor_win.h new file mode 100644 index 0000000000..4b4caa0ebd --- /dev/null +++ b/library/cpp/actors/interconnect/poller_actor_win.h @@ -0,0 +1,103 @@ +#pragma once + +namespace NActors { + + class TSelectThread : public TPollerThreadBase<TSelectThread> { + TMutex Mutex; + std::unordered_map<SOCKET, TIntrusivePtr<TSocketRecord>> Descriptors; + + enum { + READ = 1, + WRITE = 2, + }; + + public: + TSelectThread(TActorSystem *actorSystem) + : TPollerThreadBase(actorSystem) + { + Descriptors.emplace(ReadEnd, nullptr); + ISimpleThread::Start(); + } + + ~TSelectThread() { + Stop(); + } + + void ProcessEventsInLoop() { + fd_set readfds, writefds, exceptfds; + + FD_ZERO(&readfds); + FD_ZERO(&writefds); + FD_ZERO(&exceptfds); + int nfds = 0; + with_lock (Mutex) { + for (const auto& [key, record] : Descriptors) { + const int fd = key; + auto add = [&](auto& set) { + FD_SET(fd, &set); + nfds = Max<int>(nfds, fd + 1); + }; + if (!record || (record->Flags & READ)) { + add(readfds); + } + if (!record || (record->Flags & WRITE)) { + add(writefds); + } + add(exceptfds); + } + } + + int res = select(nfds, &readfds, &writefds, &exceptfds, nullptr); + if (res == -1) { + const int err = LastSocketError(); + if (err == EINTR) { + return; // try a bit later + } else { + Y_FAIL("select() failed with %s", strerror(err)); + } + } + + with_lock (Mutex) { + for (const auto& [fd, record] : 
Descriptors) { + if (record) { + const bool error = FD_ISSET(fd, &exceptfds); + const bool read = error || FD_ISSET(fd, &readfds); + const bool write = error || FD_ISSET(fd, &writefds); + if (read) { + record->Flags &= ~READ; + } + if (write) { + record->Flags &= ~WRITE; + } + Notify(record.Get(), read, write); + } + } + } + } + + void UnregisterSocketInLoop(const TIntrusivePtr<TSharedDescriptor>& socket) { + with_lock (Mutex) { + Descriptors.erase(socket->GetDescriptor()); + } + } + + void RegisterSocket(const TIntrusivePtr<TSocketRecord>& record) { + with_lock (Mutex) { + Descriptors.emplace(record->Socket->GetDescriptor(), record); + } + ExecuteSyncOperation(TPollerWakeup()); + } + + void Request(const TIntrusivePtr<TSocketRecord>& record, bool read, bool write) { + with_lock (Mutex) { + const auto it = Descriptors.find(record->Socket->GetDescriptor()); + Y_VERIFY(it != Descriptors.end()); + it->second->Flags |= (read ? READ : 0) | (write ? WRITE : 0); + } + ExecuteSyncOperation(TPollerWakeup()); + } + }; + + using TPollerThread = TSelectThread; + +} // NActors diff --git a/library/cpp/actors/interconnect/poller_tcp.cpp b/library/cpp/actors/interconnect/poller_tcp.cpp new file mode 100644 index 0000000000..8267df31ea --- /dev/null +++ b/library/cpp/actors/interconnect/poller_tcp.cpp @@ -0,0 +1,35 @@ +#include "poller_tcp.h" + +namespace NInterconnect { + TPollerThreads::TPollerThreads(size_t units, bool useSelect) + : Units(units) + { + Y_VERIFY_DEBUG(!Units.empty()); + for (auto& unit : Units) + unit = TPollerUnit::Make(useSelect); + } + + TPollerThreads::~TPollerThreads() { + } + + void TPollerThreads::Start() { + for (const auto& unit : Units) + unit->Start(); + } + + void TPollerThreads::Stop() { + for (const auto& unit : Units) + unit->Stop(); + } + + void TPollerThreads::StartRead(const TIntrusivePtr<TSharedDescriptor>& s, TFDDelegate&& operation) { + auto& unit = Units[THash<SOCKET>()(s->GetDescriptor()) % Units.size()]; + unit->StartReadOperation(s, 
std::move(operation)); + } + + void TPollerThreads::StartWrite(const TIntrusivePtr<TSharedDescriptor>& s, TFDDelegate&& operation) { + auto& unit = Units[THash<SOCKET>()(s->GetDescriptor()) % Units.size()]; + unit->StartWriteOperation(s, std::move(operation)); + } + +} diff --git a/library/cpp/actors/interconnect/poller_tcp.h b/library/cpp/actors/interconnect/poller_tcp.h new file mode 100644 index 0000000000..310265eccd --- /dev/null +++ b/library/cpp/actors/interconnect/poller_tcp.h @@ -0,0 +1,25 @@ +#pragma once + +#include "poller_tcp_unit.h" +#include "poller.h" + +#include <util/generic/vector.h> +#include <util/generic/hash.h> + +namespace NInterconnect { + class TPollerThreads: public NActors::IPoller { + public: + TPollerThreads(size_t units = 1U, bool useSelect = false); + ~TPollerThreads(); + + void Start(); + void Stop(); + + void StartRead(const TIntrusivePtr<TSharedDescriptor>& s, TFDDelegate&& operation) override; + void StartWrite(const TIntrusivePtr<TSharedDescriptor>& s, TFDDelegate&& operation) override; + + private: + TVector<TPollerUnit::TPtr> Units; + }; + +} diff --git a/library/cpp/actors/interconnect/poller_tcp_unit.cpp b/library/cpp/actors/interconnect/poller_tcp_unit.cpp new file mode 100644 index 0000000000..59e7dda810 --- /dev/null +++ b/library/cpp/actors/interconnect/poller_tcp_unit.cpp @@ -0,0 +1,126 @@ +#include "poller_tcp_unit.h" + +#if !defined(_win_) && !defined(_darwin_) +#include "poller_tcp_unit_epoll.h" +#endif + +#include "poller_tcp_unit_select.h" +#include "poller.h" + +#include <library/cpp/actors/prof/tag.h> +#include <library/cpp/actors/util/intrinsics.h> + +#if defined _linux_ +#include <pthread.h> +#endif + +namespace NInterconnect { + TPollerUnit::TPtr + TPollerUnit::Make(bool useSelect) { +#if defined(_win_) || defined(_darwin_) + Y_UNUSED(useSelect); + return TPtr(new TPollerUnitSelect); +#else + return useSelect ? 
TPtr(new TPollerUnitSelect) : TPtr(new TPollerUnitEpoll); +#endif + } + + TPollerUnit::TPollerUnit() + : StopFlag(true) + , ReadLoop(TThread::TParams(IdleThread<false>, this).SetName("network read")) + , WriteLoop(TThread::TParams(IdleThread<true>, this).SetName("network write")) + { + } + + TPollerUnit::~TPollerUnit() { + if (!AtomicLoad(&StopFlag)) + Stop(); + } + + void + TPollerUnit::Start() { + AtomicStore(&StopFlag, false); + ReadLoop.Start(); + WriteLoop.Start(); + } + + void + TPollerUnit::Stop() { + AtomicStore(&StopFlag, true); + ReadLoop.Join(); + WriteLoop.Join(); + } + + template <> + TPollerUnit::TSide& + TPollerUnit::GetSide<false>() { + return Read; + } + + template <> + TPollerUnit::TSide& + TPollerUnit::GetSide<true>() { + return Write; + } + + void + TPollerUnit::StartReadOperation( + const TIntrusivePtr<TSharedDescriptor>& stream, + TFDDelegate&& operation) { + Y_VERIFY_DEBUG(stream); + if (AtomicLoad(&StopFlag)) + return; + GetSide<false>().InputQueue.Push(TSide::TItem(stream, std::move(operation))); + } + + void + TPollerUnit::StartWriteOperation( + const TIntrusivePtr<TSharedDescriptor>& stream, + TFDDelegate&& operation) { + Y_VERIFY_DEBUG(stream); + if (AtomicLoad(&StopFlag)) + return; + GetSide<true>().InputQueue.Push(TSide::TItem(stream, std::move(operation))); + } + + template <bool IsWrite> + void* + TPollerUnit::IdleThread(void* param) { + // TODO: musl-libc version of `sched_param` struct is for some reason different from pthread + // version in Ubuntu 12.04 +#if defined(_linux_) && !defined(_musl_) + pthread_t threadSelf = pthread_self(); + sched_param sparam = {20}; + pthread_setschedparam(threadSelf, SCHED_FIFO, &sparam); +#endif + + static_cast<TPollerUnit*>(param)->RunLoop<IsWrite>(); + return nullptr; + } + + template <> + void + TPollerUnit::RunLoop<false>() { + NProfiling::TMemoryTagScope tag("INTERCONNECT_RECEIVED_DATA"); + while (!AtomicLoad(&StopFlag)) + ProcessRead(); + } + + template <> + void + 
TPollerUnit::RunLoop<true>() { + NProfiling::TMemoryTagScope tag("INTERCONNECT_SEND_DATA"); + while (!AtomicLoad(&StopFlag)) + ProcessWrite(); + } + + void + TPollerUnit::TSide::ProcessInput() { + if (!InputQueue.IsEmpty()) + do { + auto sock = InputQueue.Top().first->GetDescriptor(); + if (!Operations.emplace(sock, std::move(InputQueue.Top())).second) + Y_FAIL("Descriptor is already in pooler."); + } while (InputQueue.Pop()); + } +} diff --git a/library/cpp/actors/interconnect/poller_tcp_unit.h b/library/cpp/actors/interconnect/poller_tcp_unit.h new file mode 100644 index 0000000000..692168b968 --- /dev/null +++ b/library/cpp/actors/interconnect/poller_tcp_unit.h @@ -0,0 +1,67 @@ +#pragma once + +#include <util/system/thread.h> +#include <library/cpp/actors/util/funnel_queue.h> + +#include "interconnect_stream.h" + +#include <memory> +#include <functional> +#include <unordered_map> + +namespace NInterconnect { + using NActors::TFDDelegate; + using NActors::TSharedDescriptor; + + class TPollerUnit { + public: + typedef std::unique_ptr<TPollerUnit> TPtr; + + static TPtr Make(bool useSelect); + + void Start(); + void Stop(); + + virtual void StartReadOperation( + const TIntrusivePtr<TSharedDescriptor>& stream, + TFDDelegate&& operation); + + virtual void StartWriteOperation( + const TIntrusivePtr<TSharedDescriptor>& stream, + TFDDelegate&& operation); + + virtual ~TPollerUnit(); + + private: + virtual void ProcessRead() = 0; + virtual void ProcessWrite() = 0; + + template <bool IsWrite> + static void* IdleThread(void* param); + + template <bool IsWrite> + void RunLoop(); + + volatile bool StopFlag; + TThread ReadLoop, WriteLoop; + + protected: + TPollerUnit(); + + struct TSide { + using TOperations = + std::unordered_map<SOCKET, + std::pair<TIntrusivePtr<TSharedDescriptor>, TFDDelegate>>; + + TOperations Operations; + using TItem = TOperations::mapped_type; + TFunnelQueue<TItem> InputQueue; + + void ProcessInput(); + } Read, Write; + + template <bool IsWrite> + 
TSide& GetSide(); + }; + +} diff --git a/library/cpp/actors/interconnect/poller_tcp_unit_epoll.cpp b/library/cpp/actors/interconnect/poller_tcp_unit_epoll.cpp new file mode 100644 index 0000000000..c78538b95b --- /dev/null +++ b/library/cpp/actors/interconnect/poller_tcp_unit_epoll.cpp @@ -0,0 +1,125 @@ +#include "poller_tcp_unit_epoll.h" +#if !defined(_win_) && !defined(_darwin_) +#include <unistd.h> +#include <sys/epoll.h> + +#include <csignal> +#include <cerrno> +#include <cstring> + +namespace NInterconnect { + namespace { + void + DeleteEpoll(int epoll, SOCKET stream) { + ::epoll_event event = {0, {.fd = stream}}; + if (::epoll_ctl(epoll, EPOLL_CTL_DEL, stream, &event)) { + Cerr << "epoll_ctl errno: " << errno << Endl; + Y_FAIL("epoll delete error!"); + } + } + + template <ui32 Events> + void + AddEpoll(int epoll, SOCKET stream) { + ::epoll_event event = {.events = Events}; + event.data.fd = stream; + if (::epoll_ctl(epoll, EPOLL_CTL_ADD, stream, &event)) { + Cerr << "epoll_ctl errno: " << errno << Endl; + Y_FAIL("epoll add error!"); + } + } + + int + Initialize() { + const auto epoll = ::epoll_create(10000); + Y_VERIFY_DEBUG(epoll > 0); + return epoll; + } + + } + + TPollerUnitEpoll::TPollerUnitEpoll() + : ReadDescriptor(Initialize()) + , WriteDescriptor(Initialize()) + { + // Block on the epoll descriptor. 
+ ::sigemptyset(&sigmask); + ::sigaddset(&sigmask, SIGPIPE); + ::sigaddset(&sigmask, SIGTERM); + } + + TPollerUnitEpoll::~TPollerUnitEpoll() { + ::close(ReadDescriptor); + ::close(WriteDescriptor); + } + + template <> + int TPollerUnitEpoll::GetDescriptor<false>() const { + return ReadDescriptor; + } + + template <> + int TPollerUnitEpoll::GetDescriptor<true>() const { + return WriteDescriptor; + } + + void + TPollerUnitEpoll::StartReadOperation( + const TIntrusivePtr<TSharedDescriptor>& s, + TFDDelegate&& operation) { + TPollerUnit::StartReadOperation(s, std::move(operation)); + AddEpoll<EPOLLRDHUP | EPOLLIN>(ReadDescriptor, s->GetDescriptor()); + } + + void + TPollerUnitEpoll::StartWriteOperation( + const TIntrusivePtr<TSharedDescriptor>& s, + TFDDelegate&& operation) { + TPollerUnit::StartWriteOperation(s, std::move(operation)); + AddEpoll<EPOLLRDHUP | EPOLLOUT>(WriteDescriptor, s->GetDescriptor()); + } + + constexpr int EVENTS_BUF_SIZE = 128; + + template <bool WriteOp> + void + TPollerUnitEpoll::Process() { + ::epoll_event events[EVENTS_BUF_SIZE]; + + const int epoll = GetDescriptor<WriteOp>(); + + /* Timeout just to check StopFlag sometimes */ + const int result = + ::epoll_pwait(epoll, events, EVENTS_BUF_SIZE, 200, &sigmask); + + if (result == -1 && errno != EINTR) + Y_FAIL("epoll wait error!"); + + auto& side = GetSide<WriteOp>(); + side.ProcessInput(); + + for (int i = 0; i < result; ++i) { + const auto it = side.Operations.find(events[i].data.fd); + if (side.Operations.end() == it) + continue; + if (const auto& finalizer = it->second.second(it->second.first)) { + DeleteEpoll(epoll, it->first); + side.Operations.erase(it); + finalizer(); + } + } + } + + void + TPollerUnitEpoll::ProcessRead() { + Process<false>(); + } + + void + TPollerUnitEpoll::ProcessWrite() { + Process<true>(); + } + +} + +#endif diff --git a/library/cpp/actors/interconnect/poller_tcp_unit_epoll.h b/library/cpp/actors/interconnect/poller_tcp_unit_epoll.h new file mode 100644 index 
0000000000..ff7893eba2 --- /dev/null +++ b/library/cpp/actors/interconnect/poller_tcp_unit_epoll.h @@ -0,0 +1,33 @@ +#pragma once + +#include "poller_tcp_unit.h" + +namespace NInterconnect { + class TPollerUnitEpoll: public TPollerUnit { + public: + TPollerUnitEpoll(); + virtual ~TPollerUnitEpoll(); + + private: + virtual void StartReadOperation( + const TIntrusivePtr<TSharedDescriptor>& s, + TFDDelegate&& operation) override; + + virtual void StartWriteOperation( + const TIntrusivePtr<TSharedDescriptor>& s, + TFDDelegate&& operation) override; + + virtual void ProcessRead() override; + virtual void ProcessWrite() override; + + template <bool Write> + void Process(); + + template <bool Write> + int GetDescriptor() const; + + const int ReadDescriptor, WriteDescriptor; + ::sigset_t sigmask; + }; + +} diff --git a/library/cpp/actors/interconnect/poller_tcp_unit_select.cpp b/library/cpp/actors/interconnect/poller_tcp_unit_select.cpp new file mode 100644 index 0000000000..ae7aaad566 --- /dev/null +++ b/library/cpp/actors/interconnect/poller_tcp_unit_select.cpp @@ -0,0 +1,86 @@ +#include "poller_tcp_unit_select.h" + +#include <csignal> + +#if defined(_win_) +#include <winsock2.h> +#define SOCKET_ERROR_SOURCE ::WSAGetLastError() +#elif defined(_darwin_) +#include <cerrno> +#define SOCKET_ERROR_SOURCE errno +typedef timeval TIMEVAL; +#else +#include <cerrno> +#define SOCKET_ERROR_SOURCE errno +#endif + +namespace NInterconnect { + TPollerUnitSelect::TPollerUnitSelect() { + } + + TPollerUnitSelect::~TPollerUnitSelect() { + } + + template <bool IsWrite> + void + TPollerUnitSelect::Process() { + auto& side = GetSide<IsWrite>(); + side.ProcessInput(); + + enum : size_t { R, + W, + E }; + static const auto O = IsWrite ? 
W : R; + + ::fd_set sets[3]; + + FD_ZERO(&sets[R]); + FD_ZERO(&sets[W]); + FD_ZERO(&sets[E]); + + for (const auto& operation : side.Operations) { + FD_SET(operation.first, &sets[O]); + FD_SET(operation.first, &sets[E]); + } + +#if defined(_win_) + ::TIMEVAL timeout = {0L, 99991L}; + const auto numberEvents = !side.Operations.empty() ? ::select(FD_SETSIZE, &sets[R], &sets[W], &sets[E], &timeout) + : (::Sleep(100), 0); +#elif defined(_darwin_) + ::TIMEVAL timeout = {0L, 99991L}; + const auto numberEvents = ::select(FD_SETSIZE, &sets[R], &sets[W], &sets[E], &timeout); +#else + ::sigset_t sigmask; + ::sigemptyset(&sigmask); + ::sigaddset(&sigmask, SIGPIPE); + ::sigaddset(&sigmask, SIGTERM); + + struct ::timespec timeout = {0L, 99999989L}; + const auto numberEvents = ::pselect(FD_SETSIZE, &sets[R], &sets[W], &sets[E], &timeout, &sigmask); +#endif + + Y_VERIFY_DEBUG(numberEvents >= 0); + + for (auto it = side.Operations.cbegin(); side.Operations.cend() != it;) { + if (FD_ISSET(it->first, &sets[O]) || FD_ISSET(it->first, &sets[E])) + if (const auto& finalizer = it->second.second(it->second.first)) { + side.Operations.erase(it++); + finalizer(); + continue; + } + ++it; + } + } + + void + TPollerUnitSelect::ProcessRead() { + Process<false>(); + } + + void + TPollerUnitSelect::ProcessWrite() { + Process<true>(); + } + +} diff --git a/library/cpp/actors/interconnect/poller_tcp_unit_select.h b/library/cpp/actors/interconnect/poller_tcp_unit_select.h new file mode 100644 index 0000000000..0c15217796 --- /dev/null +++ b/library/cpp/actors/interconnect/poller_tcp_unit_select.h @@ -0,0 +1,19 @@ +#pragma once + +#include "poller_tcp_unit.h" + +namespace NInterconnect { + class TPollerUnitSelect: public TPollerUnit { + public: + TPollerUnitSelect(); + virtual ~TPollerUnitSelect(); + + private: + virtual void ProcessRead() override; + virtual void ProcessWrite() override; + + template <bool IsWrite> + void Process(); + }; + +} diff --git a/library/cpp/actors/interconnect/profiler.h 
b/library/cpp/actors/interconnect/profiler.h new file mode 100644 index 0000000000..77a59e3179 --- /dev/null +++ b/library/cpp/actors/interconnect/profiler.h @@ -0,0 +1,142 @@ +#pragma once + +#include <library/cpp/actors/util/datetime.h> + +namespace NActors { + + class TProfiled { + enum class EType : ui32 { + ENTRY, + EXIT, + }; + + struct TItem { + EType Type; // entry kind + int Line; + const char *Marker; // name of the profiled function/part + ui64 Timestamp; // cycles + }; + + bool Enable = false; + mutable TDeque<TItem> Items; + + friend class TFunction; + + public: + class TFunction { + const TProfiled& Profiled; + + public: + TFunction(const TProfiled& profiled, const char *name, int line) + : Profiled(profiled) + { + Log(EType::ENTRY, name, line); + } + + ~TFunction() { + Log(EType::EXIT, nullptr, 0); + } + + private: + void Log(EType type, const char *marker, int line) { + if (Profiled.Enable) { + Profiled.Items.push_back(TItem{ + type, + line, + marker, + GetCycleCountFast() + }); + } + } + }; + + public: + void Start() { + Enable = true; + } + + void Finish() { + Items.clear(); + Enable = false; + } + + TDuration Duration() const { + return CyclesToDuration(Items ? 
Items.back().Timestamp - Items.front().Timestamp : 0); + } + + TString Format() const { + TDeque<TItem>::iterator it = Items.begin(); + TString res = FormatLevel(it); + Y_VERIFY(it == Items.end()); + return res; + } + + private: + TString FormatLevel(TDeque<TItem>::iterator& it) const { + struct TRecord { + TString Marker; + ui64 Duration; + TString Interior; + + bool operator <(const TRecord& other) const { + return Duration < other.Duration; + } + }; + TVector<TRecord> records; + + while (it != Items.end() && it->Type != EType::EXIT) { + Y_VERIFY(it->Type == EType::ENTRY); + const TString marker = Sprintf("%s:%d", it->Marker, it->Line); + const ui64 begin = it->Timestamp; + ++it; + const TString interior = FormatLevel(it); + Y_VERIFY(it != Items.end()); + Y_VERIFY(it->Type == EType::EXIT); + const ui64 end = it->Timestamp; + records.push_back(TRecord{marker, end - begin, interior}); + ++it; + } + + TStringStream s; + const ui64 cyclesPerMs = GetCyclesPerMillisecond(); + + if (records.size() <= 10) { + bool first = true; + for (const TRecord& record : records) { + if (first) { + first = false; + } else { + s << " "; + } + s << record.Marker << "(" << (record.Duration * 1000000 / cyclesPerMs) << "ns)"; + if (record.Interior) { + s << " {" << record.Interior << "}"; + } + } + } else { + TMap<TString, TVector<TRecord>> m; + for (TRecord& r : records) { + const TString key = r.Marker; + m[key].push_back(std::move(r)); + } + + s << "unordered "; + for (auto& [key, value] : m) { + auto i = std::max_element(value.begin(), value.end()); + ui64 sum = 0; + for (const auto& item : value) { + sum += item.Duration; + } + sum = sum * 1000000 / cyclesPerMs; + s << key << " num# " << value.size() << " sum# " << sum << "ns max# " << (i->Duration * 1000000 / cyclesPerMs) << "ns"; + if (i->Interior) { + s << " {" << i->Interior << "}"; + } + } + } + + return s.Str(); + } + }; + +} // NActors diff --git a/library/cpp/actors/interconnect/slowpoke_actor.h 
b/library/cpp/actors/interconnect/slowpoke_actor.h new file mode 100644 index 0000000000..4b02e5da48 --- /dev/null +++ b/library/cpp/actors/interconnect/slowpoke_actor.h @@ -0,0 +1,47 @@ +#pragma once + +#include <library/cpp/actors/core/actor_bootstrapped.h> + +namespace NActors { + + class TSlowpokeActor : public TActorBootstrapped<TSlowpokeActor> { + const TDuration Duration; + const TDuration SleepMin; + const TDuration SleepMax; + const TDuration RescheduleMin; + const TDuration RescheduleMax; + + public: + static constexpr NKikimrServices::TActivity::EType ActorActivityType() { + return NKikimrServices::TActivity::INTERCONNECT_COMMON; + } + + TSlowpokeActor(TDuration duration, TDuration sleepMin, TDuration sleepMax, TDuration rescheduleMin, TDuration rescheduleMax) + : Duration(duration) + , SleepMin(sleepMin) + , SleepMax(sleepMax) + , RescheduleMin(rescheduleMin) + , RescheduleMax(rescheduleMax) + {} + + void Bootstrap(const TActorContext& ctx) { + Become(&TThis::StateFunc, ctx, Duration, new TEvents::TEvPoisonPill); + HandleWakeup(ctx); + } + + void HandleWakeup(const TActorContext& ctx) { + Sleep(RandomDuration(SleepMin, SleepMax)); + ctx.Schedule(RandomDuration(RescheduleMin, RescheduleMax), new TEvents::TEvWakeup); + } + + static TDuration RandomDuration(TDuration min, TDuration max) { + return min + TDuration::FromValue(RandomNumber<ui64>(max.GetValue() - min.GetValue() + 1)); + } + + STRICT_STFUNC(StateFunc, + CFunc(TEvents::TSystem::PoisonPill, Die) + CFunc(TEvents::TSystem::Wakeup, HandleWakeup) + ) + }; + +} // NActors diff --git a/library/cpp/actors/interconnect/types.cpp b/library/cpp/actors/interconnect/types.cpp new file mode 100644 index 0000000000..979c55f277 --- /dev/null +++ b/library/cpp/actors/interconnect/types.cpp @@ -0,0 +1,564 @@ +#include "types.h" +#include <util/string/printf.h> +#include <util/generic/vector.h> +#include <errno.h> + +namespace NActors { + + TVector<const char*> TDisconnectReason::Reasons = { + "EndOfStream", + 
"CloseOnIdle", + "LostConnection", + "DeadPeer", + "NewSession", + "HandshakeFailTransient", + "HandshakeFailPermanent", + "UserRequest", + "Debug", + "ChecksumError", + "FormatError", + "EventTooLarge", + "QueueOverload", + "E2BIG", + "EACCES", + "EADDRINUSE", + "EADDRNOTAVAIL", + "EADV", + "EAFNOSUPPORT", + "EAGAIN", + "EALREADY", + "EBADE", + "EBADF", + "EBADFD", + "EBADMSG", + "EBADR", + "EBADRQC", + "EBADSLT", + "EBFONT", + "EBUSY", + "ECANCELED", + "ECHILD", + "ECHRNG", + "ECOMM", + "ECONNABORTED", + "ECONNREFUSED", + "ECONNRESET", + "EDEADLK", + "EDEADLOCK", + "EDESTADDRREQ", + "EDOM", + "EDOTDOT", + "EDQUOT", + "EEXIST", + "EFAULT", + "EFBIG", + "EHOSTDOWN", + "EHOSTUNREACH", + "EHWPOISON", + "EIDRM", + "EILSEQ", + "EINPROGRESS", + "EINTR", + "EINVAL", + "EIO", + "EISCONN", + "EISDIR", + "EISNAM", + "EKEYEXPIRED", + "EKEYREJECTED", + "EKEYREVOKED", + "EL2HLT", + "EL2NSYNC", + "EL3HLT", + "EL3RST", + "ELIBACC", + "ELIBBAD", + "ELIBEXEC", + "ELIBMAX", + "ELIBSCN", + "ELNRNG", + "ELOOP", + "EMEDIUMTYPE", + "EMFILE", + "EMLINK", + "EMSGSIZE", + "EMULTIHOP", + "ENAMETOOLONG", + "ENAVAIL", + "ENETDOWN", + "ENETRESET", + "ENETUNREACH", + "ENFILE", + "ENOANO", + "ENOBUFS", + "ENOCSI", + "ENODATA", + "ENODEV", + "ENOENT", + "ENOEXEC", + "ENOKEY", + "ENOLCK", + "ENOLINK", + "ENOMEDIUM", + "ENOMEM", + "ENOMSG", + "ENONET", + "ENOPKG", + "ENOPROTOOPT", + "ENOSPC", + "ENOSR", + "ENOSTR", + "ENOSYS", + "ENOTBLK", + "ENOTCONN", + "ENOTDIR", + "ENOTEMPTY", + "ENOTNAM", + "ENOTRECOVERABLE", + "ENOTSOCK", + "ENOTTY", + "ENOTUNIQ", + "ENXIO", + "EOPNOTSUPP", + "EOVERFLOW", + "EOWNERDEAD", + "EPERM", + "EPFNOSUPPORT", + "EPIPE", + "EPROTO", + "EPROTONOSUPPORT", + "EPROTOTYPE", + "ERANGE", + "EREMCHG", + "EREMOTE", + "EREMOTEIO", + "ERESTART", + "ERFKILL", + "EROFS", + "ESHUTDOWN", + "ESOCKTNOSUPPORT", + "ESPIPE", + "ESRCH", + "ESRMNT", + "ESTALE", + "ESTRPIPE", + "ETIME", + "ETIMEDOUT", + "ETOOMANYREFS", + "ETXTBSY", + "EUCLEAN", + "EUNATCH", + "EUSERS", + "EWOULDBLOCK", + 
"EXDEV", + "EXFULL", + }; + + TDisconnectReason TDisconnectReason::FromErrno(int err) { + switch (err) { +#define REASON(ERRNO) case ERRNO: return TDisconnectReason(TString(#ERRNO)) +#if defined(E2BIG) + REASON(E2BIG); +#endif +#if defined(EACCES) + REASON(EACCES); +#endif +#if defined(EADDRINUSE) + REASON(EADDRINUSE); +#endif +#if defined(EADDRNOTAVAIL) + REASON(EADDRNOTAVAIL); +#endif +#if defined(EADV) + REASON(EADV); +#endif +#if defined(EAFNOSUPPORT) + REASON(EAFNOSUPPORT); +#endif +#if defined(EAGAIN) + REASON(EAGAIN); +#endif +#if defined(EALREADY) + REASON(EALREADY); +#endif +#if defined(EBADE) + REASON(EBADE); +#endif +#if defined(EBADF) + REASON(EBADF); +#endif +#if defined(EBADFD) + REASON(EBADFD); +#endif +#if defined(EBADMSG) + REASON(EBADMSG); +#endif +#if defined(EBADR) + REASON(EBADR); +#endif +#if defined(EBADRQC) + REASON(EBADRQC); +#endif +#if defined(EBADSLT) + REASON(EBADSLT); +#endif +#if defined(EBFONT) + REASON(EBFONT); +#endif +#if defined(EBUSY) + REASON(EBUSY); +#endif +#if defined(ECANCELED) + REASON(ECANCELED); +#endif +#if defined(ECHILD) + REASON(ECHILD); +#endif +#if defined(ECHRNG) + REASON(ECHRNG); +#endif +#if defined(ECOMM) + REASON(ECOMM); +#endif +#if defined(ECONNABORTED) + REASON(ECONNABORTED); +#endif +#if defined(ECONNREFUSED) + REASON(ECONNREFUSED); +#endif +#if defined(ECONNRESET) + REASON(ECONNRESET); +#endif +#if defined(EDEADLK) + REASON(EDEADLK); +#endif +#if defined(EDEADLOCK) && (!defined(EDEADLK) || EDEADLOCK != EDEADLK) + REASON(EDEADLOCK); +#endif +#if defined(EDESTADDRREQ) + REASON(EDESTADDRREQ); +#endif +#if defined(EDOM) + REASON(EDOM); +#endif +#if defined(EDOTDOT) + REASON(EDOTDOT); +#endif +#if defined(EDQUOT) + REASON(EDQUOT); +#endif +#if defined(EEXIST) + REASON(EEXIST); +#endif +#if defined(EFAULT) + REASON(EFAULT); +#endif +#if defined(EFBIG) + REASON(EFBIG); +#endif +#if defined(EHOSTDOWN) + REASON(EHOSTDOWN); +#endif +#if defined(EHOSTUNREACH) + REASON(EHOSTUNREACH); +#endif +#if defined(EHWPOISON) + 
REASON(EHWPOISON); +#endif +#if defined(EIDRM) + REASON(EIDRM); +#endif +#if defined(EILSEQ) + REASON(EILSEQ); +#endif +#if defined(EINPROGRESS) + REASON(EINPROGRESS); +#endif +#if defined(EINTR) + REASON(EINTR); +#endif +#if defined(EINVAL) + REASON(EINVAL); +#endif +#if defined(EIO) + REASON(EIO); +#endif +#if defined(EISCONN) + REASON(EISCONN); +#endif +#if defined(EISDIR) + REASON(EISDIR); +#endif +#if defined(EISNAM) + REASON(EISNAM); +#endif +#if defined(EKEYEXPIRED) + REASON(EKEYEXPIRED); +#endif +#if defined(EKEYREJECTED) + REASON(EKEYREJECTED); +#endif +#if defined(EKEYREVOKED) + REASON(EKEYREVOKED); +#endif +#if defined(EL2HLT) + REASON(EL2HLT); +#endif +#if defined(EL2NSYNC) + REASON(EL2NSYNC); +#endif +#if defined(EL3HLT) + REASON(EL3HLT); +#endif +#if defined(EL3RST) + REASON(EL3RST); +#endif +#if defined(ELIBACC) + REASON(ELIBACC); +#endif +#if defined(ELIBBAD) + REASON(ELIBBAD); +#endif +#if defined(ELIBEXEC) + REASON(ELIBEXEC); +#endif +#if defined(ELIBMAX) + REASON(ELIBMAX); +#endif +#if defined(ELIBSCN) + REASON(ELIBSCN); +#endif +#if defined(ELNRNG) + REASON(ELNRNG); +#endif +#if defined(ELOOP) + REASON(ELOOP); +#endif +#if defined(EMEDIUMTYPE) + REASON(EMEDIUMTYPE); +#endif +#if defined(EMFILE) + REASON(EMFILE); +#endif +#if defined(EMLINK) + REASON(EMLINK); +#endif +#if defined(EMSGSIZE) + REASON(EMSGSIZE); +#endif +#if defined(EMULTIHOP) + REASON(EMULTIHOP); +#endif +#if defined(ENAMETOOLONG) + REASON(ENAMETOOLONG); +#endif +#if defined(ENAVAIL) + REASON(ENAVAIL); +#endif +#if defined(ENETDOWN) + REASON(ENETDOWN); +#endif +#if defined(ENETRESET) + REASON(ENETRESET); +#endif +#if defined(ENETUNREACH) + REASON(ENETUNREACH); +#endif +#if defined(ENFILE) + REASON(ENFILE); +#endif +#if defined(ENOANO) + REASON(ENOANO); +#endif +#if defined(ENOBUFS) + REASON(ENOBUFS); +#endif +#if defined(ENOCSI) + REASON(ENOCSI); +#endif +#if defined(ENODATA) + REASON(ENODATA); +#endif +#if defined(ENODEV) + REASON(ENODEV); +#endif +#if defined(ENOENT) + 
REASON(ENOENT); +#endif +#if defined(ENOEXEC) + REASON(ENOEXEC); +#endif +#if defined(ENOKEY) + REASON(ENOKEY); +#endif +#if defined(ENOLCK) + REASON(ENOLCK); +#endif +#if defined(ENOLINK) + REASON(ENOLINK); +#endif +#if defined(ENOMEDIUM) + REASON(ENOMEDIUM); +#endif +#if defined(ENOMEM) + REASON(ENOMEM); +#endif +#if defined(ENOMSG) + REASON(ENOMSG); +#endif +#if defined(ENONET) + REASON(ENONET); +#endif +#if defined(ENOPKG) + REASON(ENOPKG); +#endif +#if defined(ENOPROTOOPT) + REASON(ENOPROTOOPT); +#endif +#if defined(ENOSPC) + REASON(ENOSPC); +#endif +#if defined(ENOSR) + REASON(ENOSR); +#endif +#if defined(ENOSTR) + REASON(ENOSTR); +#endif +#if defined(ENOSYS) + REASON(ENOSYS); +#endif +#if defined(ENOTBLK) + REASON(ENOTBLK); +#endif +#if defined(ENOTCONN) + REASON(ENOTCONN); +#endif +#if defined(ENOTDIR) + REASON(ENOTDIR); +#endif +#if defined(ENOTEMPTY) + REASON(ENOTEMPTY); +#endif +#if defined(ENOTNAM) + REASON(ENOTNAM); +#endif +#if defined(ENOTRECOVERABLE) + REASON(ENOTRECOVERABLE); +#endif +#if defined(ENOTSOCK) + REASON(ENOTSOCK); +#endif +#if defined(ENOTTY) + REASON(ENOTTY); +#endif +#if defined(ENOTUNIQ) + REASON(ENOTUNIQ); +#endif +#if defined(ENXIO) + REASON(ENXIO); +#endif +#if defined(EOPNOTSUPP) + REASON(EOPNOTSUPP); +#endif +#if defined(EOVERFLOW) + REASON(EOVERFLOW); +#endif +#if defined(EOWNERDEAD) + REASON(EOWNERDEAD); +#endif +#if defined(EPERM) + REASON(EPERM); +#endif +#if defined(EPFNOSUPPORT) + REASON(EPFNOSUPPORT); +#endif +#if defined(EPIPE) + REASON(EPIPE); +#endif +#if defined(EPROTO) + REASON(EPROTO); +#endif +#if defined(EPROTONOSUPPORT) + REASON(EPROTONOSUPPORT); +#endif +#if defined(EPROTOTYPE) + REASON(EPROTOTYPE); +#endif +#if defined(ERANGE) + REASON(ERANGE); +#endif +#if defined(EREMCHG) + REASON(EREMCHG); +#endif +#if defined(EREMOTE) + REASON(EREMOTE); +#endif +#if defined(EREMOTEIO) + REASON(EREMOTEIO); +#endif +#if defined(ERESTART) + REASON(ERESTART); +#endif +#if defined(ERFKILL) + REASON(ERFKILL); +#endif +#if 
defined(EROFS) + REASON(EROFS); +#endif +#if defined(ESHUTDOWN) + REASON(ESHUTDOWN); +#endif +#if defined(ESOCKTNOSUPPORT) + REASON(ESOCKTNOSUPPORT); +#endif +#if defined(ESPIPE) + REASON(ESPIPE); +#endif +#if defined(ESRCH) + REASON(ESRCH); +#endif +#if defined(ESRMNT) + REASON(ESRMNT); +#endif +#if defined(ESTALE) + REASON(ESTALE); +#endif +#if defined(ESTRPIPE) + REASON(ESTRPIPE); +#endif +#if defined(ETIME) + REASON(ETIME); +#endif +#if defined(ETIMEDOUT) + REASON(ETIMEDOUT); +#endif +#if defined(ETOOMANYREFS) + REASON(ETOOMANYREFS); +#endif +#if defined(ETXTBSY) + REASON(ETXTBSY); +#endif +#if defined(EUCLEAN) + REASON(EUCLEAN); +#endif +#if defined(EUNATCH) + REASON(EUNATCH); +#endif +#if defined(EUSERS) + REASON(EUSERS); +#endif +#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || EWOULDBLOCK != EAGAIN) + REASON(EWOULDBLOCK); +#endif +#if defined(EXDEV) + REASON(EXDEV); +#endif +#if defined(EXFULL) + REASON(EXFULL); +#endif + default: + return TDisconnectReason(Sprintf("errno=%d", errno)); + } + } + +} // NActors diff --git a/library/cpp/actors/interconnect/types.h b/library/cpp/actors/interconnect/types.h new file mode 100644 index 0000000000..2662c50c22 --- /dev/null +++ b/library/cpp/actors/interconnect/types.h @@ -0,0 +1,43 @@ +#pragma once + +#include <util/generic/string.h> + +namespace NActors { + + class TDisconnectReason { + TString Text; + + private: + explicit TDisconnectReason(TString text) + : Text(std::move(text)) + {} + + public: + TDisconnectReason() = default; + TDisconnectReason(const TDisconnectReason&) = default; + TDisconnectReason(TDisconnectReason&&) = default; + + static TDisconnectReason FromErrno(int err); + + static TDisconnectReason EndOfStream() { return TDisconnectReason("EndOfStream"); } + static TDisconnectReason CloseOnIdle() { return TDisconnectReason("CloseOnIdle"); } + static TDisconnectReason LostConnection() { return TDisconnectReason("LostConnection"); } + static TDisconnectReason DeadPeer() { return 
TDisconnectReason("DeadPeer"); } + static TDisconnectReason NewSession() { return TDisconnectReason("NewSession"); } + static TDisconnectReason HandshakeFailTransient() { return TDisconnectReason("HandshakeFailTransient"); } + static TDisconnectReason HandshakeFailPermanent() { return TDisconnectReason("HandshakeFailPermanent"); } + static TDisconnectReason UserRequest() { return TDisconnectReason("UserRequest"); } + static TDisconnectReason Debug() { return TDisconnectReason("Debug"); } + static TDisconnectReason ChecksumError() { return TDisconnectReason("ChecksumError"); } + static TDisconnectReason FormatError() { return TDisconnectReason("FormatError"); } + static TDisconnectReason EventTooLarge() { return TDisconnectReason("EventTooLarge"); } + static TDisconnectReason QueueOverload() { return TDisconnectReason("QueueOverload"); } + + TString ToString() const { + return Text; + } + + static TVector<const char*> Reasons; + }; + +} // NActors diff --git a/library/cpp/actors/interconnect/ut/channel_scheduler_ut.cpp b/library/cpp/actors/interconnect/ut/channel_scheduler_ut.cpp new file mode 100644 index 0000000000..565a511859 --- /dev/null +++ b/library/cpp/actors/interconnect/ut/channel_scheduler_ut.cpp @@ -0,0 +1,115 @@ +#include <library/cpp/actors/interconnect/channel_scheduler.h> +#include <library/cpp/actors/interconnect/events_local.h> +#include <library/cpp/testing/unittest/registar.h> + +using namespace NActors; + +Y_UNIT_TEST_SUITE(ChannelScheduler) { + + Y_UNIT_TEST(PriorityTraffic) { + auto common = MakeIntrusive<TInterconnectProxyCommon>(); + common->MonCounters = MakeIntrusive<NMonitoring::TDynamicCounters>(); + std::shared_ptr<IInterconnectMetrics> ctr = CreateInterconnectCounters(common); + ctr->SetPeerInfo("peer", "1"); + auto callback = [](THolder<IEventBase>) {}; + TEventHolderPool pool(common, callback); + TSessionParams p; + TChannelScheduler scheduler(1, {}, ctr, pool, 64 << 20, p); + + ui32 numEvents = 0; + + auto pushEvent = [&](size_t 
size, int channel) { + TString payload(size, 'X'); + auto ev = MakeHolder<IEventHandle>(1, 0, TActorId(), TActorId(), MakeIntrusive<TEventSerializedData>(payload, false), 0); + auto& ch = scheduler.GetOutputChannel(channel); + const bool wasWorking = ch.IsWorking(); + ch.Push(*ev); + if (!wasWorking) { + scheduler.AddToHeap(ch, 0); + } + ++numEvents; + }; + + for (ui32 i = 0; i < 100; ++i) { + pushEvent(10000, 1); + } + + for (ui32 i = 0; i < 1000; ++i) { + pushEvent(1000, 2); + } + + std::map<ui16, ui32> run; + ui32 step = 0; + + std::deque<std::map<ui16, ui32>> window; + + for (; numEvents; ++step) { + TTcpPacketOutTask task(p); + + if (step == 100) { + for (ui32 i = 0; i < 200; ++i) { + pushEvent(1000, 3); + } + } + + std::map<ui16, ui32> ch; + + while (numEvents) { + TEventOutputChannel *channel = scheduler.PickChannelWithLeastConsumedWeight(); + ui32 before = task.GetDataSize(); + ui64 weightConsumed = 0; + numEvents -= channel->FeedBuf(task, 0, &weightConsumed); + ui32 after = task.GetDataSize(); + Y_VERIFY(after >= before); + scheduler.FinishPick(weightConsumed, 0); + const ui32 bytesAdded = after - before; + if (!bytesAdded) { + break; + } + ch[channel->ChannelId] += bytesAdded; + } + + scheduler.Equalize(); + + for (const auto& [key, value] : ch) { + run[key] += value; + } + window.push_back(ch); + + if (window.size() == 32) { + for (const auto& [key, value] : window.front()) { + run[key] -= value; + if (!run[key]) { + run.erase(key); + } + } + window.pop_front(); + } + + double mean = 0.0; + for (const auto& [key, value] : run) { + mean += value; + } + mean /= run.size(); + + double dev = 0.0; + for (const auto& [key, value] : run) { + dev += (value - mean) * (value - mean); + } + dev = sqrt(dev / run.size()); + + double devToMean = dev / mean; + + Cerr << step << ": "; + for (const auto& [key, value] : run) { + Cerr << "ch" << key << "=" << value << " "; + } + Cerr << "mean# " << mean << " dev# " << dev << " part# " << devToMean; + + Cerr << Endl; + + 
UNIT_ASSERT(devToMean < 1); + } + } + +} diff --git a/library/cpp/actors/interconnect/ut/dynamic_proxy_ut.cpp b/library/cpp/actors/interconnect/ut/dynamic_proxy_ut.cpp new file mode 100644 index 0000000000..3c474979dc --- /dev/null +++ b/library/cpp/actors/interconnect/ut/dynamic_proxy_ut.cpp @@ -0,0 +1,179 @@ +#include <library/cpp/actors/interconnect/ut/lib/node.h> +#include <library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h> +#include <library/cpp/testing/unittest/registar.h> + +TActorId MakeResponderServiceId(ui32 nodeId) { + return TActorId(nodeId, TStringBuf("ResponderAct", 12)); +} + +class TArriveQueue { + struct TArrivedItem { + ui32 QueueId; + ui32 Index; + bool Success; + }; + + TMutex Lock; + std::size_t Counter = 0; + std::vector<TArrivedItem> Items; + +public: + TArriveQueue(size_t capacity) + : Items(capacity) + {} + + bool Done() const { + with_lock (Lock) { + return Counter == Items.size(); + } + } + + void Push(ui64 cookie, bool success) { + with_lock (Lock) { + const size_t pos = Counter++; + TArrivedItem item{.QueueId = static_cast<ui32>(cookie >> 32), .Index = static_cast<ui32>(cookie & 0xffff'ffff), + .Success = success}; + memcpy(&Items[pos], &item, sizeof(TArrivedItem)); + } + } + + void Check() { + struct TPerQueueState { + std::vector<ui32> Ok, Error; + }; + std::unordered_map<ui32, TPerQueueState> state; + for (const TArrivedItem& item : Items) { + auto& st = state[item.QueueId]; + auto& v = item.Success ? 
st.Ok : st.Error; + v.push_back(item.Index); + } + for (const auto& [queueId, st] : state) { + ui32 expected = 0; + for (const ui32 index : st.Ok) { + Y_VERIFY(index == expected); + ++expected; + } + for (const ui32 index : st.Error) { + Y_VERIFY(index == expected); + ++expected; + } + if (st.Error.size()) { + Cerr << "Error.size# " << st.Error.size() << Endl; + } + } + } +}; + +class TResponder : public TActor<TResponder> { + TArriveQueue& ArriveQueue; + +public: + TResponder(TArriveQueue& arriveQueue) + : TActor(&TResponder::StateFunc) + , ArriveQueue(arriveQueue) + {} + + STRICT_STFUNC(StateFunc, + hFunc(TEvents::TEvPing, Handle); + ) + + void Handle(TEvents::TEvPing::TPtr ev) { + ArriveQueue.Push(ev->Cookie, true); + } +}; + +class TSender : public TActor<TSender> { + TArriveQueue& ArriveQueue; + +public: + TSender(TArriveQueue& arriveQueue) + : TActor(&TThis::StateFunc) + , ArriveQueue(arriveQueue) + {} + + STRICT_STFUNC(StateFunc, + hFunc(TEvents::TEvUndelivered, Handle); + ) + + void Handle(TEvents::TEvUndelivered::TPtr ev) { + ArriveQueue.Push(ev->Cookie, false); + } +}; + +void SenderThread(TMutex& lock, TActorSystem *as, ui32 nodeId, ui32 queueId, ui32 count, TArriveQueue& arriveQueue) { + const TActorId sender = as->Register(new TSender(arriveQueue)); + with_lock(lock) {} + const TActorId target = MakeResponderServiceId(nodeId); + for (ui32 i = 0; i < count; ++i) { + const ui32 flags = IEventHandle::FlagTrackDelivery; + as->Send(new IEventHandle(TEvents::THelloWorld::Ping, flags, target, sender, nullptr, ((ui64)queueId << 32) | i)); + } +} + +void RaceTestIter(ui32 numThreads, ui32 count) { + TPortManager portman; + THashMap<ui32, ui16> nodeToPort; + const ui32 numNodes = 6; // total + const ui32 numDynamicNodes = 3; + for (ui32 i = 1; i <= numNodes; ++i) { + nodeToPort.emplace(i, portman.GetPort()); + } + + NMonitoring::TDynamicCounterPtr counters = new NMonitoring::TDynamicCounters; + std::list<TNode> nodes; + for (ui32 i = 1; i <= numNodes; ++i) { + 
nodes.emplace_back(i, numNodes, nodeToPort, "127.1.0.0", counters->GetSubgroup("nodeId", TStringBuilder() << i), + TDuration::Seconds(10), TChannelsConfig(), numDynamicNodes, numThreads); + } + + const ui32 numSenders = 10; + TArriveQueue arriveQueue(numSenders * numNodes * (numNodes - 1) * count); + for (TNode& node : nodes) { + node.RegisterServiceActor(MakeResponderServiceId(node.GetActorSystem()->NodeId), new TResponder(arriveQueue)); + } + + TMutex lock; + std::list<TThread> threads; + ui32 queueId = 0; + with_lock(lock) { + for (TNode& from : nodes) { + for (ui32 toId = 1; toId <= numNodes; ++toId) { + if (toId == from.GetActorSystem()->NodeId) { + continue; + } + for (ui32 i = 0; i < numSenders; ++i) { + threads.emplace_back([=, &lock, &from, &arriveQueue] { + SenderThread(lock, from.GetActorSystem(), toId, queueId, count, arriveQueue); + }); + ++queueId; + } + } + } + for (auto& thread : threads) { + thread.Start(); + } + } + for (auto& thread : threads) { + thread.Join(); + } + + for (THPTimer timer; !arriveQueue.Done(); TDuration::MilliSeconds(10)) { + Y_VERIFY(timer.Passed() < 10); + } + + nodes.clear(); + arriveQueue.Check(); +} + +Y_UNIT_TEST_SUITE(DynamicProxy) { + Y_UNIT_TEST(RaceCheck1) { + for (ui32 iteration = 0; iteration < 100; ++iteration) { + RaceTestIter(1 + iteration % 5, 1); + } + } + Y_UNIT_TEST(RaceCheck10) { + for (ui32 iteration = 0; iteration < 100; ++iteration) { + RaceTestIter(1 + iteration % 5, 10); + } + } +} diff --git a/library/cpp/actors/interconnect/ut/event_holder_pool_ut.cpp b/library/cpp/actors/interconnect/ut/event_holder_pool_ut.cpp new file mode 100644 index 0000000000..e6b2bd4e4c --- /dev/null +++ b/library/cpp/actors/interconnect/ut/event_holder_pool_ut.cpp @@ -0,0 +1,59 @@ +#include <library/cpp/testing/unittest/registar.h> +#include <library/cpp/actors/core/events.h> +#include <library/cpp/actors/core/event_local.h> +#include <library/cpp/actors/interconnect/interconnect_common.h> +#include 
<library/cpp/monlib/dynamic_counters/counters.h> +#include <library/cpp/actors/interconnect/event_holder_pool.h> + +#include <atomic> + +using namespace NActors; + +template<typename T> +TEventHolderPool Setup(T&& callback) { + auto common = MakeIntrusive<TInterconnectProxyCommon>(); + common->DestructorQueueSize = std::make_shared<std::atomic<TAtomicBase>>(); + common->MaxDestructorQueueSize = 1024 * 1024; + return TEventHolderPool(common, callback); +} + +Y_UNIT_TEST_SUITE(EventHolderPool) { + + Y_UNIT_TEST(Overflow) { + TDeque<THolder<IEventBase>> freeQ; + auto callback = [&](THolder<IEventBase> event) { + freeQ.push_back(std::move(event)); + }; + auto pool = Setup(std::move(callback)); + + std::list<TEventHolder> q; + + auto& ev1 = pool.Allocate(q); + ev1.Buffer = MakeIntrusive<TEventSerializedData>(TString::Uninitialized(512 * 1024), true); + + auto& ev2 = pool.Allocate(q); + ev2.Buffer = MakeIntrusive<TEventSerializedData>(TString::Uninitialized(512 * 1024), true); + + auto& ev3 = pool.Allocate(q); + ev3.Buffer = MakeIntrusive<TEventSerializedData>(TString::Uninitialized(512 * 1024), true); + + auto& ev4 = pool.Allocate(q); + ev4.Buffer = MakeIntrusive<TEventSerializedData>(TString::Uninitialized(512 * 1024), true); + + pool.Release(q, q.begin()); + pool.Release(q, q.begin()); + pool.Trim(); + UNIT_ASSERT_VALUES_EQUAL(freeQ.size(), 1); + + pool.Release(q, q.begin()); + UNIT_ASSERT_VALUES_EQUAL(freeQ.size(), 1); + + freeQ.clear(); + pool.Release(q, q.begin()); + pool.Trim(); + UNIT_ASSERT_VALUES_EQUAL(freeQ.size(), 1); + + freeQ.clear(); // if we don't this, we may probablty crash due to the order of object destruction + } + +} diff --git a/library/cpp/actors/interconnect/ut/interconnect_ut.cpp b/library/cpp/actors/interconnect/ut/interconnect_ut.cpp new file mode 100644 index 0000000000..8ef0b1507c --- /dev/null +++ b/library/cpp/actors/interconnect/ut/interconnect_ut.cpp @@ -0,0 +1,177 @@ +#include <library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h> 
+#include <library/cpp/testing/unittest/registar.h> +#include <library/cpp/digest/md5/md5.h> +#include <util/random/fast.h> + +using namespace NActors; + +class TSenderActor : public TActorBootstrapped<TSenderActor> { + const TActorId Recipient; + using TSessionToCookie = std::unordered_multimap<TActorId, ui64, THash<TActorId>>; + TSessionToCookie SessionToCookie; + std::unordered_map<ui64, std::pair<TSessionToCookie::iterator, TString>> InFlight; + std::unordered_map<ui64, TString> Tentative; + ui64 NextCookie = 0; + TActorId SessionId; + bool SubscribeInFlight = false; + +public: + TSenderActor(TActorId recipient) + : Recipient(recipient) + {} + + void Bootstrap() { + Become(&TThis::StateFunc); + Subscribe(); + } + + void Subscribe() { + Cerr << (TStringBuilder() << "Subscribe" << Endl); + Y_VERIFY(!SubscribeInFlight); + SubscribeInFlight = true; + Send(TActivationContext::InterconnectProxy(Recipient.NodeId()), new TEvents::TEvSubscribe); + } + + void IssueQueries() { + if (!SessionId) { + return; + } + while (InFlight.size() < 10) { + size_t len = RandomNumber<size_t>(65536) + 1; + TString data = TString::Uninitialized(len); + TReallyFastRng32 rng(RandomNumber<ui32>()); + char *p = data.Detach(); + for (size_t i = 0; i < len; ++i) { + p[i] = rng(); + } + const TSessionToCookie::iterator s2cIt = SessionToCookie.emplace(SessionId, NextCookie); + InFlight.emplace(NextCookie, std::make_tuple(s2cIt, MD5::CalcRaw(data))); + TActivationContext::Send(new IEventHandle(TEvents::THelloWorld::Ping, IEventHandle::FlagTrackDelivery, Recipient, + SelfId(), MakeIntrusive<TEventSerializedData>(std::move(data), false), NextCookie)); +// Cerr << (TStringBuilder() << "Send# " << NextCookie << Endl); + ++NextCookie; + } + } + + void HandlePong(TAutoPtr<IEventHandle> ev) { +// Cerr << (TStringBuilder() << "Receive# " << ev->Cookie << Endl); + if (const auto it = InFlight.find(ev->Cookie); it != InFlight.end()) { + auto& [s2cIt, hash] = it->second; + Y_VERIFY(hash == 
ev->GetChainBuffer()->GetString()); + SessionToCookie.erase(s2cIt); + InFlight.erase(it); + } else if (const auto it = Tentative.find(ev->Cookie); it != Tentative.end()) { + Y_VERIFY(it->second == ev->GetChainBuffer()->GetString()); + Tentative.erase(it); + } else { + Y_FAIL("Cookie# %" PRIu64, ev->Cookie); + } + IssueQueries(); + } + + void Handle(TEvInterconnect::TEvNodeConnected::TPtr ev) { + Cerr << (TStringBuilder() << "TEvNodeConnected" << Endl); + Y_VERIFY(SubscribeInFlight); + SubscribeInFlight = false; + Y_VERIFY(!SessionId); + SessionId = ev->Sender; + IssueQueries(); + } + + void Handle(TEvInterconnect::TEvNodeDisconnected::TPtr ev) { + Cerr << (TStringBuilder() << "TEvNodeDisconnected" << Endl); + SubscribeInFlight = false; + if (SessionId) { + Y_VERIFY(SessionId == ev->Sender); + auto r = SessionToCookie.equal_range(SessionId); + for (auto it = r.first; it != r.second; ++it) { + const auto inFlightIt = InFlight.find(it->second); + Y_VERIFY(inFlightIt != InFlight.end()); + Tentative.emplace(inFlightIt->first, inFlightIt->second.second); + InFlight.erase(it->second); + } + SessionToCookie.erase(r.first, r.second); + SessionId = TActorId(); + } + Schedule(TDuration::MilliSeconds(100), new TEvents::TEvWakeup); + } + + void Handle(TEvents::TEvUndelivered::TPtr ev) { + Cerr << (TStringBuilder() << "TEvUndelivered Cookie# " << ev->Cookie << Endl); + if (const auto it = InFlight.find(ev->Cookie); it != InFlight.end()) { + auto& [s2cIt, hash] = it->second; + Tentative.emplace(it->first, hash); + SessionToCookie.erase(s2cIt); + InFlight.erase(it); + IssueQueries(); + } + } + + STRICT_STFUNC(StateFunc, + fFunc(TEvents::THelloWorld::Pong, HandlePong); + hFunc(TEvInterconnect::TEvNodeConnected, Handle); + hFunc(TEvInterconnect::TEvNodeDisconnected, Handle); + hFunc(TEvents::TEvUndelivered, Handle); + cFunc(TEvents::TSystem::Wakeup, Subscribe); + ) +}; + +class TRecipientActor : public TActor<TRecipientActor> { +public: + TRecipientActor() + : 
TActor(&TThis::StateFunc) + {} + + void HandlePing(TAutoPtr<IEventHandle>& ev) { + const TString& data = ev->GetChainBuffer()->GetString(); + const TString& response = MD5::CalcRaw(data); + TActivationContext::Send(new IEventHandle(TEvents::THelloWorld::Pong, 0, ev->Sender, SelfId(), + MakeIntrusive<TEventSerializedData>(response, false), ev->Cookie)); + } + + STRICT_STFUNC(StateFunc, + fFunc(TEvents::THelloWorld::Ping, HandlePing); + ) +}; + +Y_UNIT_TEST_SUITE(Interconnect) { + + Y_UNIT_TEST(SessionContinuation) { + TTestICCluster cluster(2); + const TActorId recipient = cluster.RegisterActor(new TRecipientActor, 1); + cluster.RegisterActor(new TSenderActor(recipient), 2); + for (ui32 i = 0; i < 100; ++i) { + const ui32 nodeId = 1 + RandomNumber(2u); + const ui32 peerNodeId = 3 - nodeId; + const ui32 action = RandomNumber(3u); + auto *node = cluster.GetNode(nodeId); + TActorId proxyId = node->InterconnectProxy(peerNodeId); + + switch (action) { + case 0: + node->Send(proxyId, new TEvInterconnect::TEvClosePeerSocket); + Cerr << (TStringBuilder() << "nodeId# " << nodeId << " peerNodeId# " << peerNodeId + << " TEvClosePeerSocket" << Endl); + break; + + case 1: + node->Send(proxyId, new TEvInterconnect::TEvCloseInputSession); + Cerr << (TStringBuilder() << "nodeId# " << nodeId << " peerNodeId# " << peerNodeId + << " TEvCloseInputSession" << Endl); + break; + + case 2: + node->Send(proxyId, new TEvInterconnect::TEvPoisonSession); + Cerr << (TStringBuilder() << "nodeId# " << nodeId << " peerNodeId# " << peerNodeId + << " TEvPoisonSession" << Endl); + break; + + default: + Y_FAIL(); + } + + Sleep(TDuration::MilliSeconds(RandomNumber<ui32>(500) + 100)); + } + } + +} diff --git a/library/cpp/actors/interconnect/ut/large.cpp b/library/cpp/actors/interconnect/ut/large.cpp new file mode 100644 index 0000000000..ba2a50c6f6 --- /dev/null +++ b/library/cpp/actors/interconnect/ut/large.cpp @@ -0,0 +1,85 @@ +#include "lib/ic_test_cluster.h" +#include "lib/test_events.h" +#include 
"lib/test_actors.h" + +#include <library/cpp/actors/interconnect/interconnect_tcp_proxy.h> + +#include <library/cpp/testing/unittest/tests_data.h> +#include <library/cpp/testing/unittest/registar.h> + +#include <util/system/event.h> +#include <util/system/sanitizers.h> + +Y_UNIT_TEST_SUITE(LargeMessage) { + using namespace NActors; + + class TProducer: public TActorBootstrapped<TProducer> { + const TActorId RecipientActorId; + + public: + TProducer(const TActorId& recipientActorId) + : RecipientActorId(recipientActorId) + {} + + void Bootstrap(const TActorContext& ctx) { + Become(&TThis::StateFunc); + ctx.Send(RecipientActorId, new TEvTest(1, "hello"), IEventHandle::FlagTrackDelivery, 1); + ctx.Send(RecipientActorId, new TEvTest(2, TString(128 * 1024 * 1024, 'X')), IEventHandle::FlagTrackDelivery, 2); + } + + void Handle(TEvents::TEvUndelivered::TPtr ev, const TActorContext& ctx) { + if (ev->Cookie == 2) { + Cerr << "TEvUndelivered\n"; + ctx.Send(RecipientActorId, new TEvTest(3, "hello"), IEventHandle::FlagTrackDelivery, 3); + } + } + + STRICT_STFUNC(StateFunc, + HFunc(TEvents::TEvUndelivered, Handle) + ) + }; + + class TConsumer : public TActorBootstrapped<TConsumer> { + TManualEvent& Done; + TActorId SessionId; + + public: + TConsumer(TManualEvent& done) + : Done(done) + { + } + + void Bootstrap(const TActorContext& /*ctx*/) { + Become(&TThis::StateFunc); + } + + void Handle(TEvTest::TPtr ev, const TActorContext& /*ctx*/) { + const auto& record = ev->Get()->Record; + Cerr << "RECEIVED TEvTest\n"; + if (record.GetSequenceNumber() == 1) { + Y_VERIFY(!SessionId); + SessionId = ev->InterconnectSession; + } else if (record.GetSequenceNumber() == 3) { + Y_VERIFY(SessionId != ev->InterconnectSession); + Done.Signal(); + } else { + Y_FAIL("incorrect sequence number"); + } + } + + STRICT_STFUNC(StateFunc, + HFunc(TEvTest, Handle) + ) + }; + + Y_UNIT_TEST(Test) { + TTestICCluster testCluster(2); + + TManualEvent done; + TConsumer* consumer = new TConsumer(done); + const 
TActorId recp = testCluster.RegisterActor(consumer, 1); + testCluster.RegisterActor(new TProducer(recp), 2); + done.WaitI(); + } + +} diff --git a/library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h b/library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h new file mode 100644 index 0000000000..2b6d27cd3f --- /dev/null +++ b/library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h @@ -0,0 +1,84 @@ +#pragma once + +#include "node.h" +#include "interrupter.h" + +#include <library/cpp/actors/interconnect/interconnect_tcp_proxy.h> +#include <library/cpp/actors/core/events.h> +#include <library/cpp/testing/unittest/tests_data.h> + +#include <util/generic/noncopyable.h> + +class TTestICCluster: public TNonCopyable { +public: + struct TTrafficInterrupterSettings { + TDuration RejectingTrafficTimeout; + double BandWidth; + bool Disconnect; + }; + +private: + const ui32 NumNodes; + const TString Address = "::1"; + TDuration DeadPeerTimeout = TDuration::Seconds(2); + NMonitoring::TDynamicCounterPtr Counters; + THashMap<ui32, THolder<TNode>> Nodes; + TList<TTrafficInterrupter> interrupters; + NActors::TChannelsConfig ChannelsConfig; + TPortManager PortManager; + +public: + TTestICCluster(ui32 numNodes = 1, NActors::TChannelsConfig channelsConfig = NActors::TChannelsConfig(), + TTrafficInterrupterSettings* tiSettings = nullptr) + : NumNodes(numNodes) + , Counters(new NMonitoring::TDynamicCounters) + , ChannelsConfig(channelsConfig) + { + THashMap<ui32, ui16> nodeToPortMap; + THashMap<ui32, THashMap<ui32, ui16>> specificNodePortMap; + + for (ui32 i = 1; i <= NumNodes; ++i) { + nodeToPortMap.emplace(i, PortManager.GetPort()); + } + + if (tiSettings) { + ui32 nodeId; + ui16 listenPort; + ui16 forwardPort; + for (auto& item : nodeToPortMap) { + nodeId = item.first; + listenPort = item.second; + forwardPort = PortManager.GetPort(); + + specificNodePortMap[nodeId] = nodeToPortMap; + specificNodePortMap[nodeId].at(nodeId) = forwardPort; + interrupters.emplace_back(Address, 
listenPort, forwardPort, tiSettings->RejectingTrafficTimeout, tiSettings->BandWidth, tiSettings->Disconnect); + interrupters.back().Start(); + } + } + + for (ui32 i = 1; i <= NumNodes; ++i) { + auto& portMap = tiSettings ? specificNodePortMap[i] : nodeToPortMap; + Nodes.emplace(i, MakeHolder<TNode>(i, NumNodes, portMap, Address, Counters, DeadPeerTimeout, ChannelsConfig)); + } + } + + TNode* GetNode(ui32 id) { + return Nodes[id].Get(); + } + + ~TTestICCluster() { + } + + TActorId RegisterActor(NActors::IActor* actor, ui32 nodeId) { + return Nodes[nodeId]->RegisterActor(actor); + } + + TActorId InterconnectProxy(ui32 peerNodeId, ui32 nodeId) { + return Nodes[nodeId]->InterconnectProxy(peerNodeId); + } + + void KillActor(ui32 nodeId, const TActorId& id) { + Nodes[nodeId]->Send(id, new NActors::TEvents::TEvPoisonPill); + } +}; diff --git a/library/cpp/actors/interconnect/ut/lib/interrupter.h b/library/cpp/actors/interconnect/ut/lib/interrupter.h new file mode 100644 index 0000000000..48851de2c5 --- /dev/null +++ b/library/cpp/actors/interconnect/ut/lib/interrupter.h @@ -0,0 +1,249 @@ +#pragma once + +#include <library/cpp/testing/unittest/tests_data.h> + +#include <util/network/sock.h> +#include <util/network/poller.h> +#include <util/system/thread.h> +#include <util/system/hp_timer.h> +#include <util/generic/list.h> +#include <util/generic/set.h> +#include <util/generic/vector.h> +#include <util/generic/deque.h> +#include <util/random/random.h> + +#include <iterator> + +class TTrafficInterrupter + : public ISimpleThread { + const TString Address; + const ui16 ForwardPort; + TInet6StreamSocket ListenSocket; + + struct TConnectionDescriptor; + struct TDelayedPacket { + TInet6StreamSocket* ForwardSocket = nullptr; + TVector<char> Data; + }; + struct TCompare { + bool operator()(const std::pair<TInstant, TDelayedPacket>& x, const std::pair<TInstant, TDelayedPacket>& y) const { + return x.first > y.first; + }; + }; + + struct TDirectedConnection { + TInet6StreamSocket* 
Source = nullptr; + TInet6StreamSocket* Destination = nullptr; + TList<TConnectionDescriptor>::iterator ListIterator; + TInstant Timestamp; + TPriorityQueue<std::pair<TInstant, TDelayedPacket>, TVector<std::pair<TInstant, TDelayedPacket>>, TCompare> DelayedQueue; + + TDirectedConnection(TInet6StreamSocket* source, TInet6StreamSocket* destination) + : Source(source) + , Destination(destination) + { + } + }; + + struct TConnectionDescriptor { + std::unique_ptr<TInet6StreamSocket> FirstSocket; + std::unique_ptr<TInet6StreamSocket> SecondSocket; + TDirectedConnection ForwardConnection; + TDirectedConnection BackwardConnection; + + TConnectionDescriptor(std::unique_ptr<TInet6StreamSocket> firstSock, + std::unique_ptr<TInet6StreamSocket> secondSock) + : FirstSocket(std::move(firstSock)) + , SecondSocket(std::move(secondSock)) + , ForwardConnection(FirstSocket.get(), SecondSocket.get()) + , BackwardConnection(SecondSocket.get(), FirstSocket.get()) + { + } + }; + + template <class It = TList<TConnectionDescriptor>::iterator> + class TCustomListIteratorCompare { + public: + bool operator()(const It& it1, const It& it2) const { + return (&(*it1) < &(*it2)); + } + }; + + TList<TConnectionDescriptor> Connections; + TSet<TList<TConnectionDescriptor>::iterator, TCustomListIteratorCompare<>> DroppedConnections; + +public: + TTrafficInterrupter(TString address, ui16 listenPort, ui16 forwardPort, TDuration rejectingTrafficTimeout, double bandwidth, bool disconnect = true) + : Address(std::move(address)) + , ForwardPort(forwardPort) + , ListenSocket() + , RejectingTrafficTimeout(rejectingTrafficTimeout) + , CurrentRejectingTimeout(rejectingTrafficTimeout) + , RejectingStateTimer() + , Bandwidth(bandwidth) + , Disconnect(disconnect) + , RejectingTraffic(false) + { + SetReuseAddressAndPort(ListenSocket); + TSockAddrInet6 addr(Address.data(), listenPort); + Y_VERIFY(ListenSocket.Bind(&addr) == 0); + Y_VERIFY(ListenSocket.Listen(5) == 0); + + DelayTraffic = (Bandwidth == 0.0) ? 
false : true; + + ForwardAddrress.Reset(new TSockAddrInet6(Address.data(), ForwardPort)); + const ui32 BufSize = DelayTraffic ? 4096 : 65536 + 4096; + Buf.resize(BufSize); + } + + ~TTrafficInterrupter() { + AtomicSet(Running, 0); + this->Join(); + } + +private: + TAtomic Running = 1; + TVector<char> Buf; + TSocketPoller SocketPoller; + THolder<TSockAddrInet6> ForwardAddrress; + TVector<void*> Events; + TDuration RejectingTrafficTimeout; + TDuration CurrentRejectingTimeout; + TDuration DefaultPollTimeout = TDuration::MilliSeconds(100); + TDuration DisconnectTimeout = TDuration::MilliSeconds(100); + THPTimer RejectingStateTimer; + THPTimer DisconnectTimer; + double Bandwidth; + const bool Disconnect; + bool RejectingTraffic; + bool DelayTraffic; + + void UpdateRejectingState() { + if (TDuration::Seconds(std::abs(RejectingStateTimer.Passed())) > CurrentRejectingTimeout) { + RejectingStateTimer.Reset(); + CurrentRejectingTimeout = (RandomNumber<ui32>(1) ? RejectingTrafficTimeout + TDuration::Seconds(1.0) : RejectingTrafficTimeout - TDuration::Seconds(0.2)); + RejectingTraffic = !RejectingTraffic; + } + } + + void RandomlyDisconnect() { + if (TDuration::Seconds(std::abs(DisconnectTimer.Passed())) > DisconnectTimeout) { + DisconnectTimer.Reset(); + if (RandomNumber<ui32>(100) > 90) { + if (!Connections.empty()) { + auto it = Connections.begin(); + std::advance(it, RandomNumber<ui32>(Connections.size())); + SocketPoller.Unwait(static_cast<SOCKET>(*it->FirstSocket.get())); + SocketPoller.Unwait(static_cast<SOCKET>(*it->SecondSocket.get())); + Connections.erase(it); + } + } + } + } + + void* ThreadProc() override { + int pollReadyCount = 0; + SocketPoller.WaitRead(static_cast<SOCKET>(ListenSocket), &ListenSocket); + Events.resize(10); + + while (AtomicGet(Running)) { + if (RejectingTrafficTimeout != TDuration::Zero()) { + UpdateRejectingState(); + } + if (Disconnect) { + RandomlyDisconnect(); + } + if (!RejectingTraffic) { + TDuration timeout = DefaultPollTimeout; + auto 
updateTimout = [&timeout](TDirectedConnection& conn) { + if (conn.DelayedQueue) { + timeout = Min(timeout, conn.DelayedQueue.top().first - TInstant::Now()); + } + }; + for (auto& it : Connections) { + updateTimout(it.ForwardConnection); + updateTimout(it.BackwardConnection); + } + pollReadyCount = SocketPoller.WaitT(Events.data(), Events.size(), timeout); + if (pollReadyCount > 0) { + for (int i = 0; i < pollReadyCount; i++) { + HandleSocketPollEvent(Events[i]); + } + for (auto it : DroppedConnections) { + Connections.erase(it); + } + DroppedConnections.clear(); + } + } + if (DelayTraffic) { // process packets from DelayQueues + auto processDelayedPackages = [](TDirectedConnection& conn) { + while (!conn.DelayedQueue.empty()) { + auto& frontPackage = conn.DelayedQueue.top(); + if (TInstant::Now() >= frontPackage.first) { + TInet6StreamSocket* sock = frontPackage.second.ForwardSocket; + if (sock) { + sock->Send(frontPackage.second.Data.data(), frontPackage.second.Data.size()); + } + conn.DelayedQueue.pop(); + } else { + break; + } + } + }; + for (auto& it : Connections) { + processDelayedPackages(it.ForwardConnection); + processDelayedPackages(it.BackwardConnection); + } + } + } + ListenSocket.Close(); + return nullptr; + } + + void HandleSocketPollEvent(void* ev) { + if (ev == static_cast<void*>(&ListenSocket)) { + TSockAddrInet6 origin; + Connections.emplace_back(TConnectionDescriptor(std::unique_ptr<TInet6StreamSocket>(new TInet6StreamSocket), std::unique_ptr<TInet6StreamSocket>(new TInet6StreamSocket))); + int err = ListenSocket.Accept(Connections.back().FirstSocket.get(), &origin); + if (!err) { + err = Connections.back().SecondSocket->Connect(ForwardAddrress.Get()); + if (!err) { + Connections.back().ForwardConnection.ListIterator = --Connections.end(); + Connections.back().BackwardConnection.ListIterator = --Connections.end(); + SocketPoller.WaitRead(static_cast<SOCKET>(*Connections.back().FirstSocket), &Connections.back().ForwardConnection); + 
SocketPoller.WaitRead(static_cast<SOCKET>(*Connections.back().SecondSocket), &Connections.back().BackwardConnection); + } else { + Connections.back().FirstSocket->Close(); + } + } else { + Connections.pop_back(); + } + } else { + TDirectedConnection* directedConnection = static_cast<TDirectedConnection*>(ev); + int recvSize = 0; + do { + recvSize = directedConnection->Source->Recv(Buf.data(), Buf.size()); + } while (recvSize == -EINTR); + + if (recvSize > 0) { + if (DelayTraffic) { + // put packet into DelayQueue + const TDuration baseDelay = TDuration::MicroSeconds(recvSize * 1e6 / Bandwidth); + const TInstant now = TInstant::Now(); + directedConnection->Timestamp = Max(now, directedConnection->Timestamp) + baseDelay; + TDelayedPacket pkt; + pkt.ForwardSocket = directedConnection->Destination; + pkt.Data.resize(recvSize); + memcpy(pkt.Data.data(), Buf.data(), recvSize); + directedConnection->DelayedQueue.emplace(directedConnection->Timestamp, std::move(pkt)); + } else { + directedConnection->Destination->Send(Buf.data(), recvSize); + } + } else { + SocketPoller.Unwait(static_cast<SOCKET>(*directedConnection->Source)); + SocketPoller.Unwait(static_cast<SOCKET>(*directedConnection->Destination)); + DroppedConnections.emplace(directedConnection->ListIterator); + } + } + } +}; diff --git a/library/cpp/actors/interconnect/ut/lib/node.h b/library/cpp/actors/interconnect/ut/lib/node.h new file mode 100644 index 0000000000..ff30b1445e --- /dev/null +++ b/library/cpp/actors/interconnect/ut/lib/node.h @@ -0,0 +1,137 @@ +#pragma once + +#include <library/cpp/actors/core/actorsystem.h> +#include <library/cpp/actors/core/executor_pool_basic.h> +#include <library/cpp/actors/core/scheduler_basic.h> +#include <library/cpp/actors/core/mailbox.h> +#include <library/cpp/actors/dnsresolver/dnsresolver.h> + +#include <library/cpp/actors/interconnect/interconnect_tcp_server.h> +#include <library/cpp/actors/interconnect/interconnect_tcp_proxy.h> +#include 
// Test helper that owns one actor system configured as an interconnect node:
// it sets up a TCP listener for itself, proxies to the other static nodes, a poller,
// a DNS resolver, a table nameserver and a stderr logger, and starts the system in the constructor.
class TNode {
    THolder<TActorSystem> ActorSystem;

public:
    // nodeId           - id of this node; its listen port is taken from nodeToPort
    // numNodes         - total number of nodes, ids are 1..numNodes
    // nodeToPort       - port for every node id; must contain all ids in 1..numNodes (accessed with at())
    // address          - host used for the listener and for every nameserver table entry
    // counters         - root of monitoring counters; per-node and logger subgroups are created from it
    // deadPeerTimeout  - value for the interconnect DeadPeer setting
    // channelsSettings - interconnect channels configuration
    // numDynamicNodes  - the last numDynamicNodes ids get no statically created proxy actor
    // numThreads       - number of threads of the single executor pool
    TNode(ui32 nodeId, ui32 numNodes, const THashMap<ui32, ui16>& nodeToPort, const TString& address,
          NMonitoring::TDynamicCounterPtr counters, TDuration deadPeerTimeout,
          TChannelsConfig channelsSettings = TChannelsConfig(),
          ui32 numDynamicNodes = 0, ui32 numThreads = 1) {
        TActorSystemSetup setup;
        setup.NodeId = nodeId;
        // a single executor pool (id 0) runs every actor of the node, interconnect included
        setup.ExecutorsCount = 1;
        setup.Executors.Reset(new TAutoPtr<IExecutorPool>[setup.ExecutorsCount]);
        for (ui32 i = 0; i < setup.ExecutorsCount; ++i) {
            setup.Executors[i].Reset(new TBasicExecutorPool(i, numThreads, 20 /* magic number */));
        }
        setup.Scheduler.Reset(new TBasicSchedulerThread());
        const ui32 interconnectPoolId = 0;

        // settings shared by the listener and all proxies of this node
        auto common = MakeIntrusive<TInterconnectProxyCommon>();
        common->NameserviceId = GetNameserviceActorId();
        common->MonCounters = counters->GetSubgroup("nodeId", ToString(nodeId));
        common->ChannelsConfig = channelsSettings;
        common->ClusterUUID = "cluster";
        common->AcceptUUID = {common->ClusterUUID};
        common->TechnicalSelfHostName = address;
        common->Settings.Handshake = TDuration::Seconds(1);
        common->Settings.DeadPeer = deadPeerTimeout;
        common->Settings.CloseOnIdle = TDuration::Minutes(1);
        common->Settings.SendBufferDieLimitInMB = 512;
        common->Settings.TotalInflightAmountOfData = 512 * 1024;
        common->Settings.TCPSocketBufferSize = 2048 * 1024;

        // the proxy table is indexed by node id (slot 0 stays unused) and covers static nodes only
        setup.Interconnect.ProxyActors.resize(numNodes + 1 - numDynamicNodes);
        setup.Interconnect.ProxyWrapperFactory = CreateProxyWrapperFactory(common, interconnectPoolId);

        for (ui32 i = 1; i <= numNodes; ++i) {
            if (i == nodeId) {
                // create listener actor for local node "nodeId"
                setup.LocalServices.emplace_back(TActorId(), TActorSetupCmd(new TInterconnectListenerTCP(address,
                    nodeToPort.at(nodeId), common), TMailboxType::ReadAsFilled, interconnectPoolId));
            } else if (i <= numNodes - numDynamicNodes) {
                // create proxy actor to reach node "i"
                setup.Interconnect.ProxyActors[i] = {new TInterconnectProxyTCP(i, common),
                    TMailboxType::ReadAsFilled, interconnectPoolId};
            }
        }

        setup.LocalServices.emplace_back(MakePollerActorId(), TActorSetupCmd(CreatePollerActor(),
            TMailboxType::ReadAsFilled, 0));

        const TActorId loggerActorId(0, "logger");
        constexpr ui32 LoggerComponentId = 410; // NKikimrServices::LOGGER

        auto loggerSettings = MakeIntrusive<NLog::TSettings>(
            loggerActorId,
            (NLog::EComponent)LoggerComponentId,
            NLog::PRI_INFO,
            NLog::PRI_DEBUG,
            0U);

        // make the common actor-library service components known to the logger
        loggerSettings->Append(
            NActorsServices::EServiceCommon_MIN,
            NActorsServices::EServiceCommon_MAX,
            NActorsServices::EServiceCommon_Name
        );

        constexpr ui32 WilsonComponentId = 430; // NKikimrServices::WILSON
        // static: the name callback below returns a reference to this string
        static const TString WilsonComponentName = "WILSON";

        loggerSettings->Append(
            (NLog::EComponent)WilsonComponentId,
            (NLog::EComponent)WilsonComponentId + 1,
            [](NLog::EComponent) -> const TString & { return WilsonComponentName; });

        // register nameserver table
        auto names = MakeIntrusive<TTableNameserverSetup>();
        for (ui32 i = 1; i <= numNodes; ++i) {
            names->StaticNodeTable[i] = TTableNameserverSetup::TNodeInfo(address, address, nodeToPort.at(i));
        }
        setup.LocalServices.emplace_back(
            NDnsResolver::MakeDnsResolverActorId(),
            TActorSetupCmd(
                NDnsResolver::CreateOnDemandDnsResolver(),
                TMailboxType::ReadAsFilled, interconnectPoolId));
        setup.LocalServices.emplace_back(GetNameserviceActorId(), TActorSetupCmd(
            CreateNameserverTable(names, interconnectPoolId), TMailboxType::ReadAsFilled,
            interconnectPoolId));

        // register logger
        setup.LocalServices.emplace_back(loggerActorId, TActorSetupCmd(new TLoggerActor(loggerSettings,
            CreateStderrBackend(), counters->GetSubgroup("subsystem", "logger")),
            TMailboxType::ReadAsFilled, interconnectPoolId));

        // the setup is moved into a holder that is handed to the actor system, then the system is started
        auto sp = MakeHolder<TActorSystemSetup>(std::move(setup));
        ActorSystem.Reset(new TActorSystem(sp, nullptr, loggerSettings));
        ActorSystem->Start();
    }

    // Stops the actor system started by the constructor.
    ~TNode() {
        ActorSystem->Stop();
    }

    // Sends ev to recipient through this node's actor system; returns the result of TActorSystem::Send.
    bool Send(const TActorId& recipient, IEventBase* ev) {
        return ActorSystem->Send(recipient, ev);
    }

    // Registers actor in this node's actor system and returns its id.
    TActorId RegisterActor(IActor* actor) {
        return ActorSystem->Register(actor);
    }

    // Returns the id of the interconnect proxy leading to peerNodeId.
    TActorId InterconnectProxy(ui32 peerNodeId) {
        return ActorSystem->InterconnectProxy(peerNodeId);
    }

    // Registers actor and publishes it as the local service serviceId.
    void RegisterServiceActor(const TActorId& serviceId, IActor* actor) {
        const TActorId actorId = ActorSystem->Register(actor);
        ActorSystem->RegisterLocalService(serviceId, actorId);
    }

    // Gives access to the underlying actor system; ownership stays with the node.
    TActorSystem *GetActorSystem() const {
        return ActorSystem.Get();
    }
};
TActorContext& ctx) { + SendMessagesIfPossible(ctx); + } + + void Handle(TEvInterconnect::TEvNodeConnected::TPtr& /*ev*/, const TActorContext& ctx) { + SendMessagesIfPossible(ctx); + } + + void Handle(TEvInterconnect::TEvNodeDisconnected::TPtr& /*ev*/, const TActorContext& /*ctx*/) { + } + + virtual void Handle(TEvents::TEvPoisonPill::TPtr& /*ev*/, const TActorContext& ctx) { + Die(ctx); + } + + virtual STRICT_STFUNC(StateFunc, + HFunc(TEvTestResponse, Handle) + HFunc(TEvents::TEvUndelivered, Handle) + HFunc(TEvents::TEvPoisonPill, Handle) + HFunc(TEvInterconnect::TEvNodeConnected, Handle) + HFunc(TEvInterconnect::TEvNodeDisconnected, Handle) + ) + }; + + class TReceiverBaseActor: public TActor<TReceiverBaseActor> { + protected: + ui64 ReceivedCount = 0; + + public: + TReceiverBaseActor() + : TActor(&TReceiverBaseActor::StateFunc) + { + } + + virtual ~TReceiverBaseActor() { + } + + virtual STRICT_STFUNC(StateFunc, + HFunc(TEvTest, Handle) + ) + + virtual void Handle(TEvTest::TPtr& /*ev*/, const TActorContext& /*ctx*/) {} + }; +} diff --git a/library/cpp/actors/interconnect/ut/lib/test_events.h b/library/cpp/actors/interconnect/ut/lib/test_events.h new file mode 100644 index 0000000000..cd0d9e0152 --- /dev/null +++ b/library/cpp/actors/interconnect/ut/lib/test_events.h @@ -0,0 +1,49 @@ +#pragma once + +#include <library/cpp/actors/interconnect/ut/protos/interconnect_test.pb.h> + +namespace NActors { + enum { + EvTest = EventSpaceBegin(TEvents::ES_PRIVATE), + EvTestChan, + EvTestSmall, + EvTestLarge, + EvTestResponse, + }; + + struct TEvTest : TEventPB<TEvTest, NInterconnectTest::TEvTest, EvTest> { + TEvTest() = default; + + TEvTest(ui64 sequenceNumber, const TString& payload) { + Record.SetSequenceNumber(sequenceNumber); + Record.SetPayload(payload); + } + }; + + struct TEvTestLarge : TEventPB<TEvTestLarge, NInterconnectTest::TEvTestLarge, EvTestLarge> { + TEvTestLarge() = default; + + TEvTestLarge(ui64 sequenceNumber, const TString& payload) { + 
Record.SetSequenceNumber(sequenceNumber); + Record.SetPayload(payload); + } + }; + + struct TEvTestSmall : TEventPB<TEvTestSmall, NInterconnectTest::TEvTestSmall, EvTestSmall> { + TEvTestSmall() = default; + + TEvTestSmall(ui64 sequenceNumber, const TString& payload) { + Record.SetSequenceNumber(sequenceNumber); + Record.SetPayload(payload); + } + }; + + struct TEvTestResponse : TEventPB<TEvTestResponse, NInterconnectTest::TEvTestResponse, EvTestResponse> { + TEvTestResponse() = default; + + TEvTestResponse(ui64 confirmedSequenceNumber) { + Record.SetConfirmedSequenceNumber(confirmedSequenceNumber); + } + }; + +} diff --git a/library/cpp/actors/interconnect/ut/lib/ya.make b/library/cpp/actors/interconnect/ut/lib/ya.make new file mode 100644 index 0000000000..80f45f364f --- /dev/null +++ b/library/cpp/actors/interconnect/ut/lib/ya.make @@ -0,0 +1,12 @@ +LIBRARY() + +OWNER(vkanaev) + +SRCS( + node.h + test_events.h + test_actors.h + ic_test_cluster.h +) + +END() diff --git a/library/cpp/actors/interconnect/ut/poller_actor_ut.cpp b/library/cpp/actors/interconnect/ut/poller_actor_ut.cpp new file mode 100644 index 0000000000..23d846a2fd --- /dev/null +++ b/library/cpp/actors/interconnect/ut/poller_actor_ut.cpp @@ -0,0 +1,264 @@ +#include <library/cpp/actors/interconnect/poller_actor.h> +#include <library/cpp/actors/testlib/test_runtime.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/network/pair.h> +#include <util/network/socket.h> + +using namespace NActors; + +class TTestSocket: public TSharedDescriptor { +public: + explicit TTestSocket(SOCKET fd) + : Fd_(fd) + { + } + + int GetDescriptor() override { + return Fd_; + } + +private: + SOCKET Fd_; +}; +using TTestSocketPtr = TIntrusivePtr<TTestSocket>; + +// create pair of connected, non-blocking sockets +std::pair<TTestSocketPtr, TTestSocketPtr> NonBlockSockets() { + SOCKET fds[2]; + SocketPair(fds); + SetNonBlock(fds[0]); + SetNonBlock(fds[1]); + return {MakeIntrusive<TTestSocket>(fds[0]), 
MakeIntrusive<TTestSocket>(fds[1])}; +} + +std::pair<TTestSocketPtr, TTestSocketPtr> TcpSockets() { + // create server (listening) socket + SOCKET server = socket(AF_INET, SOCK_STREAM, 0); + Y_VERIFY(server != -1, "socket() failed with %s", strerror(errno)); + + // bind it to local address with automatically picked port + sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_port = 0; + addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + if (bind(server, (sockaddr*)&addr, sizeof(addr)) == -1) { + Y_FAIL("bind() failed with %s", strerror(errno)); + } else if (listen(server, 1) == -1) { + Y_FAIL("listen() failed with %s", strerror(errno)); + } + + // obtain local address for client + socklen_t len = sizeof(addr); + if (getsockname(server, (sockaddr*)&addr, &len) == -1) { + Y_FAIL("getsockname() failed with %s", strerror(errno)); + } + + // create client socket + SOCKET client = socket(AF_INET, SOCK_STREAM, 0); + Y_VERIFY(client != -1, "socket() failed with %s", strerror(errno)); + + // connect to server + if (connect(client, (sockaddr*)&addr, len) == -1) { + Y_FAIL("connect() failed with %s", strerror(errno)); + } + + // accept connection from the other side + SOCKET accepted = accept(server, nullptr, nullptr); + Y_VERIFY(accepted != -1, "accept() failed with %s", strerror(errno)); + + // close server socket + closesocket(server); + + return std::make_pair(MakeIntrusive<TTestSocket>(client), MakeIntrusive<TTestSocket>(accepted)); +} + +class TPollerActorTest: public TTestBase { + UNIT_TEST_SUITE(TPollerActorTest); + UNIT_TEST(Registration) + UNIT_TEST(ReadNotification) + UNIT_TEST(WriteNotification) + UNIT_TEST(HangupNotification) + UNIT_TEST_SUITE_END(); + +public: + void SetUp() override { + ActorSystem_ = MakeHolder<TTestActorRuntimeBase>(); + ActorSystem_->Initialize(); + + PollerId_ = ActorSystem_->Register(CreatePollerActor()); + + TDispatchOptions opts; + opts.FinalEvents.emplace_back(TEvents::TSystem::Bootstrap, 1); + ActorSystem_->DispatchEvents(opts); + } 
+ + void Registration() { + auto [s1, s2] = NonBlockSockets(); + auto readerId = ActorSystem_->AllocateEdgeActor(); + auto writerId = ActorSystem_->AllocateEdgeActor(); + + RegisterSocket(s1, readerId, writerId); + + // reader should receive event after socket registration + TPollerToken::TPtr token; + { + auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerRegisterResult>(readerId); + token = ev->Get()->PollerToken; + } + + // writer should receive event after socket registration + { + auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerRegisterResult>(writerId); + UNIT_ASSERT_EQUAL(token, ev->Get()->PollerToken); + } + } + + void ReadNotification() { + auto [r, w] = NonBlockSockets(); + auto clientId = ActorSystem_->AllocateEdgeActor(); + RegisterSocket(r, clientId, {}); + + // notification after registration + TPollerToken::TPtr token; + { + auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerRegisterResult>(clientId); + token = ev->Get()->PollerToken; + } + + char buf; + + // data not ready yet for read + UNIT_ASSERT(read(r->GetDescriptor(), &buf, sizeof(buf)) == -1); + UNIT_ASSERT(errno == EWOULDBLOCK); + + // request read poll + token->Request(true, false); + + // write data + UNIT_ASSERT(write(w->GetDescriptor(), "x", 1) == 1); + + // notification after socket become readable + { + auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerReady>(clientId); + UNIT_ASSERT_EQUAL(ev->Get()->Socket, r); + UNIT_ASSERT(ev->Get()->Read); + UNIT_ASSERT(!ev->Get()->Write); + } + + // read data + UNIT_ASSERT(read(r->GetDescriptor(), &buf, sizeof(buf)) == 1); + UNIT_ASSERT_EQUAL('x', buf); + + // no more data to read + UNIT_ASSERT(read(r->GetDescriptor(), &buf, sizeof(buf)) == -1); + UNIT_ASSERT(errno == EWOULDBLOCK); + } + + void WriteNotification() { + auto [r, w] = TcpSockets(); + auto clientId = ActorSystem_->AllocateEdgeActor(); + SetNonBlock(w->GetDescriptor()); + RegisterSocket(w, TActorId{}, clientId); + + // notification after registration + TPollerToken::TPtr token; + { + auto ev = 
ActorSystem_->GrabEdgeEvent<TEvPollerRegisterResult>(clientId); + token = ev->Get()->PollerToken; + } + + char buffer[4096]; + memset(buffer, 'x', sizeof(buffer)); + + for (int i = 0; i < 1000; ++i) { + // write as much as possible to send buffer + ssize_t written = 0; + for (;;) { + ssize_t res = send(w->GetDescriptor(), buffer, sizeof(buffer), 0); + if (res > 0) { + written += res; + } else if (res == 0) { + UNIT_FAIL("unexpected zero return from send()"); + } else { + UNIT_ASSERT(res == -1); + if (errno == EINTR) { + continue; + } else if (errno == EWOULDBLOCK || errno == EAGAIN) { + token->Request(false, true); + break; + } else { + UNIT_FAIL("unexpected error from send()"); + } + } + } + Cerr << "written " << written << " bytes" << Endl; + + // read all written data from the read end + for (;;) { + char buffer[4096]; + ssize_t res = recv(r->GetDescriptor(), buffer, sizeof(buffer), 0); + if (res > 0) { + UNIT_ASSERT(written >= res); + written -= res; + if (!written) { + break; + } + } else if (res == 0) { + UNIT_FAIL("unexpected zero return from recv()"); + } else { + UNIT_ASSERT(res == -1); + if (errno == EINTR) { + continue; + } else { + UNIT_FAIL("unexpected error from recv()"); + } + } + } + + // wait for notification after socket becomes writable again + { + auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerReady>(clientId); + UNIT_ASSERT_EQUAL(ev->Get()->Socket, w); + UNIT_ASSERT(!ev->Get()->Read); + UNIT_ASSERT(ev->Get()->Write); + } + } + } + + void HangupNotification() { + auto [r, w] = NonBlockSockets(); + auto clientId = ActorSystem_->AllocateEdgeActor(); + RegisterSocket(r, clientId, TActorId{}); + + // notification after registration + TPollerToken::TPtr token; + { + auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerRegisterResult>(clientId); + token = ev->Get()->PollerToken; + } + + token->Request(true, false); + ShutDown(w->GetDescriptor(), SHUT_RDWR); + + // notification after peer shuts down its socket + { + auto ev = 
ActorSystem_->GrabEdgeEvent<TEvPollerReady>(clientId); + UNIT_ASSERT_EQUAL(ev->Get()->Socket, r); + UNIT_ASSERT(ev->Get()->Read); + } + } + +private: + void RegisterSocket(TTestSocketPtr socket, TActorId readActorId, TActorId writeActorId) { + auto ev = new TEvPollerRegister{socket, readActorId, writeActorId}; + ActorSystem_->Send(new IEventHandle(PollerId_, TActorId{}, ev)); + } + +private: + THolder<TTestActorRuntimeBase> ActorSystem_; + TActorId PollerId_; +}; + +UNIT_TEST_SUITE_REGISTRATION(TPollerActorTest); diff --git a/library/cpp/actors/interconnect/ut/protos/interconnect_test.proto b/library/cpp/actors/interconnect/ut/protos/interconnect_test.proto new file mode 100644 index 0000000000..b9b2bd6a4e --- /dev/null +++ b/library/cpp/actors/interconnect/ut/protos/interconnect_test.proto @@ -0,0 +1,25 @@ +package NInterconnectTest; + +message TEvTest { + optional uint64 SequenceNumber = 1; + optional bytes Payload = 2; +} + +message TEvTestChan { + optional uint64 SequenceNumber = 1; + optional uint64 Payload = 2; +} + +message TEvTestLarge { + optional uint64 SequenceNumber = 1; + optional bytes Payload = 2; +} + +message TEvTestSmall { + optional uint64 SequenceNumber = 1; + optional bytes Payload = 2; +} + +message TEvTestResponse { + optional uint64 ConfirmedSequenceNumber = 1; +} diff --git a/library/cpp/actors/interconnect/ut/protos/ya.make b/library/cpp/actors/interconnect/ut/protos/ya.make new file mode 100644 index 0000000000..48a8cc129f --- /dev/null +++ b/library/cpp/actors/interconnect/ut/protos/ya.make @@ -0,0 +1,11 @@ +PROTO_LIBRARY() + +OWNER(vkanaev) + +SRCS( + interconnect_test.proto +) + +EXCLUDE_TAGS(GO_PROTO) + +END() diff --git a/library/cpp/actors/interconnect/ut/ya.make b/library/cpp/actors/interconnect/ut/ya.make new file mode 100644 index 0000000000..2f5b13352e --- /dev/null +++ b/library/cpp/actors/interconnect/ut/ya.make @@ -0,0 +1,36 @@ +UNITTEST() + +OWNER( + alexvru + g:kikimr +) + +IF (SANITIZER_TYPE == "thread") + TIMEOUT(1200) + 
SIZE(LARGE) + TAG(ya:fat) +ELSE() + TIMEOUT(600) + SIZE(MEDIUM) +ENDIF() + +SRCS( + channel_scheduler_ut.cpp + event_holder_pool_ut.cpp + interconnect_ut.cpp + large.cpp + poller_actor_ut.cpp + dynamic_proxy_ut.cpp +) + +PEERDIR( + library/cpp/actors/core + library/cpp/actors/interconnect + library/cpp/actors/interconnect/ut/lib + library/cpp/actors/interconnect/ut/protos + library/cpp/actors/testlib + library/cpp/digest/md5 + library/cpp/testing/unittest +) + +END() diff --git a/library/cpp/actors/interconnect/ut_fat/main.cpp b/library/cpp/actors/interconnect/ut_fat/main.cpp new file mode 100644 index 0000000000..5d19bc3003 --- /dev/null +++ b/library/cpp/actors/interconnect/ut_fat/main.cpp @@ -0,0 +1,133 @@ + +#include <library/cpp/actors/interconnect/interconnect_tcp_proxy.h> +#include <library/cpp/actors/interconnect/ut/protos/interconnect_test.pb.h> +#include <library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h> +#include <library/cpp/actors/interconnect/ut/lib/interrupter.h> +#include <library/cpp/actors/interconnect/ut/lib/test_events.h> +#include <library/cpp/actors/interconnect/ut/lib/test_actors.h> +#include <library/cpp/actors/interconnect/ut/lib/node.h> + +#include <library/cpp/testing/unittest/tests_data.h> +#include <library/cpp/testing/unittest/registar.h> + +#include <util/network/sock.h> +#include <util/network/poller.h> +#include <util/system/atomic.h> +#include <util/generic/set.h> + +Y_UNIT_TEST_SUITE(InterconnectUnstableConnection) { + using namespace NActors; + + class TSenderActor: public TSenderBaseActor { + TDeque<ui64> InFly; + ui16 SendFlags; + + public: + TSenderActor(const TActorId& recipientActorId, ui16 sendFlags) + : TSenderBaseActor(recipientActorId, 32) + , SendFlags(sendFlags) + { + } + + ~TSenderActor() override { + Cerr << "Sent " << SequenceNumber << " messages\n"; + } + + void SendMessage(const TActorContext& ctx) override { + const ui32 flags = IEventHandle::MakeFlags(0, SendFlags); + const ui64 cookie = SequenceNumber; 
+ const TString payload('@', RandomNumber<size_t>(65536) + 4096); + ctx.Send(RecipientActorId, new TEvTest(SequenceNumber, payload), flags, cookie); + InFly.push_back(SequenceNumber); + ++InFlySize; + ++SequenceNumber; + } + + void Handle(TEvents::TEvUndelivered::TPtr& ev, const TActorContext& ctx) override { + auto record = std::find(InFly.begin(), InFly.end(), ev->Cookie); + if (SendFlags & IEventHandle::FlagGenerateUnsureUndelivered) { + if (record != InFly.end()) { + InFly.erase(record); + --InFlySize; + SendMessage(ctx); + } + } else { + Y_VERIFY(record != InFly.end()); + } + } + + void Handle(TEvTestResponse::TPtr& ev, const TActorContext& ctx) override { + Y_VERIFY(InFly); + const NInterconnectTest::TEvTestResponse& record = ev->Get()->Record; + Y_VERIFY(record.HasConfirmedSequenceNumber()); + if (!(SendFlags & IEventHandle::FlagGenerateUnsureUndelivered)) { + while (record.GetConfirmedSequenceNumber() != InFly.front()) { + InFly.pop_front(); + --InFlySize; + } + } + Y_VERIFY(record.GetConfirmedSequenceNumber() == InFly.front(), "got# %" PRIu64 " expected# %" PRIu64, + record.GetConfirmedSequenceNumber(), InFly.front()); + InFly.pop_front(); + --InFlySize; + SendMessagesIfPossible(ctx); + } + }; + + class TReceiverActor: public TReceiverBaseActor { + ui64 ReceivedCount = 0; + TNode* SenderNode = nullptr; + + public: + TReceiverActor(TNode* senderNode) + : TReceiverBaseActor() + , SenderNode(senderNode) + { + } + + void Handle(TEvTest::TPtr& ev, const TActorContext& /*ctx*/) override { + const NInterconnectTest::TEvTest& m = ev->Get()->Record; + Y_VERIFY(m.HasSequenceNumber()); + Y_VERIFY(m.GetSequenceNumber() >= ReceivedCount, "got #%" PRIu64 " expected at least #%" PRIu64, + m.GetSequenceNumber(), ReceivedCount); + ++ReceivedCount; + SenderNode->Send(ev->Sender, new TEvTestResponse(m.GetSequenceNumber())); + } + + ~TReceiverActor() override { + Cerr << "Received " << ReceivedCount << " messages\n"; + } + }; + + 
Y_UNIT_TEST(InterconnectTestWithProxyUnsureUndelivered) { + ui32 numNodes = 2; + double bandWidth = 1000000; + ui16 flags = IEventHandle::FlagTrackDelivery | IEventHandle::FlagGenerateUnsureUndelivered; + TTestICCluster::TTrafficInterrupterSettings interrupterSettings{TDuration::Seconds(2), bandWidth, true}; + + TTestICCluster testCluster(numNodes, TChannelsConfig(), &interrupterSettings); + + TReceiverActor* receiverActor = new TReceiverActor(testCluster.GetNode(1)); + const TActorId recipient = testCluster.RegisterActor(receiverActor, 2); + TSenderActor* senderActor = new TSenderActor(recipient, flags); + testCluster.RegisterActor(senderActor, 1); + + NanoSleep(30ULL * 1000 * 1000 * 1000); + } + + Y_UNIT_TEST(InterconnectTestWithProxy) { + ui32 numNodes = 2; + double bandWidth = 1000000; + ui16 flags = IEventHandle::FlagTrackDelivery; + TTestICCluster::TTrafficInterrupterSettings interrupterSettings{TDuration::Seconds(2), bandWidth, true}; + + TTestICCluster testCluster(numNodes, TChannelsConfig(), &interrupterSettings); + + TReceiverActor* receiverActor = new TReceiverActor(testCluster.GetNode(1)); + const TActorId recipient = testCluster.RegisterActor(receiverActor, 2); + TSenderActor* senderActor = new TSenderActor(recipient, flags); + testCluster.RegisterActor(senderActor, 1); + + NanoSleep(30ULL * 1000 * 1000 * 1000); + } +} diff --git a/library/cpp/actors/interconnect/ut_fat/ya.make b/library/cpp/actors/interconnect/ut_fat/ya.make new file mode 100644 index 0000000000..6e58d08154 --- /dev/null +++ b/library/cpp/actors/interconnect/ut_fat/ya.make @@ -0,0 +1,25 @@ +UNITTEST() + +OWNER( + vkanaev + alexvru +) + +SIZE(LARGE) + +TAG(ya:fat) + +SRCS( + main.cpp +) + +PEERDIR( + library/cpp/actors/core + library/cpp/actors/interconnect + library/cpp/actors/interconnect/mock + library/cpp/actors/interconnect/ut/lib + library/cpp/actors/interconnect/ut/protos + library/cpp/testing/unittest +) + +END() diff --git a/library/cpp/actors/interconnect/watchdog_timer.h 
b/library/cpp/actors/interconnect/watchdog_timer.h
new file mode 100644
index 0000000000..c190105a59
--- /dev/null
+++ b/library/cpp/actors/interconnect/watchdog_timer.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <functional>
+#include <utility>
+
+namespace NActors {
+    // Watchdog driven by a scheduled event of type TEvent.
+    //
+    // Arm() schedules a TEvent for the given actor; when that event is passed to operator(),
+    // the callback is invoked unless Reset() has moved the deadline forward or Disarm() has
+    // cancelled the watchdog in the meantime.
+    template <typename TEvent>
+    class TWatchdogTimer {
+        using TCallback = std::function<void()>;
+
+        const TDuration Timeout;
+        const TCallback Callback;
+
+        TInstant LastResetTimestamp;     // time of the latest Reset()
+        TEvent* ExpectedEvent = nullptr; // the only scheduled event that is acted upon; nullptr when disarmed
+        ui32 Iteration = 0;              // how many times the expired event has been re-sent
+
+        static constexpr ui32 NumIterationsBeforeFiring = 2;
+
+    public:
+        TWatchdogTimer(TDuration timeout, TCallback callback)
+            : Timeout(timeout)
+            , Callback(std::move(callback))
+        {
+        }
+
+        // Starts the watchdog for the given actor; does nothing when Timeout is zero or TDuration::Max().
+        void Arm(const TActorIdentity& actor) {
+            if (Timeout != TDuration::Zero() && Timeout != TDuration::Max()) {
+                Schedule(Timeout, actor);
+                Reset();
+            }
+        }
+
+        // Moves the deadline to Timeout from the current moment.
+        void Reset() {
+            LastResetTimestamp = TActivationContext::Now();
+        }
+
+        // Cancels the watchdog: an already scheduled event no longer matches ExpectedEvent
+        // and is ignored by operator().
+        void Disarm() {
+            ExpectedEvent = nullptr;
+        }
+
+        // Processes a TEvent; events other than the currently expected one are ignored.
+        void operator()(typename TEvent::TPtr& ev) {
+            if (ev->Get() == ExpectedEvent) {
+                const TInstant now = TActivationContext::Now();
+                const TInstant barrier = LastResetTimestamp + Timeout;
+                if (now < barrier) {
+                    // the time hasn't come yet
+                    Schedule(barrier - now, TActorIdentity(ev->Recipient));
+                } else if (Iteration < NumIterationsBeforeFiring) {
+                    // time has come, but we will still give actor a chance to process some messages and rearm timer
+                    ++Iteration;
+                    TActivationContext::Send(ev.Release()); // send this event into queue once more
+                } else {
+                    // no chance to disarm, fire callback; return to the disarmed state first, so that
+                    // an Arm() issued from inside the callback is not overwritten afterwards
+                    ExpectedEvent = nullptr;
+                    Iteration = 0;
+                    Callback();
+                }
+            }
+        }
+
+    private:
+        // Schedules a fresh TEvent for the actor and makes it the expected one.
+        void Schedule(TDuration timeout, const TActorIdentity& actor) {
+            auto ev = MakeHolder<TEvent>();
+            ExpectedEvent = ev.Get();
+            Iteration = 0;
+            actor.Schedule(timeout, ev.Release());
+        }
+    };
+
+}
diff --git a/library/cpp/actors/interconnect/ya.make b/library/cpp/actors/interconnect/ya.make
new file mode 100644
index 0000000000..60d29b0fc0
--- /dev/null
+++
b/library/cpp/actors/interconnect/ya.make
@@ -0,0 +1,94 @@
+# Build description of the interconnect library.
+LIBRARY()
+
+OWNER(
+    ddoarn
+    alexvru
+    g:kikimr
+)
+
+NO_WSHADOW()
+
+# Forward the PROFILE_MEMORY_ALLOCATIONS build flag to the compiler as a preprocessor define.
+IF (PROFILE_MEMORY_ALLOCATIONS)
+    CFLAGS(-DPROFILE_MEMORY_ALLOCATIONS)
+ENDIF()
+
+SRCS(
+    channel_scheduler.h
+    event_filter.h
+    event_holder_pool.h
+    events_local.h
+    interconnect_address.cpp
+    interconnect_address.h
+    interconnect_channel.cpp
+    interconnect_channel.h
+    interconnect_common.h
+    interconnect_counters.cpp
+    interconnect.h
+    interconnect_handshake.cpp
+    interconnect_handshake.h
+    interconnect_impl.h
+    interconnect_mon.cpp
+    interconnect_mon.h
+    interconnect_nameserver_dynamic.cpp
+    interconnect_nameserver_table.cpp
+    interconnect_proxy_wrapper.cpp
+    interconnect_proxy_wrapper.h
+    interconnect_resolve.cpp
+    interconnect_stream.cpp
+    interconnect_stream.h
+    interconnect_tcp_input_session.cpp
+    interconnect_tcp_proxy.cpp
+    interconnect_tcp_proxy.h
+    interconnect_tcp_server.cpp
+    interconnect_tcp_server.h
+    interconnect_tcp_session.cpp
+    interconnect_tcp_session.h
+    load.cpp
+    load.h
+    logging.h
+    packet.cpp
+    packet.h
+    poller_actor.cpp
+    poller_actor.h
+    poller.h
+    poller_tcp.cpp
+    poller_tcp.h
+    poller_tcp_unit.cpp
+    poller_tcp_unit.h
+    poller_tcp_unit_select.cpp
+    poller_tcp_unit_select.h
+    profiler.h
+    slowpoke_actor.h
+    types.cpp
+    types.h
+    watchdog_timer.h
+)
+
+# The epoll-based poller unit is added to the sources on Linux only.
+IF (OS_LINUX)
+    SRCS(
+        poller_tcp_unit_epoll.cpp
+        poller_tcp_unit_epoll.h
+    )
+ENDIF()
+
+PEERDIR(
+    contrib/libs/libc_compat
+    contrib/libs/openssl
+    library/cpp/actors/core
+    library/cpp/actors/dnscachelib
+    library/cpp/actors/dnsresolver
+    library/cpp/actors/helpers
+    library/cpp/actors/prof
+    library/cpp/actors/protos
+    library/cpp/actors/util
+    library/cpp/digest/crc32c
+    library/cpp/json
+    library/cpp/lwtrace
+    library/cpp/monlib/dynamic_counters
+    library/cpp/monlib/metrics
+    library/cpp/monlib/service/pages/tablesorter
+    library/cpp/openssl/init
+    library/cpp/packedtypes
+)
+
+END()
|