aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/actors/interconnect
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/actors/interconnect
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/actors/interconnect')
-rw-r--r--library/cpp/actors/interconnect/channel_scheduler.h120
-rw-r--r--library/cpp/actors/interconnect/event_filter.h72
-rw-r--r--library/cpp/actors/interconnect/event_holder_pool.h128
-rw-r--r--library/cpp/actors/interconnect/events_local.h403
-rw-r--r--library/cpp/actors/interconnect/interconnect.h179
-rw-r--r--library/cpp/actors/interconnect/interconnect_address.cpp94
-rw-r--r--library/cpp/actors/interconnect/interconnect_address.h29
-rw-r--r--library/cpp/actors/interconnect/interconnect_channel.cpp176
-rw-r--r--library/cpp/actors/interconnect/interconnect_channel.h127
-rw-r--r--library/cpp/actors/interconnect/interconnect_common.h106
-rw-r--r--library/cpp/actors/interconnect/interconnect_counters.cpp692
-rw-r--r--library/cpp/actors/interconnect/interconnect_counters.h59
-rw-r--r--library/cpp/actors/interconnect/interconnect_handshake.cpp995
-rw-r--r--library/cpp/actors/interconnect/interconnect_handshake.h24
-rw-r--r--library/cpp/actors/interconnect/interconnect_impl.h45
-rw-r--r--library/cpp/actors/interconnect/interconnect_mon.cpp276
-rw-r--r--library/cpp/actors/interconnect/interconnect_mon.h15
-rw-r--r--library/cpp/actors/interconnect/interconnect_nameserver_base.h83
-rw-r--r--library/cpp/actors/interconnect/interconnect_nameserver_dynamic.cpp178
-rw-r--r--library/cpp/actors/interconnect/interconnect_nameserver_table.cpp86
-rw-r--r--library/cpp/actors/interconnect/interconnect_proxy_wrapper.cpp47
-rw-r--r--library/cpp/actors/interconnect/interconnect_proxy_wrapper.h12
-rw-r--r--library/cpp/actors/interconnect/interconnect_resolve.cpp174
-rw-r--r--library/cpp/actors/interconnect/interconnect_stream.cpp628
-rw-r--r--library/cpp/actors/interconnect/interconnect_stream.h131
-rw-r--r--library/cpp/actors/interconnect/interconnect_tcp_input_session.cpp476
-rw-r--r--library/cpp/actors/interconnect/interconnect_tcp_proxy.cpp936
-rw-r--r--library/cpp/actors/interconnect/interconnect_tcp_proxy.h537
-rw-r--r--library/cpp/actors/interconnect/interconnect_tcp_server.cpp117
-rw-r--r--library/cpp/actors/interconnect/interconnect_tcp_server.h57
-rw-r--r--library/cpp/actors/interconnect/interconnect_tcp_session.cpp1228
-rw-r--r--library/cpp/actors/interconnect/interconnect_tcp_session.h565
-rw-r--r--library/cpp/actors/interconnect/load.cpp405
-rw-r--r--library/cpp/actors/interconnect/load.h24
-rw-r--r--library/cpp/actors/interconnect/logging.h68
-rw-r--r--library/cpp/actors/interconnect/mock/ic_mock.cpp298
-rw-r--r--library/cpp/actors/interconnect/mock/ic_mock.h19
-rw-r--r--library/cpp/actors/interconnect/mock/tsan.supp1
-rw-r--r--library/cpp/actors/interconnect/mock/ya.make16
-rw-r--r--library/cpp/actors/interconnect/packet.cpp32
-rw-r--r--library/cpp/actors/interconnect/packet.h324
-rw-r--r--library/cpp/actors/interconnect/poller.h23
-rw-r--r--library/cpp/actors/interconnect/poller_actor.cpp294
-rw-r--r--library/cpp/actors/interconnect/poller_actor.h63
-rw-r--r--library/cpp/actors/interconnect/poller_actor_darwin.h95
-rw-r--r--library/cpp/actors/interconnect/poller_actor_linux.h114
-rw-r--r--library/cpp/actors/interconnect/poller_actor_win.h103
-rw-r--r--library/cpp/actors/interconnect/poller_tcp.cpp35
-rw-r--r--library/cpp/actors/interconnect/poller_tcp.h25
-rw-r--r--library/cpp/actors/interconnect/poller_tcp_unit.cpp126
-rw-r--r--library/cpp/actors/interconnect/poller_tcp_unit.h67
-rw-r--r--library/cpp/actors/interconnect/poller_tcp_unit_epoll.cpp125
-rw-r--r--library/cpp/actors/interconnect/poller_tcp_unit_epoll.h33
-rw-r--r--library/cpp/actors/interconnect/poller_tcp_unit_select.cpp86
-rw-r--r--library/cpp/actors/interconnect/poller_tcp_unit_select.h19
-rw-r--r--library/cpp/actors/interconnect/profiler.h142
-rw-r--r--library/cpp/actors/interconnect/slowpoke_actor.h47
-rw-r--r--library/cpp/actors/interconnect/types.cpp564
-rw-r--r--library/cpp/actors/interconnect/types.h43
-rw-r--r--library/cpp/actors/interconnect/ut/channel_scheduler_ut.cpp115
-rw-r--r--library/cpp/actors/interconnect/ut/dynamic_proxy_ut.cpp179
-rw-r--r--library/cpp/actors/interconnect/ut/event_holder_pool_ut.cpp59
-rw-r--r--library/cpp/actors/interconnect/ut/interconnect_ut.cpp177
-rw-r--r--library/cpp/actors/interconnect/ut/large.cpp85
-rw-r--r--library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h84
-rw-r--r--library/cpp/actors/interconnect/ut/lib/interrupter.h249
-rw-r--r--library/cpp/actors/interconnect/ut/lib/node.h137
-rw-r--r--library/cpp/actors/interconnect/ut/lib/test_actors.h83
-rw-r--r--library/cpp/actors/interconnect/ut/lib/test_events.h49
-rw-r--r--library/cpp/actors/interconnect/ut/lib/ya.make12
-rw-r--r--library/cpp/actors/interconnect/ut/poller_actor_ut.cpp264
-rw-r--r--library/cpp/actors/interconnect/ut/protos/interconnect_test.proto25
-rw-r--r--library/cpp/actors/interconnect/ut/protos/ya.make11
-rw-r--r--library/cpp/actors/interconnect/ut/ya.make36
-rw-r--r--library/cpp/actors/interconnect/ut_fat/main.cpp133
-rw-r--r--library/cpp/actors/interconnect/ut_fat/ya.make25
-rw-r--r--library/cpp/actors/interconnect/watchdog_timer.h68
-rw-r--r--library/cpp/actors/interconnect/ya.make94
78 files changed, 13871 insertions, 0 deletions
diff --git a/library/cpp/actors/interconnect/channel_scheduler.h b/library/cpp/actors/interconnect/channel_scheduler.h
new file mode 100644
index 0000000000..551a4cb61a
--- /dev/null
+++ b/library/cpp/actors/interconnect/channel_scheduler.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include "interconnect_channel.h"
+#include "event_holder_pool.h"
+
+#include <memory>
+
+namespace NActors {
+
+ class TChannelScheduler {
+ const ui32 PeerNodeId;
+ std::array<std::optional<TEventOutputChannel>, 16> ChannelArray;
+ THashMap<ui16, TEventOutputChannel> ChannelMap;
+ std::shared_ptr<IInterconnectMetrics> Metrics;
+ TEventHolderPool& Pool;
+ const ui32 MaxSerializedEventSize;
+ const TSessionParams Params;
+
+ struct THeapItem {
+ TEventOutputChannel *Channel;
+ ui64 WeightConsumed = 0;
+
+ friend bool operator <(const THeapItem& x, const THeapItem& y) {
+ return x.WeightConsumed > y.WeightConsumed;
+ }
+ };
+
+ std::vector<THeapItem> Heap;
+
+ public:
+ TChannelScheduler(ui32 peerNodeId, const TChannelsConfig& predefinedChannels,
+ std::shared_ptr<IInterconnectMetrics> metrics, TEventHolderPool& pool, ui32 maxSerializedEventSize,
+ TSessionParams params)
+ : PeerNodeId(peerNodeId)
+ , Metrics(std::move(metrics))
+ , Pool(pool)
+ , MaxSerializedEventSize(maxSerializedEventSize)
+ , Params(std::move(params))
+ {
+ for (const auto& item : predefinedChannels) {
+ GetOutputChannel(item.first);
+ }
+ }
+
+ TEventOutputChannel *PickChannelWithLeastConsumedWeight() {
+ Y_VERIFY(!Heap.empty());
+ return Heap.front().Channel;
+ }
+
+ void AddToHeap(TEventOutputChannel& channel, ui64 counter) {
+ if (channel.IsWorking()) {
+ ui64 weight = channel.WeightConsumedOnPause;
+ weight -= Min(weight, counter - channel.EqualizeCounterOnPause);
+ Heap.push_back(THeapItem{&channel, weight});
+ std::push_heap(Heap.begin(), Heap.end());
+ }
+ }
+
+ void FinishPick(ui64 weightConsumed, ui64 counter) {
+ std::pop_heap(Heap.begin(), Heap.end());
+ auto& item = Heap.back();
+ item.WeightConsumed += weightConsumed;
+ if (item.Channel->IsWorking()) { // reschedule
+ std::push_heap(Heap.begin(), Heap.end());
+ } else { // remove from heap
+ item.Channel->EqualizeCounterOnPause = counter;
+ item.Channel->WeightConsumedOnPause = item.WeightConsumed;
+ Heap.pop_back();
+ }
+ }
+
+ TEventOutputChannel& GetOutputChannel(ui16 channel) {
+ if (channel < ChannelArray.size()) {
+ auto& res = ChannelArray[channel];
+ if (Y_UNLIKELY(!res)) {
+ res.emplace(Pool, channel, PeerNodeId, MaxSerializedEventSize, Metrics,
+ Params);
+ }
+ return *res;
+ } else {
+ auto it = ChannelMap.find(channel);
+ if (Y_UNLIKELY(it == ChannelMap.end())) {
+ it = ChannelMap.emplace(std::piecewise_construct, std::forward_as_tuple(channel),
+ std::forward_as_tuple(Pool, channel, PeerNodeId, MaxSerializedEventSize,
+ Metrics, Params)).first;
+ }
+ return it->second;
+ }
+ }
+
+ ui64 Equalize() {
+ if (Heap.empty()) {
+ return 0; // nothing to do here -- no working channels
+ }
+
+ // find the minimum consumed weight among working channels and then adjust weights
+ ui64 min = Max<ui64>();
+ for (THeapItem& item : Heap) {
+ min = Min(min, item.WeightConsumed);
+ }
+ for (THeapItem& item : Heap) {
+ item.WeightConsumed -= min;
+ }
+ return min;
+ }
+
+ template<typename TCallback>
+ void ForEach(TCallback&& callback) {
+ for (auto& channel : ChannelArray) {
+ if (channel) {
+ callback(*channel);
+ }
+ }
+ for (auto& [id, channel] : ChannelMap) {
+ callback(channel);
+ }
+ }
+ };
+
+} // NActors
diff --git a/library/cpp/actors/interconnect/event_filter.h b/library/cpp/actors/interconnect/event_filter.h
new file mode 100644
index 0000000000..47dabf5f16
--- /dev/null
+++ b/library/cpp/actors/interconnect/event_filter.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <library/cpp/actors/core/event.h>
+
+namespace NActors {
+
+ enum class ENodeClass {
+ SYSTEM,
+ LOCAL_TENANT,
+ PEER_TENANT,
+ COUNT
+ };
+
+ class TEventFilter : TNonCopyable {
+ using TRouteMask = ui16;
+
+ TVector<TVector<TRouteMask>> ScopeRoutes;
+
+ public:
+ TEventFilter()
+ : ScopeRoutes(65536)
+ {}
+
+ void RegisterEvent(ui32 type, TRouteMask routes) {
+ auto& evSpaceIndex = ScopeRoutes[type >> 16];
+ const ui16 subtype = type & 65535;
+ size_t size = (subtype + 512) & ~511;
+ if (evSpaceIndex.size() < size) {
+ evSpaceIndex.resize(size);
+ }
+ evSpaceIndex[subtype] = routes;
+ }
+
+ bool CheckIncomingEvent(const IEventHandle& ev, const TScopeId& localScopeId) const {
+ TRouteMask routes = 0;
+ if (const auto& evSpaceIndex = ScopeRoutes[ev.Type >> 16]) {
+ const ui16 subtype = ev.Type & 65535;
+ routes = subtype < evSpaceIndex.size() ? evSpaceIndex[subtype] : 0;
+ } else {
+ routes = ~TRouteMask(); // allow unfilled event spaces by default
+ }
+ return routes & MakeRouteMask(GetNodeClass(ev.OriginScopeId, localScopeId), GetNodeClass(localScopeId, ev.OriginScopeId));
+ }
+
+ static ENodeClass GetNodeClass(const TScopeId& scopeId, const TScopeId& localScopeId) {
+ if (scopeId.first == 0) {
+ // system scope, or null scope
+ return scopeId.second ? ENodeClass::SYSTEM : ENodeClass::COUNT;
+ } else if (scopeId == localScopeId) {
+ return ENodeClass::LOCAL_TENANT;
+ } else {
+ return ENodeClass::PEER_TENANT;
+ }
+ }
+
+ static TRouteMask MakeRouteMask(ENodeClass from, ENodeClass to) {
+ if (from == ENodeClass::COUNT || to == ENodeClass::COUNT) {
+ return 0;
+ }
+ return 1U << (static_cast<unsigned>(from) * static_cast<unsigned>(ENodeClass::COUNT) + static_cast<unsigned>(to));
+ }
+
+ static TRouteMask MakeRouteMask(std::initializer_list<std::pair<ENodeClass, ENodeClass>> items) {
+ TRouteMask mask = 0;
+ for (const auto& p : items) {
+ mask |= MakeRouteMask(p.first, p.second);
+ }
+ return mask;
+ }
+ };
+
+} // NActors
diff --git a/library/cpp/actors/interconnect/event_holder_pool.h b/library/cpp/actors/interconnect/event_holder_pool.h
new file mode 100644
index 0000000000..b6090a3bc8
--- /dev/null
+++ b/library/cpp/actors/interconnect/event_holder_pool.h
@@ -0,0 +1,128 @@
+#pragma once
+
+#include <library/cpp/containers/stack_vector/stack_vec.h>
+
+#include "packet.h"
+
+namespace NActors {
+ struct TEvFreeItems : TEventLocal<TEvFreeItems, EventSpaceBegin(TEvents::ES_PRIVATE)> {
+ static constexpr size_t MaxEvents = 256;
+
+ TList<TTcpPacketOutTask> Items;
+ std::list<TEventHolder> FreeQueue;
+ TStackVec<THolder<IEventBase>, MaxEvents> Events;
+ TStackVec<THolder<TEventSerializedData>, MaxEvents> Buffers;
+ std::shared_ptr<std::atomic<TAtomicBase>> Counter;
+ ui64 NumBytes = 0;
+
+ ~TEvFreeItems() {
+ if (Counter) {
+ TAtomicBase res = Counter->fetch_sub(NumBytes) - NumBytes;
+ Y_VERIFY(res >= 0);
+ }
+ }
+
+ bool GetInLineForDestruction(const TIntrusivePtr<TInterconnectProxyCommon>& common) {
+ Y_VERIFY(!Counter);
+ const auto& counter = common->DestructorQueueSize;
+ const auto& max = common->MaxDestructorQueueSize;
+ if (counter && (TAtomicBase)(counter->fetch_add(NumBytes) + NumBytes) > max) {
+ counter->fetch_sub(NumBytes);
+ return false;
+ }
+ Counter = counter;
+ return true;
+ }
+ };
+
+ class TEventHolderPool {
+ using TDestroyCallback = std::function<void(THolder<IEventBase>)>;
+
+ static constexpr size_t MaxFreeQueueItems = 32;
+ static constexpr size_t FreeQueueTrimThreshold = MaxFreeQueueItems * 2;
+ static constexpr ui64 MaxBytesPerMessage = 10 * 1024 * 1024;
+
+ TIntrusivePtr<TInterconnectProxyCommon> Common;
+ std::list<TEventHolder> Cache;
+ THolder<TEvFreeItems> PendingFreeEvent;
+ TDestroyCallback DestroyCallback;
+
+ public:
+ TEventHolderPool(TIntrusivePtr<TInterconnectProxyCommon> common,
+ TDestroyCallback destroyCallback)
+ : Common(std::move(common))
+ , DestroyCallback(std::move(destroyCallback))
+ {}
+
+ TEventHolder& Allocate(std::list<TEventHolder>& queue) {
+ if (Cache.empty()) {
+ queue.emplace_back();
+ } else {
+ queue.splice(queue.end(), Cache, Cache.begin());
+ }
+ return queue.back();
+ }
+
+ void Release(std::list<TEventHolder>& queue) {
+ for (auto it = queue.begin(); it != queue.end(); ) {
+ Release(queue, it++);
+ }
+ }
+
+ void Release(std::list<TEventHolder>& queue, std::list<TEventHolder>::iterator event) {
+ bool trim = false;
+
+ // release held event, if any
+ if (THolder<IEventBase> ev = std::move(event->Event)) {
+ auto p = GetPendingEvent();
+ p->NumBytes += event->EventSerializedSize;
+ auto& events = p->Events;
+ events.push_back(std::move(ev));
+ trim = trim || events.size() >= TEvFreeItems::MaxEvents || p->NumBytes >= MaxBytesPerMessage;
+ }
+
+ // release buffer, if any
+ if (event->Buffer && event->Buffer.RefCount() == 1) {
+ auto p = GetPendingEvent();
+ p->NumBytes += event->EventSerializedSize;
+ auto& buffers = p->Buffers;
+ buffers.emplace_back(event->Buffer.Release());
+ trim = trim || buffers.size() >= TEvFreeItems::MaxEvents || p->NumBytes >= MaxBytesPerMessage;
+ }
+
+ // free event and trim the cache if its size is exceeded
+ event->Clear();
+ Cache.splice(Cache.end(), queue, event);
+ if (Cache.size() >= FreeQueueTrimThreshold) {
+ auto& freeQueue = GetPendingEvent()->FreeQueue;
+ auto it = Cache.begin();
+ std::advance(it, Cache.size() - MaxFreeQueueItems);
+ freeQueue.splice(freeQueue.end(), Cache, Cache.begin(), it);
+ trim = true;
+ }
+
+ // release items if we have hit the limit
+ if (trim) {
+ Trim();
+ }
+ }
+
+ void Trim() {
+ if (auto ev = std::move(PendingFreeEvent); ev && ev->GetInLineForDestruction(Common)) {
+ DestroyCallback(std::move(ev));
+ }
+
+ // ensure it is dropped
+ PendingFreeEvent.Reset();
+ }
+
+ private:
+ TEvFreeItems* GetPendingEvent() {
+ if (!PendingFreeEvent) {
+ PendingFreeEvent.Reset(new TEvFreeItems);
+ }
+ return PendingFreeEvent.Get();
+ }
+ };
+
+}
diff --git a/library/cpp/actors/interconnect/events_local.h b/library/cpp/actors/interconnect/events_local.h
new file mode 100644
index 0000000000..8a46ffd535
--- /dev/null
+++ b/library/cpp/actors/interconnect/events_local.h
@@ -0,0 +1,403 @@
+#pragma once
+
+#include <library/cpp/actors/core/events.h>
+#include <library/cpp/actors/core/event_local.h>
+#include <library/cpp/actors/protos/interconnect.pb.h>
+#include <util/generic/deque.h>
+#include <util/network/address.h>
+
+#include "interconnect_stream.h"
+#include "packet.h"
+#include "types.h"
+
+namespace NActors {
+ struct TProgramInfo {
+ ui64 PID = 0;
+ ui64 StartTime = 0;
+ ui64 Serial = 0;
+ };
+
+ enum class ENetwork : ui32 {
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // local messages
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ Start = EventSpaceBegin(TEvents::ES_INTERCONNECT_TCP),
+
+ SocketReadyRead = Start,
+ SocketReadyWrite,
+ SocketError,
+ Connect,
+ Disconnect,
+ IncomingConnection,
+ HandshakeAsk,
+ HandshakeAck,
+ HandshakeNak,
+ HandshakeDone,
+ HandshakeFail,
+ Kick,
+ Flush,
+ NodeInfo,
+ BunchOfEventsToDestroy,
+ HandshakeRequest,
+ HandshakeReplyOK,
+ HandshakeReplyError,
+ ResolveAddress,
+ AddressInfo,
+ ResolveError,
+ HTTPStreamStatus,
+ HTTPSendContent,
+ ConnectProtocolWakeup,
+ HTTPProtocolRetry,
+ EvPollerRegister,
+ EvPollerRegisterResult,
+ EvPollerReady,
+ EvUpdateFromInputSession,
+ EvConfirmUpdate,
+ EvSessionBufferSizeRequest,
+ EvSessionBufferSizeResponse,
+ EvProcessPingRequest,
+ EvGetSecureSocket,
+ EvSecureSocket,
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // nonlocal messages; their indices must be preserved in order to work properly while doing rolling update
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ // interconnect load test message
+ EvLoadMessage = Start + 256,
+ };
+
+ struct TEvSocketReadyRead: public TEventLocal<TEvSocketReadyRead, ui32(ENetwork::SocketReadyRead)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvSocketReadyRead, "Network: TEvSocketReadyRead")
+ };
+
+ struct TEvSocketReadyWrite: public TEventLocal<TEvSocketReadyWrite, ui32(ENetwork::SocketReadyWrite)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvSocketReadyWrite, "Network: TEvSocketReadyWrite")
+ };
+
+ struct TEvSocketError: public TEventLocal<TEvSocketError, ui32(ENetwork::SocketError)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvSocketError, ::strerror(Error))
+ TString GetReason() const {
+ return ::strerror(Error);
+ }
+ const int Error;
+ TIntrusivePtr<NInterconnect::TStreamSocket> Socket;
+
+ TEvSocketError(int error, TIntrusivePtr<NInterconnect::TStreamSocket> sock)
+ : Error(error)
+ , Socket(std::move(sock))
+ {
+ }
+ };
+
+ struct TEvSocketConnect: public TEventLocal<TEvSocketConnect, ui32(ENetwork::Connect)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvSocketConnect, "Network: TEvSocketConnect")
+ };
+
+ struct TEvSocketDisconnect: public TEventLocal<TEvSocketDisconnect, ui32(ENetwork::Disconnect)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvSocketDisconnect, "Network: TEvSocketDisconnect")
+ TDisconnectReason Reason;
+
+ TEvSocketDisconnect(TDisconnectReason reason)
+ : Reason(std::move(reason))
+ {
+ }
+ };
+
+ struct TEvHandshakeAsk: public TEventLocal<TEvHandshakeAsk, ui32(ENetwork::HandshakeAsk)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeAsk, "Network: TEvHandshakeAsk")
+ TEvHandshakeAsk(const TActorId& self,
+ const TActorId& peer,
+ ui64 counter)
+ : Self(self)
+ , Peer(peer)
+ , Counter(counter)
+ {
+ }
+ const TActorId Self;
+ const TActorId Peer;
+ const ui64 Counter;
+ };
+
+ struct TEvHandshakeAck: public TEventLocal<TEvHandshakeAck, ui32(ENetwork::HandshakeAck)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeAck, "Network: TEvHandshakeAck")
+
+ TEvHandshakeAck(const TActorId& self, ui64 nextPacket, TSessionParams params)
+ : Self(self)
+ , NextPacket(nextPacket)
+ , Params(std::move(params))
+ {}
+
+ const TActorId Self;
+ const ui64 NextPacket;
+ const TSessionParams Params;
+ };
+
+    struct TEvHandshakeNak : TEventLocal<TEvHandshakeNak, ui32(ENetwork::HandshakeNak)> {
+        DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeNak, "Network: TEvHandshakeNak") // was TEvSocketReadyRead: copy-paste slip; macro must name this event type like every sibling event here
+    };
+
+ struct TEvHandshakeRequest
+ : public TEventLocal<TEvHandshakeRequest,
+ ui32(ENetwork::HandshakeRequest)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeRequest,
+ "Network: TEvHandshakeRequest")
+
+ NActorsInterconnect::THandshakeRequest Record;
+ };
+
+ struct TEvHandshakeReplyOK
+ : public TEventLocal<TEvHandshakeReplyOK,
+ ui32(ENetwork::HandshakeReplyOK)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeReplyOK,
+ "Network: TEvHandshakeReplyOK")
+
+ NActorsInterconnect::THandshakeReply Record;
+ };
+
+ struct TEvHandshakeReplyError
+ : public TEventLocal<TEvHandshakeReplyError,
+ ui32(ENetwork::HandshakeReplyError)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeReplyError,
+ "Network: TEvHandshakeReplyError")
+
+ TEvHandshakeReplyError(TString error) {
+ Record.SetErrorExplaination(error);
+ }
+
+ NActorsInterconnect::THandshakeReply Record;
+ };
+
+ struct TEvIncomingConnection: public TEventLocal<TEvIncomingConnection, ui32(ENetwork::IncomingConnection)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvIncomingConnection, "Network: TEvIncomingConnection")
+ TIntrusivePtr<NInterconnect::TStreamSocket> Socket;
+ NInterconnect::TAddress Address;
+
+ TEvIncomingConnection(TIntrusivePtr<NInterconnect::TStreamSocket> socket, NInterconnect::TAddress address)
+ : Socket(std::move(socket))
+ , Address(std::move(address))
+ {}
+ };
+
+ struct TEvHandshakeDone: public TEventLocal<TEvHandshakeDone, ui32(ENetwork::HandshakeDone)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeDone, "Network: TEvHandshakeDone")
+
+ TEvHandshakeDone(
+ TIntrusivePtr<NInterconnect::TStreamSocket> socket,
+ const TActorId& peer,
+ const TActorId& self,
+ ui64 nextPacket,
+ TAutoPtr<TProgramInfo>&& programInfo,
+ TSessionParams params)
+ : Socket(std::move(socket))
+ , Peer(peer)
+ , Self(self)
+ , NextPacket(nextPacket)
+ , ProgramInfo(std::move(programInfo))
+ , Params(std::move(params))
+ {
+ }
+
+ TIntrusivePtr<NInterconnect::TStreamSocket> Socket;
+ const TActorId Peer;
+ const TActorId Self;
+ const ui64 NextPacket;
+ TAutoPtr<TProgramInfo> ProgramInfo;
+ const TSessionParams Params;
+ };
+
+ struct TEvHandshakeFail: public TEventLocal<TEvHandshakeFail, ui32(ENetwork::HandshakeFail)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvHandshakeFail, "Network: TEvHandshakeFail")
+
+ enum EnumHandshakeFail {
+ HANDSHAKE_FAIL_TRANSIENT,
+ HANDSHAKE_FAIL_PERMANENT,
+ HANDSHAKE_FAIL_SESSION_MISMATCH,
+ };
+
+ TEvHandshakeFail(EnumHandshakeFail temporary, TString explanation)
+ : Temporary(temporary)
+ , Explanation(std::move(explanation))
+ {
+ }
+
+ const EnumHandshakeFail Temporary;
+ const TString Explanation;
+ };
+
+ struct TEvKick: public TEventLocal<TEvKick, ui32(ENetwork::Kick)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvKick, "Network: TEvKick")
+ };
+
+ struct TEvFlush: public TEventLocal<TEvFlush, ui32(ENetwork::Flush)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvFlush, "Network: TEvFlush")
+ };
+
+ struct TEvLocalNodeInfo
+ : public TEventLocal<TEvLocalNodeInfo, ui32(ENetwork::NodeInfo)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvLocalNodeInfo, "Network: TEvLocalNodeInfo")
+
+ ui32 NodeId;
+ NAddr::IRemoteAddrPtr Address;
+ };
+
+ struct TEvBunchOfEventsToDestroy : TEventLocal<TEvBunchOfEventsToDestroy, ui32(ENetwork::BunchOfEventsToDestroy)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvBunchOfEventsToDestroy,
+ "Network: TEvBunchOfEventsToDestroy")
+
+ TEvBunchOfEventsToDestroy(TDeque<TAutoPtr<IEventBase>> events)
+ : Events(std::move(events))
+ {
+ }
+
+ TDeque<TAutoPtr<IEventBase>> Events;
+ };
+
+ struct TEvResolveAddress
+ : public TEventLocal<TEvResolveAddress, ui32(ENetwork::ResolveAddress)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvResolveAddress, "Network: TEvResolveAddress")
+
+ TString Address;
+ ui16 Port;
+ };
+
+ struct TEvAddressInfo
+ : public TEventLocal<TEvAddressInfo, ui32(ENetwork::AddressInfo)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvAddressInfo, "Network: TEvAddressInfo")
+
+ NAddr::IRemoteAddrPtr Address;
+ };
+
+ struct TEvResolveError
+ : public TEventLocal<TEvResolveError, ui32(ENetwork::ResolveError)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvResolveError, "Network: TEvResolveError")
+
+ TString Explain;
+ };
+
+ struct TEvHTTPStreamStatus
+ : public TEventLocal<TEvHTTPStreamStatus, ui32(ENetwork::HTTPStreamStatus)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvHTTPStreamStatus,
+ "Network: TEvHTTPStreamStatus")
+ enum EStatus {
+ READY,
+ COMPLETE,
+ ERROR,
+ };
+
+ EStatus Status;
+ TString Error;
+ TString HttpHeaders;
+ };
+
+ struct TEvHTTPSendContent
+ : public TEventLocal<TEvHTTPSendContent, ui32(ENetwork::HTTPSendContent)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvHTTPSendContent, "Network: TEvHTTPSendContent")
+
+ const char* Data;
+ size_t Len;
+ bool Last;
+ };
+
+ struct TEvConnectWakeup
+ : public TEventLocal<TEvConnectWakeup,
+ ui32(ENetwork::ConnectProtocolWakeup)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvConnectWakeup, "Protocols: TEvConnectWakeup")
+ };
+
+ struct TEvHTTPProtocolRetry
+ : public TEventLocal<TEvHTTPProtocolRetry,
+ ui32(ENetwork::HTTPProtocolRetry)> {
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvHTTPProtocolRetry,
+ "Protocols: TEvHTTPProtocolRetry")
+ };
+
+ struct TEvLoadMessage
+ : TEventPB<TEvLoadMessage, NActorsInterconnect::TEvLoadMessage, static_cast<ui32>(ENetwork::EvLoadMessage)> {
+ TEvLoadMessage() = default;
+
+ template <typename TContainer>
+ TEvLoadMessage(const TContainer& route, const TString& id, const TString* payload) {
+ for (const TActorId& actorId : route) {
+ auto* hop = Record.AddHops();
+ if (actorId) {
+ ActorIdToProto(actorId, hop->MutableNextHop());
+ }
+ }
+ Record.SetId(id);
+ if (payload) {
+ Record.SetPayload(*payload);
+ }
+ }
+
+ template <typename TContainer>
+ TEvLoadMessage(const TContainer& route, const TString& id, TRope&& payload) {
+ for (const TActorId& actorId : route) {
+ auto* hop = Record.AddHops();
+ if (actorId) {
+ ActorIdToProto(actorId, hop->MutableNextHop());
+ }
+ }
+ Record.SetId(id);
+ AddPayload(std::move(payload));
+ }
+ };
+
+ struct TEvUpdateFromInputSession : TEventLocal<TEvUpdateFromInputSession, static_cast<ui32>(ENetwork::EvUpdateFromInputSession)> {
+ ui64 ConfirmedByInput; // latest Confirm value from processed input packet
+ ui64 NumDataBytes;
+ TDuration Ping;
+
+ TEvUpdateFromInputSession(ui64 confirmedByInput, ui64 numDataBytes, TDuration ping)
+ : ConfirmedByInput(confirmedByInput)
+ , NumDataBytes(numDataBytes)
+ , Ping(ping)
+ {
+ }
+ };
+
+ struct TEvConfirmUpdate : TEventLocal<TEvConfirmUpdate, static_cast<ui32>(ENetwork::EvConfirmUpdate)>
+ {};
+
+ struct TEvSessionBufferSizeRequest : TEventLocal<TEvSessionBufferSizeRequest, static_cast<ui32>(ENetwork::EvSessionBufferSizeRequest)> {
+ //DEFINE_SIMPLE_LOCAL_EVENT(TEvSessionBufferSizeRequest, "Session: TEvSessionBufferSizeRequest")
+ DEFINE_SIMPLE_LOCAL_EVENT(TEvSessionBufferSizeRequest, "Network: TEvSessionBufferSizeRequest");
+ };
+
+ struct TEvSessionBufferSizeResponse : TEventLocal<TEvSessionBufferSizeResponse, static_cast<ui32>(ENetwork::EvSessionBufferSizeResponse)> {
+ TEvSessionBufferSizeResponse(const TActorId& sessionId, ui64 outputBufferSize)
+ : SessionID(sessionId)
+ , BufferSize(outputBufferSize)
+ {
+ }
+
+ TActorId SessionID;
+ ui64 BufferSize;
+ };
+
+ struct TEvProcessPingRequest : TEventLocal<TEvProcessPingRequest, static_cast<ui32>(ENetwork::EvProcessPingRequest)> {
+ const ui64 Payload;
+
+ TEvProcessPingRequest(ui64 payload)
+ : Payload(payload)
+ {}
+ };
+
+ struct TEvGetSecureSocket : TEventLocal<TEvGetSecureSocket, (ui32)ENetwork::EvGetSecureSocket> {
+ TIntrusivePtr<NInterconnect::TStreamSocket> Socket;
+
+ TEvGetSecureSocket(TIntrusivePtr<NInterconnect::TStreamSocket> socket)
+ : Socket(std::move(socket))
+ {}
+ };
+
+ struct TEvSecureSocket : TEventLocal<TEvSecureSocket, (ui32)ENetwork::EvSecureSocket> {
+ TIntrusivePtr<NInterconnect::TSecureSocket> Socket;
+
+ TEvSecureSocket(TIntrusivePtr<NInterconnect::TSecureSocket> socket)
+ : Socket(std::move(socket))
+ {}
+ };
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect.h b/library/cpp/actors/interconnect/interconnect.h
new file mode 100644
index 0000000000..225a5243fd
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect.h
@@ -0,0 +1,179 @@
+#pragma once
+
+#include <library/cpp/actors/core/actorsystem.h>
+#include <library/cpp/actors/core/interconnect.h>
+#include <util/generic/map.h>
+#include <util/network/address.h>
+
+namespace NActors {
+ struct TInterconnectGlobalState: public TThrRefBase {
+ TString SelfAddress;
+ ui32 SelfPort;
+
+ TVector<TActorId> GlobalNameservers; // todo: add some info about (like expected reply time)
+ };
+
+ struct TInterconnectProxySetup: public TThrRefBase {
+ // synchronous (session -> proxy)
+ struct IProxy : TNonCopyable {
+ virtual ~IProxy() {
+ }
+
+ virtual void ActivateSession(const TActorContext& ctx) = 0; // session activated
+ virtual void DetachSession(const TActorContext& ctx) = 0; // session is dead
+ };
+
+ // synchronous (proxy -> session)
+ struct ISession : TNonCopyable {
+ virtual ~ISession() {
+ }
+
+ virtual void DetachSession(const TActorContext& ownerCtx, const TActorContext& sessionCtx) = 0; // kill yourself
+ virtual void ForwardPacket(TAutoPtr<IEventHandle>& ev, const TActorContext& ownerCtx, const TActorContext& sessionCtx) = 0; // receive packet for forward
+ virtual void Connect(const TActorContext& ownerCtx, const TActorContext& sessionCtx) = 0; // begin connection
+ virtual bool ReceiveIncomingSession(TAutoPtr<IEventHandle>& ev, const TActorContext& ownerCtx, const TActorContext& sessionCtx) = 0; // handle incoming session, if returns true - then session is dead and must be recreated with new one
+ };
+
+ ui32 DestinationNode;
+
+ TString StaticAddress; // if set - would be used as main destination address
+ int StaticPort;
+
+ TIntrusivePtr<TInterconnectGlobalState> GlobalState;
+
+ virtual IActor* CreateSession(const TActorId& ownerId, IProxy* owner) = 0; // returned actor is session and would be attached to same mailbox as proxy to allow sync calls
+ virtual TActorSetupCmd CreateAcceptor() = 0;
+ };
+
+ struct TNameserverSetup {
+ TActorId ServiceID;
+
+ TIntrusivePtr<TInterconnectGlobalState> GlobalState;
+ };
+
+ struct TTableNameserverSetup: public TThrRefBase {
+ struct TNodeInfo {
+ TString Address;
+ TString Host;
+ TString ResolveHost;
+ ui16 Port;
+ TNodeLocation Location;
+ TString& first;
+ ui16& second;
+
+ TNodeInfo()
+ : first(Address)
+ , second(Port)
+ {
+ }
+
+ TNodeInfo(const TNodeInfo&) = default;
+
+ // for testing purposes only
+ TNodeInfo(const TString& address, const TString& host, ui16 port)
+ : TNodeInfo()
+ {
+ Address = address;
+ Host = host;
+ ResolveHost = host;
+ Port = port;
+ }
+
+ TNodeInfo(const TString& address,
+ const TString& host,
+ const TString& resolveHost,
+ ui16 port,
+ const TNodeLocation& location)
+ : TNodeInfo()
+ {
+ Address = address;
+ Host = host;
+ ResolveHost = resolveHost;
+ Port = port;
+ Location = location;
+ }
+
+ // for testing purposes only
+ TNodeInfo& operator=(const std::pair<TString, ui32>& pr) {
+ Address = pr.first;
+ Host = pr.first;
+ ResolveHost = pr.first;
+ Port = pr.second;
+ return *this;
+ }
+
+ TNodeInfo& operator=(const TNodeInfo& ni) {
+ Address = ni.Address;
+ Host = ni.Host;
+ ResolveHost = ni.ResolveHost;
+ Port = ni.Port;
+ Location = ni.Location;
+ return *this;
+ }
+ };
+
+ TMap<ui32, TNodeInfo> StaticNodeTable;
+
+ bool IsEntriesUnique() const;
+ };
+
+ struct TNodeRegistrarSetup {
+ TActorId ServiceID;
+
+ TIntrusivePtr<TInterconnectGlobalState> GlobalState;
+ };
+
+ TActorId GetNameserviceActorId();
+
+ /**
+ * Const table-lookup based name service
+ */
+
+ IActor* CreateNameserverTable(
+ const TIntrusivePtr<TTableNameserverSetup>& setup,
+ ui32 poolId = 0);
+
+ /**
+ * Name service which can be paired with external discovery service.
+ * Copies information from setup on the start (table may be empty).
+ * Handles TEvNodesInfo to change list of known nodes.
+ *
+ * If PendingPeriod is not zero, wait for unknown nodeId
+ */
+
+ IActor* CreateDynamicNameserver(
+ const TIntrusivePtr<TTableNameserverSetup>& setup,
+ const TDuration& pendingPeriod = TDuration::Zero(),
+ ui32 poolId = 0);
+
+ /**
+ * Creates an actor that resolves host/port and replies with either:
+ *
+ * - TEvLocalNodeInfo on success
+ * - TEvResolveError on errors
+ *
+ * Optional defaultAddress may be used as fallback.
+ */
+ IActor* CreateResolveActor(
+ const TString& host, ui16 port, ui32 nodeId, const TString& defaultAddress,
+ const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline);
+
+ inline IActor* CreateResolveActor(
+ ui32 nodeId, const TTableNameserverSetup::TNodeInfo& nodeInfo,
+ const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline)
+ {
+ return CreateResolveActor(nodeInfo.ResolveHost, nodeInfo.Port, nodeId, nodeInfo.Address,
+ replyTo, replyFrom, deadline);
+ }
+
+ /**
+ * Creates an actor that resolves host/port and replies with either:
+ *
+ * - TEvAddressInfo on success
+ * - TEvResolveError on errors
+ */
+ IActor* CreateResolveActor(
+ const TString& host, ui16 port,
+ const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline);
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_address.cpp b/library/cpp/actors/interconnect/interconnect_address.cpp
new file mode 100644
index 0000000000..8f474f5a39
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_address.cpp
@@ -0,0 +1,94 @@
+#include "interconnect_address.h"
+
+#include <util/string/cast.h>
+#include <util/system/file.h>
+
+#if defined(_linux_)
+#include <sys/un.h>
+#include <sys/stat.h>
+#endif
+
+namespace NInterconnect {
+ // Default constructor: zero the whole union, so the family reads as 0 (AF_UNSPEC).
+ TAddress::TAddress() {
+ memset(&Addr, 0, sizeof(Addr));
+ }
+
+ // Copies a generic remote address (as produced by the resolver) into the union.
+ TAddress::TAddress(NAddr::IRemoteAddr& addr) {
+ socklen_t len = addr.Len();
+ Y_VERIFY(len <= sizeof(Addr)); // the union must be able to hold the whole sockaddr
+ memcpy(&Addr.Generic, addr.Addr(), len);
+ }
+
+ // Returns the stored address family (AF_INET, AF_INET6, or 0 when unset).
+ int TAddress::GetFamily() const {
+ return Addr.Generic.sa_family;
+ }
+
+    // Size in bytes of the concrete sockaddr variant held by the union;
+    // 0 for an unknown or unset address family.
+    socklen_t TAddress::Size() const {
+        const int family = Addr.Generic.sa_family;
+        if (family == AF_INET6) {
+            return sizeof(sockaddr_in6);
+        }
+        if (family == AF_INET) {
+            return sizeof(sockaddr_in);
+        }
+        return 0;
+    }
+
+ // Mutable view of the stored address as a generic sockaddr pointer.
+ sockaddr* TAddress::SockAddr() {
+ return &Addr.Generic;
+ }
+
+ // Read-only view of the stored address as a generic sockaddr pointer.
+ const sockaddr* TAddress::SockAddr() const {
+ return &Addr.Generic;
+ }
+
+    // Returns the port in host byte order, or 0 when no address family is set.
+    // The port is kept in network byte order inside the family-specific struct.
+    ui16 TAddress::GetPort() const {
+        const int family = Addr.Generic.sa_family;
+        if (family == AF_INET6) {
+            return ntohs(Addr.Ipv6.sin6_port);
+        }
+        if (family == AF_INET) {
+            return ntohs(Addr.Ipv4.sin_port);
+        }
+        return 0;
+    }
+
+    // Renders the endpoint as "<address>:<port>" (address part is empty when
+    // the family is unset).
+    TString TAddress::ToString() const {
+        TString result = GetAddress();
+        result += ':';
+        result += ::ToString(GetPort());
+        return result;
+    }
+
+    // Parses a numeric address string, trying IPv6 first and then IPv4.
+    // The family fields of both union members alias the same bytes, so a
+    // failed IPv6 attempt is simply overwritten by the IPv4 attempt; if both
+    // parses fail, the port stays zero.
+    TAddress::TAddress(const char* addr, ui16 port) {
+        memset(&Addr, 0, sizeof(Addr));
+        Addr.Ipv6.sin6_family = AF_INET6;
+        if (inet_pton(AF_INET6, addr, &Addr.Ipv6.sin6_addr)) {
+            Addr.Ipv6.sin6_port = htons(port);
+        } else {
+            Addr.Ipv4.sin_family = AF_INET;
+            if (inet_pton(AF_INET, addr, &Addr.Ipv4.sin_addr)) {
+                Addr.Ipv4.sin_port = htons(port);
+            }
+        }
+    }
+
+ // TString overload delegating to the C-string parsing constructor.
+ TAddress::TAddress(const TString& addr, ui16 port)
+ : TAddress(addr.data(), port)
+ {}
+
+    // Returns the numeric textual form of the stored address ("1.2.3.4" or
+    // "::1" style), without the port. Returns an empty string when no address
+    // family is set or the conversion fails.
+    TString TAddress::GetAddress() const {
+        const void *src;
+
+        switch (Addr.Generic.sa_family) {
+            case AF_INET6:
+                src = &Addr.Ipv6.sin6_addr;
+                break;
+
+            case AF_INET:
+                src = &Addr.Ipv4.sin_addr;
+                break;
+
+            default:
+                return TString();
+        }
+
+        // INET6_ADDRSTRLEN is large enough for either family, so a fixed-size
+        // stack buffer replaces the original alloca(); inet_ntop takes a
+        // const void* source, so the previous const_cast was unnecessary.
+        char buffer[INET6_ADDRSTRLEN];
+        const char *p = inet_ntop(Addr.Generic.sa_family, src, buffer, sizeof(buffer));
+        return p ? TString(p) : TString();
+    }
+}
diff --git a/library/cpp/actors/interconnect/interconnect_address.h b/library/cpp/actors/interconnect/interconnect_address.h
new file mode 100644
index 0000000000..e9e0faec81
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_address.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <util/system/defaults.h>
+#include <util/network/init.h>
+#include <util/network/address.h>
+#include <util/generic/string.h>
+
+namespace NInterconnect {
+ // Thin wrapper around a sockaddr union able to hold an IPv4 or IPv6 endpoint
+ // and convert it to/from textual form.
+ class TAddress {
+ union {
+ sockaddr Generic;
+ sockaddr_in Ipv4;
+ sockaddr_in6 Ipv6;
+ } Addr;
+
+ public:
+ TAddress(); // zero-initialized; family reads as 0 (AF_UNSPEC)
+ TAddress(const char* addr, ui16 port); // parses numeric address, IPv6 first then IPv4
+ TAddress(const TString& addr, ui16 port);
+ TAddress(NAddr::IRemoteAddr& addr); // copies the raw sockaddr bytes
+ int GetFamily() const;
+ socklen_t Size() const; // size of the concrete sockaddr variant; 0 when unset
+ ::sockaddr* SockAddr();
+ const ::sockaddr* SockAddr() const;
+ ui16 GetPort() const; // host byte order; 0 when family unset
+ TString GetAddress() const; // numeric form without the port
+ TString ToString() const; // "<address>:<port>"
+ };
+}
diff --git a/library/cpp/actors/interconnect/interconnect_channel.cpp b/library/cpp/actors/interconnect/interconnect_channel.cpp
new file mode 100644
index 0000000000..a66ba2a154
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_channel.cpp
@@ -0,0 +1,176 @@
+#include "interconnect_channel.h"
+
+#include <library/cpp/actors/core/events.h>
+#include <library/cpp/actors/core/executor_thread.h>
+#include <library/cpp/actors/core/log.h>
+#include <library/cpp/actors/core/probes.h>
+#include <library/cpp/actors/protos/services_common.pb.h>
+#include <library/cpp/actors/prof/tag.h>
+#include <library/cpp/digest/crc32c/crc32c.h>
+
+LWTRACE_USING(ACTORLIB_PROVIDER);
+
+namespace NActors {
+ DECLARE_WILSON_EVENT(EventSentToSocket);
+ DECLARE_WILSON_EVENT(EventReceivedFromSocket);
+
+ // Appends the trailing descriptor part of an event (a TChannelPart header
+ // followed by the TEventDescr) to the outgoing packet task. Returns false
+ // when the packet lacks the space; the caller retries with the next packet.
+ bool TEventOutputChannel::FeedDescriptor(TTcpPacketOutTask& task, TEventHolder& event, ui64 *weightConsumed) {
+ const size_t amount = sizeof(TChannelPart) + sizeof(TEventDescr);
+ if (task.GetVirtualFreeAmount() < amount) {
+ return false;
+ }
+
+ NWilson::TTraceId traceId(event.Descr.TraceId);
+// if (ctx) {
+// WILSON_TRACE(*ctx, &traceId, EventSentToSocket);
+// }
+ traceId.Serialize(&event.Descr.TraceId);
+ LWTRACK(SerializeToPacketEnd, event.Orbit, PeerNodeId, ChannelId, OutputQueueSize, task.GetDataSize());
+ task.Orbit.Take(event.Orbit);
+
+ // Clear FlagForwardOnNondelivery before transmission and set
+ // FlagExtendedFormat according to the serialization format chosen in FeedBuf.
+ event.Descr.Flags = (event.Descr.Flags & ~IEventHandle::FlagForwardOnNondelivery) |
+ (ExtendedFormat ? IEventHandle::FlagExtendedFormat : 0);
+
+ // LastPartFlag marks this part as the final one for the event on this channel.
+ TChannelPart *part = static_cast<TChannelPart*>(task.GetFreeArea());
+ part->Channel = ChannelId | TChannelPart::LastPartFlag;
+ part->Size = sizeof(TEventDescr);
+ memcpy(part + 1, &event.Descr, sizeof(TEventDescr));
+ task.AppendBuf(part, amount);
+ *weightConsumed += amount;
+ OutputQueueSize -= part->Size;
+ Metrics->UpdateOutputChannelEvents(ChannelId);
+
+ return true;
+ }
+
+    // Releases events from the not-yet-confirmed queue whose serial number is
+    // covered by the peer's confirmation.
+    void TEventOutputChannel::DropConfirmed(ui64 confirm) {
+        LOG_DEBUG_IC_SESSION("ICOCH98", "Dropping confirmed messages");
+        auto it = NotYetConfirmed.begin();
+        while (it != NotYetConfirmed.end() && it->Serial <= confirm) {
+            Pool.Release(NotYetConfirmed, it++);
+        }
+    }
+
+ // Serializes the head event of the queue into the outgoing packet, driven by
+ // the channel's state machine (INITIAL -> CHUNKER/BUFFER -> DESCRIPTOR).
+ // Returns true when the whole event (including its descriptor) has been fed
+ // into packets, false when the current packet ran out of space mid-event.
+ bool TEventOutputChannel::FeedBuf(TTcpPacketOutTask& task, ui64 serial, ui64 *weightConsumed) {
+ for (;;) {
+ Y_VERIFY(!Queue.empty());
+ TEventHolder& event = Queue.front();
+
+ switch (State) {
+ case EState::INITIAL:
+ // Pick the serialization strategy depending on how the event is held:
+ // a live IEventBase goes through the coroutine chunker, a pre-serialized
+ // buffer is copied directly, and an empty event emits only a descriptor.
+ event.InitChecksum();
+ LWTRACK(SerializeToPacketBegin, event.Orbit, PeerNodeId, ChannelId, OutputQueueSize);
+ if (event.Event) {
+ State = EState::CHUNKER;
+ IEventBase *base = event.Event.Get();
+ Chunker.SetSerializingEvent(base);
+ ExtendedFormat = base->IsExtendedFormat();
+ } else if (event.Buffer) {
+ State = EState::BUFFER;
+ Iter = event.Buffer->GetBeginIter();
+ ExtendedFormat = event.Buffer->IsExtendedFormat();
+ } else {
+ State = EState::DESCRIPTOR;
+ ExtendedFormat = false;
+ }
+ break;
+
+ case EState::CHUNKER:
+ case EState::BUFFER: {
+ size_t maxBytes = task.GetVirtualFreeAmount();
+ if (maxBytes <= sizeof(TChannelPart)) {
+ return false;
+ }
+
+ // Reserve a part header first; its Size is filled in as chunks are added.
+ TChannelPart *part = static_cast<TChannelPart*>(task.GetFreeArea());
+ part->Channel = ChannelId;
+ part->Size = 0;
+ task.AppendBuf(part, sizeof(TChannelPart));
+ maxBytes -= sizeof(TChannelPart);
+ Y_VERIFY(maxBytes);
+
+ // Appends one contiguous chunk of serialized data, maintaining the
+ // running checksum and enforcing the serialized-size limit.
+ auto addChunk = [&](const void *data, size_t len) {
+ event.UpdateChecksum(Params, data, len);
+ task.AppendBuf(data, len);
+ part->Size += len;
+ Y_VERIFY_DEBUG(maxBytes >= len);
+ maxBytes -= len;
+
+ event.EventActuallySerialized += len;
+ if (event.EventActuallySerialized > MaxSerializedEventSize) {
+ throw TExSerializedEventTooLarge(event.Descr.Type);
+ }
+ };
+
+ bool complete = false;
+ if (State == EState::CHUNKER) {
+ Y_VERIFY_DEBUG(task.GetFreeArea() == part + 1);
+ while (!complete && maxBytes) {
+ const auto [first, last] = Chunker.FeedBuf(task.GetFreeArea(), maxBytes);
+ for (auto p = first; p != last; ++p) {
+ addChunk(p->first, p->second);
+ }
+ complete = Chunker.IsComplete();
+ }
+ Y_VERIFY(!complete || Chunker.IsSuccessfull());
+ Y_VERIFY_DEBUG(complete || !maxBytes);
+ } else { // BUFFER
+ while (const size_t numb = Min(maxBytes, Iter.ContiguousSize())) {
+ const char *obuf = Iter.ContiguousData();
+ addChunk(obuf, numb);
+ Iter += numb;
+ }
+ complete = !Iter.Valid();
+ }
+ if (complete) {
+ Y_VERIFY(event.EventActuallySerialized == event.EventSerializedSize,
+ "EventActuallySerialized# %" PRIu32 " EventSerializedSize# %" PRIu32 " Type# 0x%08" PRIx32,
+ event.EventActuallySerialized, event.EventSerializedSize, event.Descr.Type);
+ }
+
+ // Roll back an empty part header; otherwise account the bytes written.
+ if (!part->Size) {
+ task.Undo(sizeof(TChannelPart));
+ } else {
+ *weightConsumed += sizeof(TChannelPart) + part->Size;
+ OutputQueueSize -= part->Size;
+ }
+ if (complete) {
+ State = EState::DESCRIPTOR;
+ }
+ break;
+ }
+
+ case EState::DESCRIPTOR:
+ if (!FeedDescriptor(task, event, weightConsumed)) {
+ return false;
+ }
+ event.Serial = serial;
+ NotYetConfirmed.splice(NotYetConfirmed.end(), Queue, Queue.begin()); // move event to not-yet-confirmed queue
+ State = EState::INITIAL;
+ return true; // we have processed whole event, signal to the caller
+ }
+ }
+ }
+
+    // Notifies senders about events that will never be delivered and returns
+    // all queued event holders to the pool. Fixes the "Notyfying" typo in the
+    // original log message.
+    void TEventOutputChannel::NotifyUndelivered() {
+        LOG_DEBUG_IC_SESSION("ICOCH89", "Notifying about Undelivered messages! NotYetConfirmed size: %zu, Queue size: %zu", NotYetConfirmed.size(), Queue.size());
+        if (State == EState::CHUNKER) {
+            Y_VERIFY(!Chunker.IsComplete()); // chunk must have an event being serialized
+            Y_VERIFY(!Queue.empty()); // this event must be the first event in queue
+            TEventHolder& event = Queue.front();
+            Y_VERIFY(Chunker.GetCurrentEvent() == event.Event.Get()); // ensure the event is valid
+            Chunker.Abort(); // stop serializing current event
+            Y_VERIFY(Chunker.IsComplete());
+        }
+        // Already-sent-but-unconfirmed events: notify only the senders that
+        // explicitly asked for unsure-undelivered notifications.
+        for (auto& item : NotYetConfirmed) {
+            if (item.Descr.Flags & IEventHandle::FlagGenerateUnsureUndelivered) { // notify only when unsure flag is set
+                item.ForwardOnNondelivery(true);
+            }
+        }
+        Pool.Release(NotYetConfirmed);
+        // Events still in the queue were never sent at all.
+        for (auto& item : Queue) {
+            item.ForwardOnNondelivery(false);
+        }
+        Pool.Release(Queue);
+    }
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_channel.h b/library/cpp/actors/interconnect/interconnect_channel.h
new file mode 100644
index 0000000000..e4a0ae3cda
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_channel.h
@@ -0,0 +1,127 @@
+#pragma once
+
+#include <library/cpp/monlib/dynamic_counters/counters.h>
+#include <library/cpp/actors/core/actorsystem.h>
+#include <library/cpp/actors/core/event_load.h>
+#include <library/cpp/actors/util/rope.h>
+#include <util/generic/deque.h>
+#include <util/generic/vector.h>
+#include <util/generic/map.h>
+#include <util/stream/walk.h>
+#include <library/cpp/actors/wilson/wilson_event.h>
+#include <library/cpp/actors/helpers/mon_histogram_helper.h>
+
+#include "interconnect_common.h"
+#include "interconnect_counters.h"
+#include "packet.h"
+#include "event_holder_pool.h"
+
+namespace NActors {
+#pragma pack(push, 1)
+ // On-wire header preceding each chunk of event data inside a TCP packet.
+ // Declared inside #pragma pack(1): this struct is serialized as-is.
+ struct TChannelPart {
+ ui16 Channel; // channel id; the high bit is LastPartFlag
+ ui16 Size; // payload size in bytes following this header
+
+ // Set in Channel to mark the final part of an event on this channel.
+ static constexpr ui16 LastPartFlag = ui16(1) << 15;
+
+ // Human-readable form for logging.
+ TString ToString() const {
+ return TStringBuilder() << "{Channel# " << (Channel & ~LastPartFlag)
+ << " LastPartFlag# " << ((Channel & LastPartFlag) ? "true" : "false")
+ << " Size# " << Size << "}";
+ }
+ };
+#pragma pack(pop)
+
+ // Thrown by the output channel when an event's serialized size exceeds
+ // the configured MaxSerializedEventSize.
+ struct TExSerializedEventTooLarge : std::exception {
+ const ui32 Type; // event type of the offending event
+
+ TExSerializedEventTooLarge(ui32 type)
+ : Type(type)
+ {}
+ };
+
+ // Per-channel output queue of an interconnect session: accepts events via
+ // Push(), serializes them into TTcpPacketOutTask packets via FeedBuf(), and
+ // tracks unconfirmed events until the peer acknowledges them.
+ class TEventOutputChannel : public TInterconnectLoggingBase {
+ public:
+ TEventOutputChannel(TEventHolderPool& pool, ui16 id, ui32 peerNodeId, ui32 maxSerializedEventSize,
+ std::shared_ptr<IInterconnectMetrics> metrics, TSessionParams params)
+ : TInterconnectLoggingBase(Sprintf("OutputChannel %" PRIu16 " [node %" PRIu32 "]", id, peerNodeId))
+ , Pool(pool)
+ , PeerNodeId(peerNodeId)
+ , ChannelId(id)
+ , Metrics(std::move(metrics))
+ , Params(std::move(params))
+ , MaxSerializedEventSize(maxSerializedEventSize)
+ {}
+
+ ~TEventOutputChannel() {
+ }
+
+ // Enqueues an event; returns the accounted byte size (payload + descriptor)
+ // and a pointer to the pooled holder.
+ std::pair<ui32, TEventHolder*> Push(IEventHandle& ev) {
+ TEventHolder& event = Pool.Allocate(Queue);
+ const ui32 bytes = event.Fill(ev) + sizeof(TEventDescr);
+ OutputQueueSize += bytes;
+ return std::make_pair(bytes, &event);
+ }
+
+ // Releases events acknowledged up to the given serial (see .cpp).
+ void DropConfirmed(ui64 confirm);
+
+ // Serializes the head event into the packet task (see .cpp).
+ bool FeedBuf(TTcpPacketOutTask& task, ui64 serial, ui64 *weightConsumed);
+
+ bool IsEmpty() const {
+ return Queue.empty();
+ }
+
+ bool IsWorking() const {
+ return !IsEmpty();
+ }
+
+ ui32 GetQueueSize() const {
+ return (ui32)Queue.size();
+ }
+
+ ui64 GetBufferedAmountOfData() const {
+ return OutputQueueSize;
+ }
+
+ // Notifies senders of undelivered events and drains both queues (see .cpp).
+ void NotifyUndelivered();
+
+ TEventHolderPool& Pool;
+ const ui32 PeerNodeId;
+ const ui16 ChannelId;
+ std::shared_ptr<IInterconnectMetrics> Metrics;
+ const TSessionParams Params;
+ const ui32 MaxSerializedEventSize;
+ ui64 UnaccountedTraffic = 0;
+ // NOTE(review): the two *OnPause counters are only read/written by
+ // TInterconnectSessionTCP (friend below) — semantics not visible here.
+ ui64 EqualizeCounterOnPause = 0;
+ ui64 WeightConsumedOnPause = 0;
+
+ // Serialization state machine driven by FeedBuf.
+ enum class EState {
+ INITIAL,
+ CHUNKER,
+ BUFFER,
+ DESCRIPTOR,
+ };
+ EState State = EState::INITIAL;
+
+ // Smallest packet space in which any progress can be made (header + descriptor).
+ static constexpr ui16 MinimumFreeSpace = sizeof(TChannelPart) + sizeof(TEventDescr);
+
+ protected:
+ ui64 OutputQueueSize = 0; // accounted bytes still buffered in Queue
+
+ std::list<TEventHolder> Queue; // events not yet fully serialized
+ std::list<TEventHolder> NotYetConfirmed; // sent but unacknowledged events
+ TRope::TConstIterator Iter; // cursor for BUFFER-state serialization
+ TCoroutineChunkSerializer Chunker; // serializer for CHUNKER-state events
+ bool ExtendedFormat = false; // format flag forwarded in the descriptor
+
+ bool FeedDescriptor(TTcpPacketOutTask& task, TEventHolder& event, ui64 *weightConsumed);
+
+ // Flushes accumulated unaccounted traffic into the metrics sink.
+ void AccountTraffic() {
+ if (const ui64 amount = std::exchange(UnaccountedTraffic, 0)) {
+ Metrics->UpdateOutputChannelTraffic(ChannelId, amount);
+ }
+ }
+
+ friend class TInterconnectSessionTCP;
+ };
+}
diff --git a/library/cpp/actors/interconnect/interconnect_common.h b/library/cpp/actors/interconnect/interconnect_common.h
new file mode 100644
index 0000000000..285709a00c
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_common.h
@@ -0,0 +1,106 @@
+#pragma once
+
+#include <library/cpp/actors/core/actorid.h>
+#include <library/cpp/actors/core/actorsystem.h>
+#include <library/cpp/actors/util/datetime.h>
+#include <library/cpp/monlib/dynamic_counters/counters.h>
+#include <library/cpp/monlib/metrics/metric_registry.h>
+#include <util/generic/map.h>
+#include <util/generic/set.h>
+#include <util/system/datetime.h>
+
+#include "poller_tcp.h"
+#include "logging.h"
+#include "event_filter.h"
+
+#include <atomic>
+
+namespace NActors {
+ // Encryption policy for interconnect sessions.
+ enum class EEncryptionMode {
+ DISABLED, // no encryption is required at all
+ OPTIONAL, // encryption is enabled when supported by both peers
+ REQUIRED, // encryption is mandatory
+ };
+
+ // Tunable parameters of the interconnect layer. Zero/empty durations and
+ // counters generally mean "use the built-in default / disabled" — confirm
+ // per field against the consuming code.
+ struct TInterconnectSettings {
+ TDuration Handshake; // handshake timeout — presumably; confirm in handshake code
+ TDuration DeadPeer; // peer considered dead after this period — confirm
+ TDuration CloseOnIdle; // close session after this idle period — confirm
+ ui32 SendBufferDieLimitInMB = 0;
+ ui64 OutputBuffersTotalSizeLimitInMB = 0;
+ ui32 TotalInflightAmountOfData = 0;
+ bool MergePerPeerCounters = false; // when set, per-peer counters collapse into shared groups (see interconnect_counters.cpp)
+ bool MergePerDataCenterCounters = false; // when set, counters are grouped per data center
+ ui32 TCPSocketBufferSize = 0; // 0 = use default (see GetSendBufferSize)
+ TDuration PingPeriod = TDuration::Seconds(3);
+ TDuration ForceConfirmPeriod = TDuration::Seconds(1);
+ TDuration LostConnection;
+ TDuration BatchPeriod;
+ bool BindOnAllAddresses = true;
+ EEncryptionMode EncryptionMode = EEncryptionMode::DISABLED;
+ bool TlsAuthOnly = false;
+ TString Certificate; // certificate data in PEM format
+ TString PrivateKey; // private key for the certificate in PEM format
+ TString CaFilePath; // path to certificate authority file
+ TString CipherList; // encryption algorithms
+ TDuration MessagePendingTimeout = TDuration::Seconds(1); // timeout for which messages are queued while in PendingConnection state
+ ui64 MessagePendingSize = Max<ui64>(); // size of the queue
+ ui32 MaxSerializedEventSize = NActors::EventMaxByteSize; // hard cap enforced during event serialization
+
+ // Effective TCP send buffer size: configured value or a 512 KiB default.
+ ui32 GetSendBufferSize() const {
+ ui32 res = 512 * 1024; // 512 kb is the default value for send buffer
+ if (TCPSocketBufferSize) {
+ res = TCPSocketBufferSize;
+ }
+ return res;
+ }
+ };
+
+ // Per-channel configuration.
+ struct TChannelSettings {
+ ui16 Weight; // relative weight — presumably consumed by the channel scheduler; confirm
+ };
+
+ typedef TMap<ui16, TChannelSettings> TChannelsConfig;
+
+ using TRegisterMonPageCallback = std::function<void(const TString& path, const TString& title,
+ TActorSystem* actorSystem, const TActorId& actorId)>;
+
+ using TInitWhiteboardCallback = std::function<void(ui16 icPort, TActorSystem* actorSystem)>;
+
+ using TUpdateWhiteboardCallback = std::function<void(const TString& peer, bool connected, bool green, bool yellow,
+ bool orange, bool red, TActorSystem* actorSystem)>;
+
+ // Shared, ref-counted configuration and state for all interconnect proxies
+ // of one node: monitoring hooks, settings, TLS scope, version gating.
+ struct TInterconnectProxyCommon : TAtomicRefCount<TInterconnectProxyCommon> {
+ TActorId NameserviceId; // actor id of the name service used for peer resolution
+ NMonitoring::TDynamicCounterPtr MonCounters; // legacy dynamic-counters root (may be unset)
+ std::shared_ptr<NMonitoring::IMetricRegistry> Metrics; // modern metric registry (may be unset)
+ TChannelsConfig ChannelsConfig;
+ TInterconnectSettings Settings;
+ TRegisterMonPageCallback RegisterMonPage;
+ TActorId DestructorId;
+ std::shared_ptr<std::atomic<TAtomicBase>> DestructorQueueSize;
+ TAtomicBase MaxDestructorQueueSize = 1024 * 1024 * 1024;
+ TString ClusterUUID;
+ TVector<TString> AcceptUUID; // cluster UUIDs accepted during handshake — presumably; confirm
+ ui64 StartTime = GetCycleCountFast();
+ TString TechnicalSelfHostName;
+ TInitWhiteboardCallback InitWhiteboard;
+ TUpdateWhiteboardCallback UpdateWhiteboard;
+ ui32 HandshakeBallastSize = 0;
+ TAtomic StartedSessionKiller = 0;
+ TScopeId LocalScopeId;
+ std::shared_ptr<TEventFilter> EventFilter;
+ TString Cookie; // unique random identifier of a node instance (generated randomly at every start)
+ std::unordered_map<ui16, TString> ChannelName; // human-readable names used for per-channel counters
+
+ // Version compatibility gate used during handshake.
+ struct TVersionInfo {
+ TString Tag; // version tag for this node
+ TSet<TString> AcceptedTags; // we accept all enlisted version tags of peer nodes, but no others; empty = accept all
+ };
+
+ TMaybe<TVersionInfo> VersionInfo;
+
+ using TPtr = TIntrusivePtr<TInterconnectProxyCommon>;
+ };
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_counters.cpp b/library/cpp/actors/interconnect/interconnect_counters.cpp
new file mode 100644
index 0000000000..ba674f664b
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_counters.cpp
@@ -0,0 +1,692 @@
+#include "interconnect_counters.h"
+
+#include <library/cpp/monlib/metrics/metric_registry.h>
+#include <library/cpp/monlib/metrics/metric_sub_registry.h>
+
+#include <unordered_map>
+
+namespace NActors {
+
+namespace {
+
+ // Legacy TDynamicCounters-based implementation of IInterconnectMetrics.
+ // Counter pointers are (re)bound lazily in SetPeerInfo(); all Inc/Add
+ // methods assume SetPeerInfo() has been called at least once.
+ class TInterconnectCounters: public IInterconnectMetrics {
+ public:
+ // Per-output-channel counters; Traffic/Events are shared session-level
+ // aggregates, Outgoing* live in the per-channel subgroup.
+ struct TOutputChannel {
+ NMonitoring::TDynamicCounters::TCounterPtr Traffic;
+ NMonitoring::TDynamicCounters::TCounterPtr Events;
+ NMonitoring::TDynamicCounters::TCounterPtr OutgoingTraffic;
+ NMonitoring::TDynamicCounters::TCounterPtr OutgoingEvents;
+
+ TOutputChannel() = default;
+
+ TOutputChannel(const TIntrusivePtr<NMonitoring::TDynamicCounters>& counters,
+ NMonitoring::TDynamicCounters::TCounterPtr traffic,
+ NMonitoring::TDynamicCounters::TCounterPtr events)
+ : Traffic(std::move(traffic))
+ , Events(std::move(events))
+ , OutgoingTraffic(counters->GetCounter("OutgoingTraffic", true))
+ , OutgoingEvents(counters->GetCounter("OutgoingEvents", true))
+ {}
+
+ TOutputChannel(const TOutputChannel&) = default;
+ };
+
+ // Per-input-channel counters, analogous to TOutputChannel.
+ struct TInputChannel {
+ NMonitoring::TDynamicCounters::TCounterPtr Traffic;
+ NMonitoring::TDynamicCounters::TCounterPtr Events;
+ NMonitoring::TDynamicCounters::TCounterPtr ScopeErrors;
+ NMonitoring::TDynamicCounters::TCounterPtr IncomingTraffic;
+ NMonitoring::TDynamicCounters::TCounterPtr IncomingEvents;
+
+ TInputChannel() = default;
+
+ TInputChannel(const TIntrusivePtr<NMonitoring::TDynamicCounters>& counters,
+ NMonitoring::TDynamicCounters::TCounterPtr traffic,
+ NMonitoring::TDynamicCounters::TCounterPtr events,
+ NMonitoring::TDynamicCounters::TCounterPtr scopeErrors)
+ : Traffic(std::move(traffic))
+ , Events(std::move(events))
+ , ScopeErrors(std::move(scopeErrors))
+ , IncomingTraffic(counters->GetCounter("IncomingTraffic", true))
+ , IncomingEvents(counters->GetCounter("IncomingEvents", true))
+ {}
+
+ TInputChannel(const TInputChannel&) = default;
+ };
+
+ // Map of input channels keyed by channel id, with a shared fallback
+ // ("other") entry for ids without a configured name.
+ struct TInputChannels : std::unordered_map<ui16, TInputChannel> {
+ TInputChannel OtherInputChannel;
+
+ TInputChannels() = default;
+
+ TInputChannels(const TIntrusivePtr<NMonitoring::TDynamicCounters>& counters,
+ const std::unordered_map<ui16, TString>& names,
+ NMonitoring::TDynamicCounters::TCounterPtr traffic,
+ NMonitoring::TDynamicCounters::TCounterPtr events,
+ NMonitoring::TDynamicCounters::TCounterPtr scopeErrors)
+ : OtherInputChannel(counters->GetSubgroup("channel", "other"), traffic, events, scopeErrors)
+ {
+ for (const auto& [id, name] : names) {
+ try_emplace(id, counters->GetSubgroup("channel", name), traffic, events, scopeErrors);
+ }
+ }
+
+ TInputChannels(const TInputChannels&) = default;
+
+ const TInputChannel& Get(ui16 id) const {
+ const auto it = find(id);
+ return it != end() ? it->second : OtherInputChannel;
+ }
+ };
+
+ private:
+ const TInterconnectProxyCommon::TPtr Common;
+ const bool MergePerDataCenterCounters;
+ const bool MergePerPeerCounters;
+ NMonitoring::TDynamicCounterPtr Counters;
+ NMonitoring::TDynamicCounterPtr PerSessionCounters;
+ NMonitoring::TDynamicCounterPtr PerDataCenterCounters;
+ // Reference aliasing whichever of the three registries above the merge
+ // settings select; bound once in the constructor.
+ NMonitoring::TDynamicCounterPtr& AdaptiveCounters;
+
+ bool Initialized = false; // set after the first SetPeerInfo()
+
+ NMonitoring::TDynamicCounters::TCounterPtr Traffic;
+ NMonitoring::TDynamicCounters::TCounterPtr Events;
+ NMonitoring::TDynamicCounters::TCounterPtr ScopeErrors;
+
+ public:
+ TInterconnectCounters(const TInterconnectProxyCommon::TPtr& common)
+ : Common(common)
+ , MergePerDataCenterCounters(common->Settings.MergePerDataCenterCounters)
+ , MergePerPeerCounters(common->Settings.MergePerPeerCounters)
+ , Counters(common->MonCounters)
+ , AdaptiveCounters(MergePerDataCenterCounters
+ ? PerDataCenterCounters :
+ MergePerPeerCounters ? Counters : PerSessionCounters)
+ {}
+
+ void AddInflightDataAmount(ui64 value) override {
+ *InflightDataAmount += value;
+ }
+
+ void SubInflightDataAmount(ui64 value) override {
+ *InflightDataAmount -= value;
+ }
+
+ void AddTotalBytesWritten(ui64 value) override {
+ *TotalBytesWritten += value;
+ }
+
+ void SetClockSkewMicrosec(i64 value) override {
+ *ClockSkewMicrosec = value;
+ }
+
+ void IncSessionDeaths() override {
+ ++*SessionDeaths;
+ }
+
+ void IncHandshakeFails() override {
+ ++*HandshakeFails;
+ }
+
+ void SetConnected(ui32 value) override {
+ *Connected = value;
+ }
+
+ void IncSubscribersCount() override {
+ ++*SubscribersCount;
+ }
+
+ void SubSubscribersCount(ui32 value) override {
+ *SubscribersCount -= value;
+ }
+
+ void SubOutputBuffersTotalSize(ui64 value) override {
+ *OutputBuffersTotalSize -= value;
+ }
+
+ void AddOutputBuffersTotalSize(ui64 value) override {
+ *OutputBuffersTotalSize += value;
+ }
+
+ ui64 GetOutputBuffersTotalSize() const override {
+ return *OutputBuffersTotalSize;
+ }
+
+ void IncDisconnections() override {
+ ++*Disconnections;
+ }
+
+ void IncUsefulWriteWakeups() override {
+ ++*UsefulWriteWakeups;
+ }
+
+ void IncSpuriousWriteWakeups() override {
+ ++*SpuriousWriteWakeups;
+ }
+
+ void IncSendSyscalls() override {
+ ++*SendSyscalls;
+ }
+
+ void IncInflyLimitReach() override {
+ ++*InflyLimitReach;
+ }
+
+ void IncUsefulReadWakeups() override {
+ ++*UsefulReadWakeups;
+ }
+
+ void IncSpuriousReadWakeups() override {
+ ++*SpuriousReadWakeups;
+ }
+
+ // Unknown reasons are silently ignored: only reasons enlisted in
+ // TDisconnectReason::Reasons get a counter in SetPeerInfo().
+ void IncDisconnectByReason(const TString& s) override {
+ if (auto it = DisconnectByReason.find(s); it != DisconnectByReason.end()) {
+ it->second->Inc();
+ }
+ }
+
+ void AddInputChannelsIncomingTraffic(ui16 channel, ui64 incomingTraffic) override {
+ auto& ch = InputChannels.Get(channel);
+ *ch.IncomingTraffic += incomingTraffic;
+ }
+
+ void IncInputChannelsIncomingEvents(ui16 channel) override {
+ auto& ch = InputChannels.Get(channel);
+ ++*ch.IncomingEvents;
+ }
+
+ void IncRecvSyscalls() override {
+ ++*RecvSyscalls;
+ }
+
+ void AddTotalBytesRead(ui64 value) override {
+ *TotalBytesRead += value;
+ }
+
+ // Feeds both the legacy histogram helper and the modern histogram.
+ void UpdateLegacyPingTimeHist(ui64 value) override {
+ LegacyPingTimeHist.Add(value);
+ PingTimeHistogram->Collect(value);
+ }
+
+ void UpdateOutputChannelTraffic(ui16 channel, ui64 value) override {
+ if (GetOutputChannel(channel).OutgoingTraffic) {
+ *(GetOutputChannel(channel).OutgoingTraffic) += value;
+ }
+ if (GetOutputChannel(channel).Traffic) {
+ *(GetOutputChannel(channel).Traffic) += value;
+ }
+ }
+
+ void UpdateOutputChannelEvents(ui16 channel) override {
+ if (GetOutputChannel(channel).OutgoingEvents) {
+ ++*(GetOutputChannel(channel).OutgoingEvents);
+ }
+ if (GetOutputChannel(channel).Events) {
+ ++*(GetOutputChannel(channel).Events);
+ }
+ }
+
+ // (Re)binds all counter pointers to the subgroups implied by the peer's
+ // host name and data center. Changing either value resets the affected
+ // registry and rebuilds the counters that hang off it; global counters
+ // are bound only once (guarded by Initialized).
+ void SetPeerInfo(const TString& name, const TString& dataCenterId) override {
+ if (name != std::exchange(HumanFriendlyPeerHostName, name)) {
+ PerSessionCounters.Reset();
+ }
+ VALGRIND_MAKE_READABLE(&DataCenterId, sizeof(DataCenterId));
+ if (dataCenterId != std::exchange(DataCenterId, dataCenterId)) {
+ PerDataCenterCounters.Reset();
+ }
+
+ const bool updatePerDataCenter = !PerDataCenterCounters && MergePerDataCenterCounters;
+ if (updatePerDataCenter) {
+ PerDataCenterCounters = Counters->GetSubgroup("dataCenterId", *DataCenterId);
+ }
+
+ const bool updatePerSession = !PerSessionCounters || updatePerDataCenter;
+ if (updatePerSession) {
+ auto base = MergePerDataCenterCounters ? PerDataCenterCounters : Counters;
+ PerSessionCounters = base->GetSubgroup("peer", *HumanFriendlyPeerHostName);
+ }
+
+ const bool updateGlobal = !Initialized;
+
+ // AdaptiveCounters aliases exactly one of the three registries; refresh
+ // the adaptive set only when the aliased registry was refreshed.
+ const bool updateAdaptive =
+ &AdaptiveCounters == &Counters ? updateGlobal :
+ &AdaptiveCounters == &PerSessionCounters ? updatePerSession :
+ &AdaptiveCounters == &PerDataCenterCounters ? updatePerDataCenter :
+ false;
+
+ if (updatePerSession) {
+ Connected = PerSessionCounters->GetCounter("Connected");
+ Disconnections = PerSessionCounters->GetCounter("Disconnections", true);
+ ClockSkewMicrosec = PerSessionCounters->GetCounter("ClockSkewMicrosec");
+ Traffic = PerSessionCounters->GetCounter("Traffic", true);
+ Events = PerSessionCounters->GetCounter("Events", true);
+ ScopeErrors = PerSessionCounters->GetCounter("ScopeErrors", true);
+
+ for (const auto& [id, name] : Common->ChannelName) {
+ OutputChannels.try_emplace(id, Counters->GetSubgroup("channel", name), Traffic, Events);
+ }
+ OtherOutputChannel = TOutputChannel(Counters->GetSubgroup("channel", "other"), Traffic, Events);
+
+ InputChannels = TInputChannels(Counters, Common->ChannelName, Traffic, Events, ScopeErrors);
+ }
+
+ if (updateAdaptive) {
+ SessionDeaths = AdaptiveCounters->GetCounter("Session_Deaths", true);
+ HandshakeFails = AdaptiveCounters->GetCounter("Handshake_Fails", true);
+ InflyLimitReach = AdaptiveCounters->GetCounter("InflyLimitReach", true);
+ InflightDataAmount = AdaptiveCounters->GetCounter("Inflight_Data");
+
+ LegacyPingTimeHist = {};
+ LegacyPingTimeHist.Init(AdaptiveCounters.Get(), "PingTimeHist", "mks", 125, 18);
+
+ PingTimeHistogram = AdaptiveCounters->GetHistogram(
+ "PingTimeUs", NMonitoring::ExponentialHistogram(18, 2, 125));
+ }
+
+ if (updateGlobal) {
+ OutputBuffersTotalSize = Counters->GetCounter("OutputBuffersTotalSize");
+ SendSyscalls = Counters->GetCounter("SendSyscalls", true);
+ RecvSyscalls = Counters->GetCounter("RecvSyscalls", true);
+ SpuriousReadWakeups = Counters->GetCounter("SpuriousReadWakeups", true);
+ UsefulReadWakeups = Counters->GetCounter("UsefulReadWakeups", true);
+ SpuriousWriteWakeups = Counters->GetCounter("SpuriousWriteWakeups", true);
+ UsefulWriteWakeups = Counters->GetCounter("UsefulWriteWakeups", true);
+ SubscribersCount = AdaptiveCounters->GetCounter("SubscribersCount");
+ TotalBytesWritten = Counters->GetCounter("TotalBytesWritten", true);
+ TotalBytesRead = Counters->GetCounter("TotalBytesRead", true);
+
+ auto disconnectReasonGroup = Counters->GetSubgroup("subsystem", "disconnectReason");
+ for (const char *reason : TDisconnectReason::Reasons) {
+ DisconnectByReason[reason] = disconnectReasonGroup->GetNamedCounter("reason", reason, true);
+ }
+ }
+
+ Initialized = true;
+ }
+
+ // Returns the channel's counter bundle (by value — the bundle holds
+ // intrusive pointers), falling back to the shared "other" bundle.
+ TOutputChannel GetOutputChannel(ui16 index) const {
+ Y_VERIFY(Initialized);
+ const auto it = OutputChannels.find(index);
+ return it != OutputChannels.end() ? it->second : OtherOutputChannel;
+ }
+
+ private:
+ NMonitoring::TDynamicCounters::TCounterPtr SessionDeaths;
+ NMonitoring::TDynamicCounters::TCounterPtr HandshakeFails;
+ NMonitoring::TDynamicCounters::TCounterPtr Connected;
+ NMonitoring::TDynamicCounters::TCounterPtr Disconnections;
+ NMonitoring::TDynamicCounters::TCounterPtr InflightDataAmount;
+ NMonitoring::TDynamicCounters::TCounterPtr InflyLimitReach;
+ NMonitoring::TDynamicCounters::TCounterPtr OutputBuffersTotalSize;
+ NMonitoring::TDynamicCounters::TCounterPtr QueueUtilization;
+ NMonitoring::TDynamicCounters::TCounterPtr SubscribersCount;
+ NMonitoring::TDynamicCounters::TCounterPtr SendSyscalls;
+ NMonitoring::TDynamicCounters::TCounterPtr ClockSkewMicrosec;
+ NMonitoring::TDynamicCounters::TCounterPtr RecvSyscalls;
+ NMonitoring::TDynamicCounters::TCounterPtr UsefulReadWakeups;
+ NMonitoring::TDynamicCounters::TCounterPtr SpuriousReadWakeups;
+ NMonitoring::TDynamicCounters::TCounterPtr UsefulWriteWakeups;
+ NMonitoring::TDynamicCounters::TCounterPtr SpuriousWriteWakeups;
+ NMon::THistogramCounterHelper LegacyPingTimeHist;
+ NMonitoring::THistogramPtr PingTimeHistogram;
+
+ std::unordered_map<ui16, TOutputChannel> OutputChannels;
+ TOutputChannel OtherOutputChannel;
+ TInputChannels InputChannels;
+ THashMap<TString, NMonitoring::TDynamicCounters::TCounterPtr> DisconnectByReason;
+
+ NMonitoring::TDynamicCounters::TCounterPtr TotalBytesWritten, TotalBytesRead;
+ };
+
+ class TInterconnectMetrics: public IInterconnectMetrics {
+ public:
+ struct TOutputChannel {
+ NMonitoring::IRate* Traffic;
+ NMonitoring::IRate* Events;
+ NMonitoring::IRate* OutgoingTraffic;
+ NMonitoring::IRate* OutgoingEvents;
+
+ TOutputChannel() = default;
+
+ TOutputChannel(const std::shared_ptr<NMonitoring::IMetricRegistry>& metrics,
+ NMonitoring::IRate* traffic,
+ NMonitoring::IRate* events)
+ : Traffic(traffic)
+ , Events(events)
+ , OutgoingTraffic(metrics->Rate(NMonitoring::MakeLabels({{"sensor", "interconnect.outgoing_traffic"}})))
+ , OutgoingEvents(metrics->Rate(NMonitoring::MakeLabels({{"sensor", "interconnect.outgoing_events"}})))
+ {}
+
+ TOutputChannel(const TOutputChannel&) = default;
+ };
+
+ struct TInputChannel {
+ NMonitoring::IRate* Traffic;
+ NMonitoring::IRate* Events;
+ NMonitoring::IRate* ScopeErrors;
+ NMonitoring::IRate* IncomingTraffic;
+ NMonitoring::IRate* IncomingEvents;
+
+ TInputChannel() = default;
+
+ TInputChannel(const std::shared_ptr<NMonitoring::IMetricRegistry>& metrics,
+ NMonitoring::IRate* traffic, NMonitoring::IRate* events,
+ NMonitoring::IRate* scopeErrors)
+ : Traffic(traffic)
+ , Events(events)
+ , ScopeErrors(scopeErrors)
+ , IncomingTraffic(metrics->Rate(NMonitoring::MakeLabels({{"sensor", "interconnect.incoming_traffic"}})))
+ , IncomingEvents(metrics->Rate(NMonitoring::MakeLabels({{"sensor", "interconnect.incoming_events"}})))
+ {}
+
+ TInputChannel(const TInputChannel&) = default;
+ };
+
+ struct TInputChannels : std::unordered_map<ui16, TInputChannel> {
+ TInputChannel OtherInputChannel;
+
+ TInputChannels() = default;
+
+ TInputChannels(const std::shared_ptr<NMonitoring::IMetricRegistry>& metrics,
+ const std::unordered_map<ui16, TString>& names,
+ NMonitoring::IRate* traffic, NMonitoring::IRate* events,
+ NMonitoring::IRate* scopeErrors)
+ : OtherInputChannel(std::make_shared<NMonitoring::TMetricSubRegistry>(
+ NMonitoring::TLabels{{"channel", "other"}}, metrics), traffic, events, scopeErrors)
+ {
+ for (const auto& [id, name] : names) {
+ try_emplace(id, std::make_shared<NMonitoring::TMetricSubRegistry>(NMonitoring::TLabels{{"channel", name}}, metrics),
+ traffic, events, scopeErrors);
+ }
+ }
+
+ TInputChannels(const TInputChannels&) = default;
+
+ const TInputChannel& Get(ui16 id) const {
+ const auto it = find(id);
+ return it != end() ? it->second : OtherInputChannel;
+ }
+ };
+
+ TInterconnectMetrics(const TInterconnectProxyCommon::TPtr& common)
+ : Common(common)
+ , MergePerDataCenterMetrics_(common->Settings.MergePerDataCenterCounters)
+ , MergePerPeerMetrics_(common->Settings.MergePerPeerCounters)
+ , Metrics_(common->Metrics)
+ , AdaptiveMetrics_(MergePerDataCenterMetrics_
+ ? PerDataCenterMetrics_ :
+ MergePerPeerMetrics_ ? Metrics_ : PerSessionMetrics_)
+ {}
+
    // --- Trivial forwarding mutators ---------------------------------------
    // Each method forwards to a sensor pointer created by SetPeerInfo();
    // callers must ensure SetPeerInfo() ran first, otherwise the pointers
    // are uninitialized.

    void AddInflightDataAmount(ui64 value) override {
        InflightDataAmount_->Add(value);
    }

    // NOTE(review): `value` is ui64, so `-value` relies on unsigned
    // wrap-around to express a decrement — verify the sensor's Add()
    // interprets the wrapped value as intended.
    void SubInflightDataAmount(ui64 value) override {
        InflightDataAmount_->Add(-value);
    }

    void AddTotalBytesWritten(ui64 value) override {
        TotalBytesWritten_->Add(value);
    }

    // Clock skew relative to the peer, signed microseconds.
    void SetClockSkewMicrosec(i64 value) override {
        ClockSkewMicrosec_->Set(value);
    }

    void IncSessionDeaths() override {
        SessionDeaths_->Inc();
    }

    void IncHandshakeFails() override {
        HandshakeFails_->Inc();
    }

    // 1 = connected, 0 = disconnected (gauge semantics).
    void SetConnected(ui32 value) override {
        Connected_->Set(value);
    }

    void IncSubscribersCount() override {
        SubscribersCount_->Inc();
    }

    // Same unsigned-negation caveat as SubInflightDataAmount above.
    void SubSubscribersCount(ui32 value) override {
        SubscribersCount_->Add(-value);
    }

    void SubOutputBuffersTotalSize(ui64 value) override {
        OutputBuffersTotalSize_->Add(-value);
    }

    void AddOutputBuffersTotalSize(ui64 value) override {
        OutputBuffersTotalSize_->Add(value);
    }

    ui64 GetOutputBuffersTotalSize() const override {
        return OutputBuffersTotalSize_->Get();
    }

    void IncDisconnections() override {
        Disconnections_->Inc();
    }

    void IncUsefulWriteWakeups() override {
        UsefulWriteWakeups_->Inc();
    }

    void IncSpuriousWriteWakeups() override {
        SpuriousWriteWakeups_->Inc();
    }

    void IncSendSyscalls() override {
        SendSyscalls_->Inc();
    }

    void IncInflyLimitReach() override {
        InflyLimitReach_->Inc();
    }

    void IncUsefulReadWakeups() override {
        UsefulReadWakeups_->Inc();
    }

    void IncSpuriousReadWakeups() override {
        SpuriousReadWakeups_->Inc();
    }

    // Bumps the per-reason disconnect counter; unknown reason strings
    // (not pre-registered in SetPeerInfo()) are silently ignored.
    void IncDisconnectByReason(const TString& s) override {
        if (auto it = DisconnectByReason_.find(s); it != DisconnectByReason_.end()) {
            it->second->Inc();
        }
    }
+
    // Adds `incomingTraffic` bytes to the per-channel incoming traffic rate
    // (falls back to the "other" channel bundle for unknown ids).
    void AddInputChannelsIncomingTraffic(ui16 channel, ui64 incomingTraffic) override {
        auto& ch = InputChannels_.Get(channel);
        ch.IncomingTraffic->Add(incomingTraffic);
    }

    // Counts one incoming event on the given input channel.
    void IncInputChannelsIncomingEvents(ui16 channel) override {
        auto& ch = InputChannels_.Get(channel);
        ch.IncomingEvents->Inc();
    }

    void IncRecvSyscalls() override {
        RecvSyscalls_->Inc();
    }

    void AddTotalBytesRead(ui64 value) override {
        TotalBytesRead_->Add(value);
    }

    // Records one ping round-trip (microseconds) into the histogram.
    void UpdateLegacyPingTimeHist(ui64 value) override {
        PingTimeHistogram_->Record(value);
    }
+
+ void UpdateOutputChannelTraffic(ui16 channel, ui64 value) override {
+ if (GetOutputChannel(channel).OutgoingTraffic) {
+ GetOutputChannel(channel).OutgoingTraffic->Add(value);
+ }
+ if (GetOutputChannel(channel).Traffic) {
+ GetOutputChannel(channel).Traffic->Add(value);
+ }
+ }
+
+ void UpdateOutputChannelEvents(ui16 channel) override {
+ if (GetOutputChannel(channel).OutgoingEvents) {
+ GetOutputChannel(channel).OutgoingEvents->Inc();
+ }
+ if (GetOutputChannel(channel).Events) {
+ GetOutputChannel(channel).Events->Inc();
+ }
+ }
+
    // (Re)binds this metrics object to a concrete peer. Rebuilds the
    // per-session and/or per-datacenter sub-registries when the peer name or
    // datacenter id changed, then (re)creates all sensors whose registry was
    // rebuilt. Must be called before any of the mutators above.
    void SetPeerInfo(const TString& name, const TString& dataCenterId) override {
        // std::exchange stores the new value and yields the previous one, so
        // this both updates the field and detects a change in one expression.
        if (name != std::exchange(HumanFriendlyPeerHostName, name)) {
            PerSessionMetrics_.reset();
        }
        // NOTE(review): presumably suppresses a valgrind uninitialized-read
        // report on the optional's storage — confirm why this is needed here.
        VALGRIND_MAKE_READABLE(&DataCenterId, sizeof(DataCenterId));
        if (dataCenterId != std::exchange(DataCenterId, dataCenterId)) {
            PerDataCenterMetrics_.reset();
        }

        const bool updatePerDataCenter = !PerDataCenterMetrics_ && MergePerDataCenterMetrics_;
        if (updatePerDataCenter) {
            PerDataCenterMetrics_ = std::make_shared<NMonitoring::TMetricSubRegistry>(
                NMonitoring::TLabels{{"datacenter_id", *DataCenterId}}, Metrics_);
        }

        // Per-session registry chains off the per-datacenter one when merging
        // by datacenter, so it must be rebuilt whenever that parent was.
        const bool updatePerSession = !PerSessionMetrics_ || updatePerDataCenter;
        if (updatePerSession) {
            auto base = MergePerDataCenterMetrics_ ? PerDataCenterMetrics_ : Metrics_;
            PerSessionMetrics_ = std::make_shared<NMonitoring::TMetricSubRegistry>(
                NMonitoring::TLabels{{"peer", *HumanFriendlyPeerHostName}}, base);
        }

        const bool updateGlobal = !Initialized_;

        // AdaptiveMetrics_ is a reference to whichever registry the merge
        // settings selected in the constructor; refresh its sensors exactly
        // when that underlying registry was refreshed.
        const bool updateAdaptive =
            &AdaptiveMetrics_ == &Metrics_ ? updateGlobal :
            &AdaptiveMetrics_ == &PerSessionMetrics_ ? updatePerSession :
            &AdaptiveMetrics_ == &PerDataCenterMetrics_ ? updatePerDataCenter :
            false;

        auto createRate = [](std::shared_ptr<NMonitoring::IMetricRegistry> metrics, TStringBuf name) mutable {
            return metrics->Rate(NMonitoring::MakeLabels(NMonitoring::TLabels{{"sensor", name}}));
        };
        auto createIntGauge = [](std::shared_ptr<NMonitoring::IMetricRegistry> metrics, TStringBuf name) mutable {
            return metrics->IntGauge(NMonitoring::MakeLabels(NMonitoring::TLabels{{"sensor", name}}));
        };

        if (updatePerSession) {
            Connected_ = createIntGauge(PerSessionMetrics_, "interconnect.connected");
            Disconnections_ = createRate(PerSessionMetrics_, "interconnect.disconnections");
            ClockSkewMicrosec_ = createIntGauge(PerSessionMetrics_, "interconnect.clock_skew_microsec");
            Traffic_ = createRate(PerSessionMetrics_, "interconnect.traffic");
            Events_ = createRate(PerSessionMetrics_, "interconnect.events");
            ScopeErrors_ = createRate(PerSessionMetrics_, "interconnect.scope_errors");

            // NOTE(review): channel sub-registries are parented on the global
            // Metrics_, not PerSessionMetrics_ — looks intentional (channel
            // breakdown is not per-peer) but worth confirming.
            for (const auto& [id, name] : Common->ChannelName) {
                OutputChannels_.try_emplace(id, std::make_shared<NMonitoring::TMetricSubRegistry>(
                    NMonitoring::TLabels{{"channel", name}}, Metrics_), Traffic_, Events_);
            }
            OtherOutputChannel_ = TOutputChannel(std::make_shared<NMonitoring::TMetricSubRegistry>(
                NMonitoring::TLabels{{"channel", "other"}}, Metrics_), Traffic_, Events_);

            InputChannels_ = TInputChannels(Metrics_, Common->ChannelName, Traffic_, Events_, ScopeErrors_);
        }

        if (updateAdaptive) {
            SessionDeaths_ = createRate(AdaptiveMetrics_, "interconnect.session_deaths");
            HandshakeFails_ = createRate(AdaptiveMetrics_, "interconnect.handshake_fails");
            InflyLimitReach_ = createRate(AdaptiveMetrics_, "interconnect.infly_limit_reach");
            InflightDataAmount_ = createRate(AdaptiveMetrics_, "interconnect.inflight_data");
            PingTimeHistogram_ = AdaptiveMetrics_->HistogramRate(
                NMonitoring::MakeLabels({{"sensor", "interconnect.ping_time_us"}}), NMonitoring::ExponentialHistogram(18, 2, 125));
        }

        if (updateGlobal) {
            OutputBuffersTotalSize_ = createRate(Metrics_, "interconnect.output_buffers_total_size");
            SendSyscalls_ = createRate(Metrics_, "interconnect.send_syscalls");
            RecvSyscalls_ = createRate(Metrics_, "interconnect.recv_syscalls");
            SpuriousReadWakeups_ = createRate(Metrics_, "interconnect.spurious_read_wakeups");
            UsefulReadWakeups_ = createRate(Metrics_, "interconnect.useful_read_wakeups");
            SpuriousWriteWakeups_ = createRate(Metrics_, "interconnect.spurious_write_wakeups");
            UsefulWriteWakeups_ = createRate(Metrics_, "interconnect.useful_write_wakeups");
            // NOTE(review): SubscribersCount_ is created from AdaptiveMetrics_
            // but only refreshed under updateGlobal — if AdaptiveMetrics_ is a
            // per-session registry that later gets rebuilt, this sensor keeps
            // pointing at the old registry. Confirm whether that is intended.
            SubscribersCount_ = createIntGauge(AdaptiveMetrics_, "interconnect.subscribers_count");
            TotalBytesWritten_ = createRate(Metrics_, "interconnect.total_bytes_written");
            TotalBytesRead_ = createRate(Metrics_, "interconnect.total_bytes_read");

            // Pre-register one counter per known disconnect reason so that
            // IncDisconnectByReason() can be a pure lookup.
            for (const char *reason : TDisconnectReason::Reasons) {
                DisconnectByReason_[reason] = Metrics_->Rate(
                    NMonitoring::MakeLabels({
                        {"sensor", "interconnect.disconnect_reason"},
                        {"reason", reason},
                    }));
            }
        }

        Initialized_ = true;
    }
+
+ TOutputChannel GetOutputChannel(ui16 index) const {
+ Y_VERIFY(Initialized_);
+ const auto it = OutputChannels_.find(index);
+ return it != OutputChannels_.end() ? it->second : OtherOutputChannel_;
+ }
+
    private:
        // Shared proxy configuration (channel names, settings, root registry).
        const TInterconnectProxyCommon::TPtr Common;
        const bool MergePerDataCenterMetrics_;
        const bool MergePerPeerMetrics_;
        // Registry hierarchy: global -> per-datacenter -> per-session.
        std::shared_ptr<NMonitoring::IMetricRegistry> Metrics_;
        std::shared_ptr<NMonitoring::IMetricRegistry> PerSessionMetrics_;
        std::shared_ptr<NMonitoring::IMetricRegistry> PerDataCenterMetrics_;
        // Reference to one of the three registries above, chosen once in the
        // constructor according to the merge settings.
        std::shared_ptr<NMonitoring::IMetricRegistry>& AdaptiveMetrics_;
        // Set by the first SetPeerInfo() call; guards sensor usage.
        bool Initialized_ = false;

        // All raw sensor pointers below are owned by their registries and are
        // only valid after SetPeerInfo() has run.
        NMonitoring::IRate* Traffic_;

        NMonitoring::IRate* Events_;
        NMonitoring::IRate* ScopeErrors_;
        NMonitoring::IRate* Disconnections_;
        NMonitoring::IIntGauge* Connected_;

        NMonitoring::IRate* SessionDeaths_;
        NMonitoring::IRate* HandshakeFails_;
        NMonitoring::IRate* InflyLimitReach_;
        NMonitoring::IRate* InflightDataAmount_;
        NMonitoring::IRate* OutputBuffersTotalSize_;
        NMonitoring::IIntGauge* SubscribersCount_;
        NMonitoring::IRate* SendSyscalls_;
        NMonitoring::IRate* RecvSyscalls_;
        NMonitoring::IRate* SpuriousWriteWakeups_;
        NMonitoring::IRate* UsefulWriteWakeups_;
        NMonitoring::IRate* SpuriousReadWakeups_;
        NMonitoring::IRate* UsefulReadWakeups_;
        NMonitoring::IIntGauge* ClockSkewMicrosec_;

        NMonitoring::IHistogram* PingTimeHistogram_;

        // Per-channel bundles plus the catch-all for unnamed channels.
        std::unordered_map<ui16, TOutputChannel> OutputChannels_;
        TOutputChannel OtherOutputChannel_;
        TInputChannels InputChannels_;

        // Pre-registered disconnect-reason counters (filled in SetPeerInfo()).
        THashMap<TString, NMonitoring::IRate*> DisconnectByReason_;

        NMonitoring::IRate* TotalBytesWritten_;
        NMonitoring::IRate* TotalBytesRead_;
    };
+
+} // namespace
+
// Factory for the legacy dynamic-counters implementation of the metrics
// interface (TInterconnectCounters is defined earlier in this file).
std::unique_ptr<IInterconnectMetrics> CreateInterconnectCounters(const TInterconnectProxyCommon::TPtr& common) {
    return std::make_unique<TInterconnectCounters>(common);
}

// Factory for the metric-registry-based implementation defined above.
std::unique_ptr<IInterconnectMetrics> CreateInterconnectMetrics(const TInterconnectProxyCommon::TPtr& common) {
    return std::make_unique<TInterconnectMetrics>(common);
}
+
+} // NActors
diff --git a/library/cpp/actors/interconnect/interconnect_counters.h b/library/cpp/actors/interconnect/interconnect_counters.h
new file mode 100644
index 0000000000..e30f03a0bc
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_counters.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <library/cpp/actors/helpers/mon_histogram_helper.h>
+
+#include <util/system/valgrind.h>
+
+#include "types.h"
+
+#include "interconnect_common.h"
+
+#include <memory>
+#include <optional>
+
+namespace NActors {
+
// Abstract sink for interconnect session/channel statistics. Two concrete
// implementations exist (legacy dynamic counters and metric registry); the
// session code talks only to this interface.
class IInterconnectMetrics {
public:
    virtual ~IInterconnectMetrics() = default;

    // Bytes queued to the peer but not yet confirmed.
    virtual void AddInflightDataAmount(ui64 value) = 0;
    virtual void SubInflightDataAmount(ui64 value) = 0;
    virtual void AddTotalBytesWritten(ui64 value) = 0;
    // Signed clock skew relative to the peer, in microseconds.
    virtual void SetClockSkewMicrosec(i64 value) = 0;
    virtual void IncSessionDeaths() = 0;
    virtual void IncHandshakeFails() = 0;
    // 1 = session connected, 0 = disconnected.
    virtual void SetConnected(ui32 value) = 0;
    virtual void IncSubscribersCount() = 0;
    virtual void SubSubscribersCount(ui32 value) = 0;
    virtual void SubOutputBuffersTotalSize(ui64 value) = 0;
    virtual void AddOutputBuffersTotalSize(ui64 value) = 0;
    virtual ui64 GetOutputBuffersTotalSize() const = 0;
    virtual void IncDisconnections() = 0;
    // Wakeup classification: "useful" wakeups found work to do,
    // "spurious" ones did not.
    virtual void IncUsefulWriteWakeups() = 0;
    virtual void IncSpuriousWriteWakeups() = 0;
    virtual void IncSendSyscalls() = 0;
    virtual void IncInflyLimitReach() = 0;
    // `s` must be one of the pre-registered disconnect reason strings.
    virtual void IncDisconnectByReason(const TString& s) = 0;
    virtual void IncUsefulReadWakeups() = 0;
    virtual void IncSpuriousReadWakeups() = 0;
    // Binds the metrics to a concrete peer; must be called before the
    // mutators above are used.
    virtual void SetPeerInfo(const TString& name, const TString& dataCenterId) = 0;
    virtual void AddInputChannelsIncomingTraffic(ui16 channel, ui64 incomingTraffic) = 0;
    virtual void IncInputChannelsIncomingEvents(ui16 channel) = 0;
    virtual void IncRecvSyscalls() = 0;
    virtual void AddTotalBytesRead(ui64 value) = 0;
    // Records one ping round-trip time (microseconds).
    virtual void UpdateLegacyPingTimeHist(ui64 value) = 0;
    virtual void UpdateOutputChannelTraffic(ui16 channel, ui64 value) = 0;
    virtual void UpdateOutputChannelEvents(ui16 channel) = 0;
    // Peer host name as last supplied via SetPeerInfo(); empty before that.
    TString GetHumanFriendlyPeerHostName() const {
        return HumanFriendlyPeerHostName.value_or(TString());
    }

protected:
    // Both optionals are set by SetPeerInfo() in the implementations.
    std::optional<TString> DataCenterId;
    std::optional<TString> HumanFriendlyPeerHostName;
};

// Factories for the two implementations (defined in interconnect_counters.cpp).
std::unique_ptr<IInterconnectMetrics> CreateInterconnectCounters(const NActors::TInterconnectProxyCommon::TPtr& common);
std::unique_ptr<IInterconnectMetrics> CreateInterconnectMetrics(const NActors::TInterconnectProxyCommon::TPtr& common);
+} // NActors
diff --git a/library/cpp/actors/interconnect/interconnect_handshake.cpp b/library/cpp/actors/interconnect/interconnect_handshake.cpp
new file mode 100644
index 0000000000..9ede998d8e
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_handshake.cpp
@@ -0,0 +1,995 @@
+#include "interconnect_handshake.h"
+#include "interconnect_tcp_proxy.h"
+
+#include <library/cpp/actors/core/actor_coroutine.h>
+#include <library/cpp/actors/core/log.h>
+#include <library/cpp/actors/protos/services_common.pb.h>
+#include <util/system/getpid.h>
+
+#include <google/protobuf/text_format.h>
+
+#include <variant>
+
+namespace NActors {
+ static constexpr size_t StackSize = 64 * 1024; // 64k should be enough
+
+ class THandshakeActor
+ : public TActorCoroImpl
+ , public TInterconnectLoggingBase
+ {
+ struct TExHandshakeFailed : yexception {};
+
+ static constexpr TDuration ResolveTimeout = TDuration::Seconds(1);
+
+#pragma pack(push, 1)
+
        // First packet exchanged by both handshake sides. Declared inside a
        // #pragma pack(1) region so the wire layout is exactly the struct
        // layout; integrity is protected by a CRC32C checksum of the header.
        struct TInitialPacket {
            struct {
                TActorId SelfVirtualId;  // sender's session virtual actor id
                TActorId PeerVirtualId;  // sender's view of the peer's id
                ui64 NextPacket;         // next expected packet serial
                ui64 Version;            // protocol version of the sender
            } Header;
            ui32 Checksum;               // CRC32C over Header

            TInitialPacket() = default;

            TInitialPacket(const TActorId& self, const TActorId& peer, ui64 nextPacket, ui64 version) {
                Header.SelfVirtualId = self;
                Header.PeerVirtualId = peer;
                Header.NextPacket = nextPacket;
                Header.Version = version;
                Checksum = Crc32cExtendMSanCompatible(0, &Header, sizeof(Header));
            }

            // Recomputes the checksum and compares with the stored one.
            bool Check() const {
                return Checksum == Crc32cExtendMSanCompatible(0, &Header, sizeof(Header));
            }

            // Human-readable dump for logging.
            TString ToString() const {
                return TStringBuilder()
                    << "{SelfVirtualId# " << Header.SelfVirtualId.ToString()
                    << " PeerVirtualId# " << Header.PeerVirtualId.ToString()
                    << " NextPacket# " << Header.NextPacket
                    << " Version# " << Header.Version
                    << "}";
            }
        };
+
        // Length-prefixed frame header for the extended (protobuf) part of the
        // handshake. The checksum covers the Size field plus the payload, so
        // a corrupted length cannot pass validation.
        struct TExHeader {
            static constexpr ui32 MaxSize = 1024 * 1024;  // payload size cap

            ui32 Checksum;
            ui32 Size;

            ui32 CalculateChecksum(const void* data, size_t len) const {
                return Crc32cExtendMSanCompatible(Crc32cExtendMSanCompatible(0, &Size, sizeof(Size)), data, len);
            }

            // Stamps the checksum for the given payload.
            void Sign(const void* data, size_t len) {
                Checksum = CalculateChecksum(data, len);
            }

            // Validates the stored checksum against the payload.
            bool Check(const void* data, size_t len) const {
                return Checksum == CalculateChecksum(data, len);
            }
        };
+
+#pragma pack(pop)
+
    private:
        TInterconnectProxyCommon::TPtr Common;   // shared proxy configuration
        TActorId SelfVirtualId;                  // our session virtual actor id
        TActorId PeerVirtualId;                  // peer's session virtual actor id
        ui32 PeerNodeId = 0;                     // 0 until known (incoming side)
        ui64 NextPacketToPeer = 0;
        TMaybe<ui64> NextPacketFromPeer; // will be obtained from incoming initial packet
        TString PeerHostName;
        TString PeerAddr;                        // textual peer address (incoming side)
        TSocketPtr Socket;
        TPollerToken::TPtr PollerToken;          // poller registration for Socket
        TString State;                           // current step name, used in timeout diagnostics
        TString HandshakeKind;                   // "incoming"/"outgoing handshake", for logging
        TMaybe<THolder<TProgramInfo>> ProgramInfo; // filled in in case of successful handshake; even if null
        TSessionParams Params;                   // negotiated session parameters
        bool ResolveTimedOut = false;
        THashMap<ui32, TInstant> LastLogNotice;  // per-node log throttling timestamps
        const TDuration MuteDuration = TDuration::Seconds(15);
        TInstant Deadline;                       // absolute handshake deadline
+
    public:
        static constexpr IActor::EActivityType ActorActivityType() {
            return IActor::INTERCONNECT_HANDSHAKE;
        }

        // Outgoing handshake: we know the peer node and, for session
        // continuation, the virtual ids and next packet serial.
        THandshakeActor(TInterconnectProxyCommon::TPtr common, const TActorId& self, const TActorId& peer,
                        ui32 nodeId, ui64 nextPacket, TString peerHostName, TSessionParams params)
            : TActorCoroImpl(StackSize, true, true) // allow unhandled poison pills and dtors
            , Common(std::move(common))
            , SelfVirtualId(self)
            , PeerVirtualId(peer)
            , PeerNodeId(nodeId)
            , NextPacketToPeer(nextPacket)
            , PeerHostName(std::move(peerHostName))
            , HandshakeKind("outgoing handshake")
            , Params(std::move(params))
        {
            Y_VERIFY(SelfVirtualId);
            Y_VERIFY(SelfVirtualId.NodeId());
            Y_VERIFY(PeerNodeId);
        }

        // Incoming handshake: only the accepted socket is known; peer identity
        // is learned from the initial packet. Extracts the remote address into
        // PeerAddr for diagnostics (empty on failure).
        THandshakeActor(TInterconnectProxyCommon::TPtr common, TSocketPtr socket)
            : TActorCoroImpl(StackSize, true, true) // allow unhandled poison pills and dtors
            , Common(std::move(common))
            , Socket(std::move(socket))
            , HandshakeKind("incoming handshake")
        {
            Y_VERIFY(Socket);
            PeerAddr = TString::Uninitialized(1024);
            if (GetRemoteAddr(*Socket, PeerAddr.Detach(), PeerAddr.size())) {
                PeerAddr.resize(strlen(PeerAddr.data()));
            } else {
                PeerAddr.clear();
            }
        }

        // Refreshes the log prefix; called when PeerNodeId becomes known.
        void UpdatePrefix() {
            SetPrefix(Sprintf("Handshake %s [node %" PRIu32 "]", SelfActorId.ToString().data(), PeerNodeId));
        }
+
        // Coroutine entry point: performs the whole handshake, then either
        // reports success to the proxy (TEvHandshakeDone) or exits after a
        // failure. Failures are signaled internally by throwing
        // TExHandshakeFailed (thrown by Fail()), which clears ProgramInfo.
        void Run() override {
            UpdatePrefix();

            // set up overall handshake process timer
            TDuration timeout = Common->Settings.Handshake;
            if (timeout == TDuration::Zero()) {
                timeout = DEFAULT_HANDSHAKE_TIMEOUT;
            }
            // reserve extra time for the two possible resolve steps
            timeout += ResolveTimeout * 2;
            Deadline = Now() + timeout;
            Schedule(Deadline, new TEvents::TEvWakeup);

            try {
                // a pre-set Socket means we were spawned for an accepted connection
                if (Socket) {
                    PerformIncomingHandshake();
                } else {
                    PerformOutgoingHandshake();
                }

                // establish encrypted channel, or, in case when encryption is disabled, check if it matches settings
                if (ProgramInfo) {
                    if (Params.Encryption) {
                        EstablishSecureConnection();
                    } else if (Common->Settings.EncryptionMode == EEncryptionMode::REQUIRED && !Params.AuthOnly) {
                        Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "Peer doesn't support encryption, which is required");
                    }
                }
            } catch (const TExHandshakeFailed&) {
                ProgramInfo.Clear();
            }

            if (ProgramInfo) {
                LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH04", NLog::PRI_INFO, "handshake succeeded");
                Y_VERIFY(NextPacketFromPeer);
                if (PollerToken) {
                    Y_VERIFY(PollerToken->RefCount() == 1);
                    PollerToken.Reset(); // ensure we are going to destroy poller token here as we will re-register the socket within other actor
                }
                // hand the established socket and session parameters to the proxy
                SendToProxy(MakeHolder<TEvHandshakeDone>(std::move(Socket), PeerVirtualId, SelfVirtualId,
                    *NextPacketFromPeer, ProgramInfo->Release(), std::move(Params)));
            }

            Socket.Reset();
        }
+
        // Upgrades the plain socket to TLS via the proxy-provided secure
        // socket, then drives the TLS handshake to completion, waiting on the
        // poller for WANT_READ/WANT_WRITE. In auth-only mode the TLS layer is
        // used just to authenticate the peer and is then detached again.
        void EstablishSecureConnection() {
            // we must hold the only reference before swapping out the socket
            Y_VERIFY(PollerToken && PollerToken->RefCount() == 1);
            PollerToken.Reset();
            auto ev = AskProxy<TEvSecureSocket>(MakeHolder<TEvGetSecureSocket>(Socket), "AskProxy(TEvSecureContext)");
            Socket = std::move(ev->Get()->Socket);
            RegisterInPoller();
            const ui32 myNodeId = GetActorSystem()->NodeId;
            const bool server = myNodeId < PeerNodeId; // keep server/client role permanent to enable easy TLS session resuming
            for (;;) {
                TString err;
                auto& secure = static_cast<NInterconnect::TSecureSocket&>(*Socket);
                switch (secure.Establish(server, Params.AuthOnly, err)) {
                    case NInterconnect::TSecureSocket::EStatus::SUCCESS:
                        if (Params.AuthOnly) {
                            // peer authenticated; drop encryption and go back to the raw socket
                            Params.Encryption = false;
                            Params.AuthCN = secure.GetPeerCommonName();
                            Y_VERIFY(PollerToken && PollerToken->RefCount() == 1);
                            PollerToken.Reset();
                            Socket = secure.Detach();
                        }
                        return;

                    case NInterconnect::TSecureSocket::EStatus::ERROR:
                        // NOTE(review): Fail() presumably throws TExHandshakeFailed,
                        // making the fallthrough below unreachable — confirm.
                        Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, err, true);
                        [[fallthrough]];

                    case NInterconnect::TSecureSocket::EStatus::WANT_READ:
                        WaitPoller(true, false, "ReadEstablish");
                        break;

                    case NInterconnect::TSecureSocket::EStatus::WANT_WRITE:
                        WaitPoller(false, true, "WriteEstablish");
                        break;
                }
            }
        }
+
        // Handles events delivered to the coroutine outside an explicit wait:
        // the deadline wakeup fails the handshake; stale resolver and poller
        // notifications are ignored; anything else is a programming error.
        void ProcessUnexpectedEvent(TAutoPtr<IEventHandle> ev) override {
            switch (const ui32 type = ev->GetTypeRewrite()) {
                case TEvents::TSystem::Wakeup:
                    // overall handshake deadline hit; State names the step we were in
                    Fail(TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT, Sprintf("Handshake timed out, State# %s", State.data()), true);
                    [[fallthrough]];

                case ui32(ENetwork::NodeInfo):
                case TEvInterconnect::EvNodeAddress:
                case ui32(ENetwork::ResolveError):
                    break; // most likely a race with resolve timeout

                case TEvPollerReady::EventType:
                    break;

                default:
                    Y_FAIL("unexpected event 0x%08" PRIx32, type);
            }
        }
+
+ template<typename T>
+ void SetupVersionTag(T& proto) {
+ if (Common->VersionInfo) {
+ proto.SetVersionTag(Common->VersionInfo->Tag);
+ for (const TString& accepted : Common->VersionInfo->AcceptedTags) {
+ proto.AddAcceptedVersionTags(accepted);
+ }
+ }
+ }
+
+ template<typename T>
+ void SetupClusterUUID(T& proto) {
+ auto *pb = proto.MutableClusterUUIDs();
+ pb->SetClusterUUID(Common->ClusterUUID);
+ for (const TString& uuid : Common->AcceptUUID) {
+ pb->AddAcceptUUID(uuid);
+ }
+ }
+
        // Cross-checks version tags with the peer: the handshake proceeds if
        // either side accepts the other's tag; only a mutual mismatch invokes
        // errorCallback (which is expected to abort the handshake).
        template<typename T, typename TCallback>
        void ValidateVersionTag(const T& proto, TCallback&& errorCallback) {
            // check if we will accept peer's version tag (if peer provides one and if we have accepted list non-empty)
            if (Common->VersionInfo) {
                if (!proto.HasVersionTag()) {
                    LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH06", NLog::PRI_WARN,
                        "peer did not report VersionTag, accepting by default");
                } else if (!Common->VersionInfo->AcceptedTags.count(proto.GetVersionTag())) {
                    // we will not accept peer's tag, so check if remote peer would accept our version tag
                    size_t i;
                    for (i = 0; i < proto.AcceptedVersionTagsSize() && Common->VersionInfo->Tag != proto.GetAcceptedVersionTags(i); ++i)
                    {}
                    if (i == proto.AcceptedVersionTagsSize()) {
                        // peer will neither accept our version -- this is total failure
                        TStringStream s("local/peer version tags did not match accepted ones");
                        s << " local Tag# " << Common->VersionInfo->Tag << " accepted Tags# [";
                        bool first = true;
                        for (const auto& tag : Common->VersionInfo->AcceptedTags) {
                            s << (std::exchange(first, false) ? "" : " ") << tag;
                        }
                        s << "] peer Tag# " << proto.GetVersionTag() << " accepted Tags# [";
                        first = true;
                        for (const auto& tag : proto.GetAcceptedVersionTags()) {
                            s << (std::exchange(first, false) ? "" : " ") << tag;
                        }
                        s << "]";
                        errorCallback(s.Str());
                    }
                }
            }
        }
+
        // Validates cluster membership symmetrically: succeeds when our UUID
        // is in the peer's accept list OR the peer's UUID is in ours. Falls
        // back to the legacy single-`uuid` check when the peer predates the
        // symmetric protocol. An empty local accept list means promiscuous
        // mode. errorCallback is invoked (and expected to abort) on mismatch.
        template<typename T, typename TCallback>
        void ValidateClusterUUID(const T& proto, TCallback&& errorCallback, const TMaybe<TString>& uuid = {}) {
            auto formatList = [](const auto& list) {
                TStringStream s;
                s << "[";
                for (auto it = list.begin(); it != list.end(); ++it) {
                    if (it != list.begin()) {
                        s << " ";
                    }
                    s << *it;
                }
                s << "]";
                return s.Str();
            };
            if (!Common->AcceptUUID) {
                return; // promiscuous mode -- we accept every other peer
            }
            if (!proto.HasClusterUUIDs()) {
                if (uuid) {
                    // old-style checking, peer does not support symmetric protoocol
                    bool matching = false;
                    for (const TString& accepted : Common->AcceptUUID) {
                        if (*uuid == accepted) {
                            matching = true;
                            break;
                        }
                    }
                    if (!matching) {
                        errorCallback(Sprintf("Peer ClusterUUID# %s mismatch, AcceptUUID# %s", uuid->data(), formatList(Common->AcceptUUID).data()));
                    }
                }
                return; // remote side did not fill in this field -- old version, symmetric protocol is not supported
            }

            const auto& uuids = proto.GetClusterUUIDs();

            // check if our UUID matches remote accept list
            for (const TString& item : uuids.GetAcceptUUID()) {
                if (item == Common->ClusterUUID) {
                    return; // match
                }
            }

            // check if remote UUID matches our accept list
            const TString& remoteUUID = uuids.GetClusterUUID();
            for (const TString& item : Common->AcceptUUID) {
                if (item == remoteUUID) {
                    return; // match
                }
            }

            // no match
            errorCallback(Sprintf("Peer ClusterUUID# %s mismatch, AcceptUUID# %s", remoteUUID.data(), formatList(Common->AcceptUUID).data()));
        }
+
        // Extracts the peer's scope id from its protobuf representation.
        void ParsePeerScopeId(const NActorsInterconnect::TScopeId& proto) {
            Params.PeerScopeId = {proto.GetX1(), proto.GetX2()};
        }

        // Serializes our local scope id into an outgoing protobuf.
        void FillInScopeId(NActorsInterconnect::TScopeId& proto) {
            const TScopeId& scope = Common->LocalScopeId;
            proto.SetX1(scope.first);
            proto.SetX2(scope.second);
        }

        // Logs a protobuf message in single-line text form at DEBUG level,
        // prefixed with `msg`.
        template<typename T>
        void ReportProto(const T& protobuf, const char *msg) {
            auto formatString = [&] {
                google::protobuf::TextFormat::Printer p;
                p.SetSingleLineMode(true);
                TString s;
                p.PrintToString(protobuf, &s);
                return s;
            };
            LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH07", NLog::PRI_DEBUG, "%s %s", msg,
                formatString().data());
        }
+
        // Verifies connectivity back to the peer by opening a temporary
        // connection and asking it to confirm `cookie`. The current
        // Socket/PollerToken are swapped out for the duration and restored
        // afterwards (the swap/restore order is deliberate — do not reorder).
        // Returns false and fills *error when the check fails.
        // NOTE(review): failures inside the try-block are expected to surface
        // only as TExHandshakeFailed (thrown by Fail()); other exception
        // types would skip the state restore — confirm that invariant.
        bool CheckPeerCookie(const TString& cookie, TString *error) {
            // create a temporary socket to connect to the peer
            TSocketPtr tempSocket;
            std::swap(tempSocket, Socket);
            TPollerToken::TPtr tempPollerToken;
            std::swap(tempPollerToken, PollerToken);

            // set up virtual self id to ensure peer will not drop our connection
            char buf[12] = {'c', 'o', 'o', 'k', 'i', 'e', ' ', 'c', 'h', 'e', 'c', 'k'};
            SelfVirtualId = TActorId(SelfActorId.NodeId(), TStringBuf(buf, 12));

            bool success = true;
            try {
                // issue connection and send initial packet
                Connect(false);
                SendInitialPacket();

                // wait for basic response
                TInitialPacket response;
                ReceiveData(&response, sizeof(response), "ReceiveResponse");
                if (!response.Check()) {
                    Fail(TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT, "Initial packet CRC error");
                } else if (response.Header.Version != INTERCONNECT_PROTOCOL_VERSION) {
                    Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, Sprintf("Incompatible protocol %" PRIu64, response.Header.Version));
                }

                // issue cookie check request; all identity fields are zeroed
                // since this probe does not establish a real session
                NActorsInterconnect::THandshakeRequest request;
                request.SetProtocol(INTERCONNECT_PROTOCOL_VERSION);
                request.SetProgramPID(0);
                request.SetProgramStartTime(0);
                request.SetSerial(0);
                request.SetReceiverNodeId(0);
                request.SetSenderActorId(TString());
                request.SetCookie(cookie);
                request.SetDoCheckCookie(true);
                SendExBlock(request, "SendExBlockDoCheckCookie");

                // process cookie check reply
                NActorsInterconnect::THandshakeReply reply;
                if (!reply.ParseFromString(ReceiveExBlock("ReceiveExBlockDoCheckCookie"))) {
                    Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "Incorrect packet from peer");
                } else if (reply.HasCookieCheckResult() && !reply.GetCookieCheckResult()) {
                    Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "Cookie check error -- possible network problem");
                }
            } catch (const TExHandshakeFailed& e) {
                *error = e.what();
                success = false;
            }

            // restore state
            SelfVirtualId = TActorId();
            std::swap(tempSocket, Socket);
            std::swap(tempPollerToken, PollerToken);
            return success;
        }
+
        // Client side of the handshake. Connects, exchanges TInitialPacket,
        // then either (a) negotiates a brand-new session via the extended
        // protobuf exchange when PeerVirtualId is unset, or (b) validates a
        // session-continuation response against the known virtual ids.
        // On success ProgramInfo is set; all failures throw via Fail().
        void PerformOutgoingHandshake() {
            LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH01", NLog::PRI_DEBUG,
                "starting outgoing handshake");

            // perform connection
            Connect(true);

            // send initial request packet
            SendInitialPacket();

            TInitialPacket response;
            ReceiveData(&response, sizeof(response), "ReceiveResponse");
            if (!response.Check()) {
                Fail(TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT, "Initial packet CRC error");
            } else if (response.Header.Version != INTERCONNECT_PROTOCOL_VERSION) {
                Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, Sprintf("Incompatible protocol %" PRIu64, response.Header.Version));
            }

            // extract next packet
            NextPacketFromPeer = response.Header.NextPacket;

            if (!PeerVirtualId) {
                // creating new session -- we have to generate request
                NActorsInterconnect::THandshakeRequest request;

                request.SetProtocol(INTERCONNECT_PROTOCOL_VERSION);
                request.SetProgramPID(GetPID());
                request.SetProgramStartTime(Common->StartTime);
                request.SetSerial(SelfVirtualId.LocalId());
                request.SetReceiverNodeId(PeerNodeId);
                request.SetSenderActorId(SelfVirtualId.ToString());
                request.SetSenderHostName(Common->TechnicalSelfHostName);
                request.SetReceiverHostName(PeerHostName);

                if (Common->LocalScopeId != TScopeId()) {
                    FillInScopeId(*request.MutableClientScopeId());
                }

                if (Common->Cookie) {
                    request.SetCookie(Common->Cookie);
                }
                if (Common->ClusterUUID) {
                    request.SetUUID(Common->ClusterUUID);
                }
                SetupClusterUUID(request);
                SetupVersionTag(request);

                // optional padding to exercise larger handshake frames
                if (const ui32 size = Common->HandshakeBallastSize) {
                    TString ballast(size, 0);
                    char* data = ballast.Detach();
                    for (ui32 i = 0; i < size; ++i) {
                        data[i] = i;
                    }
                    request.SetBallast(ballast);
                }

                switch (Common->Settings.EncryptionMode) {
                    case EEncryptionMode::DISABLED:
                        break;

                    case EEncryptionMode::OPTIONAL:
                        request.SetRequireEncryption(false);
                        break;

                    case EEncryptionMode::REQUIRED:
                        request.SetRequireEncryption(true);
                        break;
                }

                request.SetRequestModernFrame(true);
                request.SetRequestAuthOnly(Common->Settings.TlsAuthOnly);

                SendExBlock(request, "ExRequest");

                NActorsInterconnect::THandshakeReply reply;
                if (!reply.ParseFromString(ReceiveExBlock("ExReply"))) {
                    Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "Incorrect THandshakeReply");
                }
                ReportProto(reply, "ReceiveExBlock ExReply");

                // ("Explaination" spelling matches the proto field name)
                if (reply.HasErrorExplaination()) {
                    Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "error from peer: " + reply.GetErrorExplaination());
                } else if (!reply.HasSuccess()) {
                    Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "empty reply");
                }

                auto generateError = [this](TString msg) {
                    Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, msg);
                };

                const auto& success = reply.GetSuccess();
                ValidateClusterUUID(success, generateError);
                ValidateVersionTag(success, generateError);

                const auto& s = success.GetSenderActorId();
                PeerVirtualId.Parse(s.data(), s.size());

                // recover flags
                Params.Encryption = success.GetStartEncryption();
                Params.UseModernFrame = success.GetUseModernFrame();
                Params.AuthOnly = Params.Encryption && success.GetAuthOnly();
                if (success.HasServerScopeId()) {
                    ParsePeerScopeId(success.GetServerScopeId());
                }

                // recover peer process info from peer's reply
                ProgramInfo = GetProgramInfo(success);
            } else if (!response.Header.SelfVirtualId) {
                // peer reported error -- empty ack was generated by proxy for this request
                Fail(TEvHandshakeFail::HANDSHAKE_FAIL_SESSION_MISMATCH, "Peer rejected session continuation handshake");
            } else if (response.Header.SelfVirtualId != PeerVirtualId || response.Header.PeerVirtualId != SelfVirtualId) {
                // resuming existing session; check that virtual ids of peers match each other
                Fail(TEvHandshakeFail::HANDSHAKE_FAIL_SESSION_MISMATCH, "Session virtual ID mismatch");
            } else {
                ProgramInfo.ConstructInPlace(); // successful handshake
            }
        }
+
+ void PerformIncomingHandshake() {
+ LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH02", NLog::PRI_DEBUG,
+ "starting incoming handshake");
+
+ // set up incoming socket
+ SetupSocket();
+
+ // wait for initial request packet
+ TInitialPacket request;
+ ReceiveData(&request, sizeof(request), "ReceiveRequest");
+ if (!request.Check()) {
+ Fail(TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT, "Initial packet CRC error");
+ } else if (request.Header.Version != INTERCONNECT_PROTOCOL_VERSION) {
+ Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, Sprintf("Incompatible protocol %" PRIu64, request.Header.Version));
+ }
+
+ // extract peer node id from the peer
+ PeerNodeId = request.Header.SelfVirtualId.NodeId();
+ if (!PeerNodeId) {
+ Y_VERIFY_DEBUG(false, "PeerNodeId is zero request# %s", request.ToString().data());
+ Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "SelfVirtualId.NodeId is empty in initial packet");
+ }
+ UpdatePrefix();
+
+ // extract next packet
+ NextPacketFromPeer = request.Header.NextPacket;
+
+ if (request.Header.PeerVirtualId) {
+ // issue request to the proxy and wait for the response
+ auto reply = AskProxy<TEvHandshakeAck, TEvHandshakeNak>(MakeHolder<TEvHandshakeAsk>(
+ request.Header.SelfVirtualId, request.Header.PeerVirtualId, request.Header.NextPacket),
+ "TEvHandshakeAsk");
+ if (auto *ack = reply->CastAsLocal<TEvHandshakeAck>()) {
+ // extract self/peer virtual ids
+ SelfVirtualId = ack->Self;
+ PeerVirtualId = request.Header.SelfVirtualId;
+ NextPacketToPeer = ack->NextPacket;
+ Params = ack->Params;
+
+ // only succeed in case when proxy returned valid SelfVirtualId; otherwise it wants us to terminate
+ // the handshake process and it does not expect the handshake reply
+ ProgramInfo.ConstructInPlace();
+ } else {
+ LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH08", NLog::PRI_NOTICE,
+ "Continuation request rejected by proxy");
+
+ // report continuation reject to peer
+ SelfVirtualId = TActorId();
+ PeerVirtualId = TActorId();
+ NextPacketToPeer = 0;
+ }
+
+ // issue response to the peer
+ SendInitialPacket();
+ } else {
+ // peer wants a new session, clear fields and send initial packet
+ SelfVirtualId = TActorId();
+ PeerVirtualId = TActorId();
+ NextPacketToPeer = 0;
+ SendInitialPacket();
+
+ // wait for extended request
+ auto ev = MakeHolder<TEvHandshakeRequest>();
+ auto& request = ev->Record;
+ if (!request.ParseFromString(ReceiveExBlock("ExRequest"))) {
+ Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "Incorrect THandshakeRequest");
+ }
+ ReportProto(request, "ReceiveExBlock ExRequest");
+
+ auto generateError = [this](TString msg) {
+ // issue reply to the peer to prevent repeating connection retries
+ NActorsInterconnect::THandshakeReply reply;
+ reply.SetErrorExplaination(msg);
+ SendExBlock(reply, "ExReply");
+
+ // terminate ths handshake
+ Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, msg);
+ };
+
+ // check request cookie
+ TString error;
+ if (request.HasDoCheckCookie()) {
+ NActorsInterconnect::THandshakeReply reply;
+ reply.SetCookieCheckResult(request.GetCookie() == Common->Cookie);
+ SendExBlock(reply, "ExReplyDoCheckCookie");
+ throw TExHandshakeFailed();
+ } else if (request.HasCookie() && !CheckPeerCookie(request.GetCookie(), &error)) {
+ generateError(TStringBuilder() << "Peer connectivity-checking failed, error# " << error);
+ }
+
+ // update log prefix with the reported peer host name
+ PeerHostName = request.GetSenderHostName();
+
+ // parse peer virtual id
+ const auto& str = request.GetSenderActorId();
+ PeerVirtualId.Parse(str.data(), str.size());
+
+ // validate request
+ ValidateClusterUUID(request, generateError, request.GetUUID());
+ if (request.GetReceiverNodeId() != SelfActorId.NodeId()) {
+ generateError(Sprintf("Incorrect ReceiverNodeId# %" PRIu32 " from the peer, expected# %" PRIu32,
+ request.GetReceiverNodeId(), SelfActorId.NodeId()));
+ } else if (request.GetReceiverHostName() != Common->TechnicalSelfHostName) {
+ generateError(Sprintf("ReceiverHostName# %s mismatch, expected# %s", request.GetReceiverHostName().data(),
+ Common->TechnicalSelfHostName.data()));
+ }
+ ValidateVersionTag(request, generateError);
+
+ // check peer node
+ auto peerNodeInfo = GetPeerNodeInfo();
+ if (!peerNodeInfo) {
+ generateError("Peer node not registered in nameservice");
+ } else if (peerNodeInfo->Host != request.GetSenderHostName()) {
+ generateError("SenderHostName mismatch");
+ }
+
+ // check request against encryption
+ switch (Common->Settings.EncryptionMode) {
+ case EEncryptionMode::DISABLED:
+ if (request.GetRequireEncryption()) {
+ generateError("Peer requested encryption, but it is disabled locally");
+ }
+ break;
+
+ case EEncryptionMode::OPTIONAL:
+ Params.Encryption = request.HasRequireEncryption();
+ break;
+
+ case EEncryptionMode::REQUIRED:
+ if (!request.HasRequireEncryption()) {
+ generateError("Peer did not request encryption, but it is required locally");
+ }
+ Params.Encryption = true;
+ break;
+ }
+
+ Params.UseModernFrame = request.GetRequestModernFrame();
+ Params.AuthOnly = Params.Encryption && request.GetRequestAuthOnly() && Common->Settings.TlsAuthOnly;
+
+ if (request.HasClientScopeId()) {
+ ParsePeerScopeId(request.GetClientScopeId());
+ }
+
+ // remember program info (assuming successful handshake)
+ ProgramInfo = GetProgramInfo(request);
+
+ // send to proxy
+ auto reply = AskProxy<TEvHandshakeReplyOK, TEvHandshakeReplyError>(std::move(ev), "TEvHandshakeRequest");
+
+ // parse it
+ if (auto ev = reply->CastAsLocal<TEvHandshakeReplyOK>()) {
+ // issue successful reply to the peer
+ auto& record = ev->Record;
+ Y_VERIFY(record.HasSuccess());
+ auto& success = *record.MutableSuccess();
+ SetupClusterUUID(success);
+ SetupVersionTag(success);
+ success.SetStartEncryption(Params.Encryption);
+ if (Common->LocalScopeId != TScopeId()) {
+ FillInScopeId(*success.MutableServerScopeId());
+ }
+ success.SetUseModernFrame(Params.UseModernFrame);
+ success.SetAuthOnly(Params.AuthOnly);
+ SendExBlock(record, "ExReply");
+
+ // extract sender actor id (self virtual id)
+ const auto& str = success.GetSenderActorId();
+ SelfVirtualId.Parse(str.data(), str.size());
+ } else if (auto ev = reply->CastAsLocal<TEvHandshakeReplyError>()) {
+ // in case of error just send reply to the peer and terminate handshake
+ SendExBlock(ev->Record, "ExReply");
+ ProgramInfo.Clear(); // do not issue reply to the proxy
+ } else {
+ Y_FAIL("unexpected event Type# 0x%08" PRIx32, reply->GetTypeRewrite());
+ }
+ }
+ }
+
+ template <typename T>
+ void SendExBlock(const T& proto, const char* what) {
+ TString data;
+ Y_PROTOBUF_SUPPRESS_NODISCARD proto.SerializeToString(&data);
+ Y_VERIFY(data.size() <= TExHeader::MaxSize);
+
+ ReportProto(proto, Sprintf("SendExBlock %s", what).data());
+
+ TExHeader header;
+ header.Size = data.size();
+ header.Sign(data.data(), data.size());
+ SendData(&header, sizeof(header), Sprintf("Send%sHeader", what));
+ SendData(data.data(), data.size(), Sprintf("Send%sData", what));
+ }
+
+ TString ReceiveExBlock(const char* what) {
+ TExHeader header;
+ ReceiveData(&header, sizeof(header), Sprintf("Receive%sHeader", what));
+ if (header.Size > TExHeader::MaxSize) {
+ Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "Incorrect extended header size");
+ }
+
+ TString data;
+ data.resize(header.Size);
+ ReceiveData(data.Detach(), data.size(), Sprintf("Receive%sData", what));
+
+ if (!header.Check(data.data(), data.size())) {
+ Fail(TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT, "Extended header CRC error");
+ }
+
+ return data;
+ }
+
+ private:
+ void SendToProxy(THolder<IEventBase> ev) {
+ Y_VERIFY(PeerNodeId);
+ Send(GetActorSystem()->InterconnectProxy(PeerNodeId), ev.Release());
+ }
+
// Waits for a single specific event type; records the human-readable
// handshake state (exposed via the State member) before blocking the
// coroutine on the base-class wait.
template <typename TEvent>
THolder<typename TEvent::THandle> WaitForSpecificEvent(TString state, TInstant deadline = TInstant::Max()) {
    State = std::move(state);
    return TActorCoroImpl::WaitForSpecificEvent<TEvent>(deadline);
}
+
// Same as above, but waits for any of two-or-more event types and returns
// the untyped handle (caller dispatches on the concrete type).
template <typename T1, typename T2, typename... TEvents>
THolder<IEventHandle> WaitForSpecificEvent(TString state, TInstant deadline = TInstant::Max()) {
    State = std::move(state);
    return TActorCoroImpl::WaitForSpecificEvent<T1, T2, TEvents...>(deadline);
}
+
// Sends an event to the owning proxy and suspends until the expected
// reply type arrives.
template <typename TEvent>
THolder<typename TEvent::THandle> AskProxy(THolder<IEventBase> ev, TString state) {
    SendToProxy(std::move(ev));
    return WaitForSpecificEvent<TEvent>(std::move(state));
}
+
// Sends an event to the owning proxy and suspends until any of the listed
// reply types arrives; returns the untyped handle for caller-side dispatch.
template <typename T1, typename T2, typename... TOther>
THolder<IEventHandle> AskProxy(THolder<IEventBase> ev, TString state) {
    SendToProxy(std::move(ev));
    return WaitForSpecificEvent<T1, T2, TOther...>(std::move(state));
}
+
+ void Fail(TEvHandshakeFail::EnumHandshakeFail reason, TString explanation, bool network = false) {
+ TString msg = Sprintf("%s Peer# %s(%s) %s%s", HandshakeKind.data(), PeerHostName ? PeerHostName.data() : "<unknown>",
+ PeerAddr.size() ? PeerAddr.data() : "<unknown>", ResolveTimedOut ? "[resolve timeout] " : "",
+ explanation.data());
+
+ if (network) {
+ TInstant now = Now();
+ TInstant prevLog = LastLogNotice[PeerNodeId];
+ NActors::NLog::EPriority logPriority = NActors::NLog::PRI_DEBUG;
+ if (now - prevLog > MuteDuration) {
+ logPriority = NActors::NLog::PRI_NOTICE;
+ LastLogNotice[PeerNodeId] = now;
+ }
+ LOG_LOG_NET_X(logPriority, PeerNodeId, "network-related error occured on handshake: %s", msg.data());
+ } else {
+ // calculate log severity based on failure type; permanent failures lead to error log messages
+ auto severity = reason == TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT
+ ? NActors::NLog::PRI_NOTICE
+ : NActors::NLog::PRI_INFO;
+
+ LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH03", severity, "handshake failed, explanation# %s", msg.data());
+ }
+
+ if (PeerNodeId) {
+ SendToProxy(MakeHolder<TEvHandshakeFail>(reason, std::move(msg)));
+ }
+
+ throw TExHandshakeFailed() << explanation;
+ }
+
+ private:
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // COMMUNICATION BLOCK
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
// Resolves the peer node's address via the nameservice, creates a socket of
// the matching address family and establishes a (non-blocking) TCP
// connection. All failure paths throw via Fail().
void Connect(bool updatePeerAddr) {
    // issue request to a nameservice to resolve peer node address
    Send(Common->NameserviceId, new TEvInterconnect::TEvResolveNode(PeerNodeId, Deadline));

    // wait for the result
    auto ev = WaitForSpecificEvent<TEvResolveError, TEvLocalNodeInfo, TEvInterconnect::TEvNodeAddress>("ResolveNode",
        Now() + ResolveTimeout);

    // extract address from the result
    NInterconnect::TAddress address;
    if (!ev) {
        // resolve timed out -- fall back to the statically configured address, if any
        ResolveTimedOut = true;
        if (auto peerNodeInfo = GetPeerNodeInfo(); peerNodeInfo && peerNodeInfo->Address) {
            address = {peerNodeInfo->Address, peerNodeInfo->Port};
        } else {
            Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "DNS resolve timed out and no static address defined", true);
        }
    } else if (auto *p = ev->CastAsLocal<TEvLocalNodeInfo>()) {
        if (!p->Address) {
            Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "DNS resolve error: no address returned", true);
        }
        address = {*p->Address};
    } else if (auto *p = ev->CastAsLocal<TEvInterconnect::TEvNodeAddress>()) {
        const auto& r = p->Record;
        if (!r.HasAddress() || !r.HasPort()) {
            Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "DNS resolve error: no address returned", true);
        }
        address = {r.GetAddress(), static_cast<ui16>(r.GetPort())};
    } else {
        // the only remaining possibility is a resolve error
        Y_VERIFY(ev->GetTypeRewrite() == ui32(ENetwork::ResolveError));
        Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "DNS resolve error: " + ev->Get<TEvResolveError>()->Explain, true);
    }

    // create the socket with matching address family
    Socket = NInterconnect::TStreamSocket::Make(address.GetFamily());
    if (*Socket == -1) {
        Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "System error: failed to create socket");
    }

    // extract peer address
    if (updatePeerAddr) {
        PeerAddr = address.ToString();
    }

    // set up socket parameters
    SetupSocket();

    // start connecting
    switch (int err = -Socket->Connect(address)) {
        case 0: // successful connection
            break;

        case EINPROGRESS: // connection in progress -- wait for writability, then re-check
            WaitPoller(false, true, "WaitConnect");
            err = Socket->GetConnectStatus();
            if (err) {
                Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, Sprintf("Connection failed: %s", strerror(err)), true);
            }
            break;

        default:
            // NOTE(review): immediate connect errors other than EINPROGRESS
            // are silently ignored here; presumably the subsequent
            // send/receive surfaces them -- confirm this is intentional.
            break;
    }

    // drop the mute-log entry for this peer and log the (re)connection at
    // NOTICE if errors were previously reported for it, DEBUG otherwise
    auto it = LastLogNotice.find(PeerNodeId);
    NActors::NLog::EPriority logPriority = NActors::NLog::PRI_DEBUG;
    if (it != LastLogNotice.end()) {
        LastLogNotice.erase(it);
        logPriority = NActors::NLog::PRI_NOTICE;
    }
    LOG_LOG_IC_X(NActorsServices::INTERCONNECT, "ICH05", logPriority, "connected to peer");
}
+
+ void SetupSocket() {
+ // switch to nonblocking mode
+ try {
+ SetNonBlock(*Socket);
+ SetNoDelay(*Socket, true);
+ } catch (...) {
+ Fail(TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT, "System error: can't up nonblocking mode for socket");
+ }
+
+ // setup send buffer size
+ Socket->SetSendBufferSize(Common->Settings.GetSendBufferSize());
+
+ // register in poller
+ RegisterInPoller();
+ }
+
+ void RegisterInPoller() {
+ const bool success = Send(MakePollerActorId(), new TEvPollerRegister(Socket, SelfActorId, SelfActorId));
+ Y_VERIFY(success);
+ auto result = WaitForSpecificEvent<TEvPollerRegisterResult>("RegisterPoller");
+ PollerToken = std::move(result->Get()->PollerToken);
+ Y_VERIFY(PollerToken);
+ Y_VERIFY(PollerToken->RefCount() == 1); // ensure exclusive ownership
+ }
+
// Sends the fixed-size initial handshake packet carrying our/peer virtual
// ids, the next packet serial and the protocol version.
void SendInitialPacket() {
    TInitialPacket packet(SelfVirtualId, PeerVirtualId, NextPacketToPeer, INTERCONNECT_PROTOCOL_VERSION);
    SendData(&packet, sizeof(packet), "SendInitialPacket");
}
+
// Requests read/write readiness notification from the poller and suspends
// the coroutine until it arrives.
void WaitPoller(bool read, bool write, TString state) {
    PollerToken->Request(read, write);
    WaitForSpecificEvent<TEvPollerReady>(std::move(state));
}
+
// Generic blocking-style I/O driver: repeatedly applies sendRecv (a pointer
// to TStreamSocket::Send or ::Recv) until exactly `len` bytes have been
// processed. EAGAIN/EWOULDBLOCK suspends the coroutine on the poller, EINTR
// retries immediately, a zero result (peer closed the connection) or any
// other error aborts via Fail() as a transient failure.
template <typename TDataPtr, typename TSendRecvFunc>
void Process(TDataPtr buffer, size_t len, TSendRecvFunc&& sendRecv, bool read, bool write, TString state) {
    Y_VERIFY(Socket);
    NInterconnect::TStreamSocket* sock = Socket.Get();
    ssize_t (NInterconnect::TStreamSocket::*pfn)(TDataPtr, size_t, TString*) const = sendRecv;
    size_t processed = 0;

    auto error = [&](TString msg) {
        Fail(TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT, Sprintf("Socket error# %s state# %s processed# %zu remain# %zu",
            msg.data(), state.data(), processed, len), true);
    };

    while (len) {
        TString err;
        // negative return values encode -errno
        ssize_t nbytes = (sock->*pfn)(buffer, len, &err);
        if (nbytes > 0) {
            // partial progress -- advance the cursor and keep going
            buffer = (char*)buffer + nbytes;
            len -= nbytes;
            processed += nbytes;
        } else if (-nbytes == EAGAIN || -nbytes == EWOULDBLOCK) {
            WaitPoller(read, write, state); // wait until the socket is ready again
        } else if (!nbytes) {
            error("connection unexpectedly closed");
        } else if (-nbytes != EINTR) {
            // EINTR falls through and simply retries
            error(err ? err : TString(strerror(-nbytes)));
        }
    }
}
+
// Sends exactly `len` bytes, suspending on the poller as needed.
void SendData(const void* buffer, size_t len, TString state) {
    Process(buffer, len, &NInterconnect::TStreamSocket::Send, false, true, std::move(state));
}
+
// Receives exactly `len` bytes, suspending on the poller as needed.
void ReceiveData(void* buffer, size_t len, TString state) {
    Process(buffer, len, &NInterconnect::TStreamSocket::Recv, true, false, std::move(state));
}
+
+ THolder<TEvInterconnect::TNodeInfo> GetPeerNodeInfo() {
+ Y_VERIFY(PeerNodeId);
+ Send(Common->NameserviceId, new TEvInterconnect::TEvGetNode(PeerNodeId, Deadline));
+ auto response = WaitForSpecificEvent<TEvInterconnect::TEvNodeInfo>("GetPeerNodeInfo");
+ return std::move(response->Get()->Node);
+ }
+
+ template <typename T>
+ static THolder<TProgramInfo> GetProgramInfo(const T& proto) {
+ auto programInfo = MakeHolder<TProgramInfo>();
+ programInfo->PID = proto.GetProgramPID();
+ programInfo->StartTime = proto.GetProgramStartTime();
+ programInfo->Serial = proto.GetSerial();
+ return programInfo;
+ }
+ };
+
// Factory: wraps the handshake coroutine in a TActorCoro for the outgoing
// (client-initiated) handshake path.
IActor* CreateOutgoingHandshakeActor(TInterconnectProxyCommon::TPtr common, const TActorId& self,
                                     const TActorId& peer, ui32 nodeId, ui64 nextPacket, TString peerHostName,
                                     TSessionParams params) {
    return new TActorCoro(MakeHolder<THandshakeActor>(std::move(common), self, peer, nodeId, nextPacket,
        std::move(peerHostName), std::move(params)));
}
+
// Factory: wraps the handshake coroutine in a TActorCoro for the incoming
// (server-side) handshake path, operating on an already-accepted socket.
IActor* CreateIncomingHandshakeActor(TInterconnectProxyCommon::TPtr common, TSocketPtr socket) {
    return new TActorCoro(MakeHolder<THandshakeActor>(std::move(common), std::move(socket)));
}
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_handshake.h b/library/cpp/actors/interconnect/interconnect_handshake.h
new file mode 100644
index 0000000000..b3c0db6c5d
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_handshake.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/core/event_pb.h>
+#include <library/cpp/actors/core/events.h>
+
+#include "interconnect_common.h"
+#include "interconnect_impl.h"
+#include "poller_tcp.h"
+#include "events_local.h"
+
namespace NActors {
    // Default timeout applied to handshake operations (confirm exact usage at
    // the call sites -- not visible from this header).
    static constexpr TDuration DEFAULT_HANDSHAKE_TIMEOUT = TDuration::Seconds(1);
    // Protocol version advertised in the initial handshake packet.
    static constexpr ui64 INTERCONNECT_PROTOCOL_VERSION = 2;

    using TSocketPtr = TIntrusivePtr<NInterconnect::TStreamSocket>;

    // Creates the coroutine actor driving an outgoing (client-side) handshake.
    IActor* CreateOutgoingHandshakeActor(TInterconnectProxyCommon::TPtr common, const TActorId& self,
                                         const TActorId& peer, ui32 nodeId, ui64 nextPacket, TString peerHostName,
                                         TSessionParams params);

    // Creates the coroutine actor serving an incoming (server-side) handshake
    // on an already-accepted socket.
    IActor* CreateIncomingHandshakeActor(TInterconnectProxyCommon::TPtr common, TSocketPtr socket);

}
diff --git a/library/cpp/actors/interconnect/interconnect_impl.h b/library/cpp/actors/interconnect/interconnect_impl.h
new file mode 100644
index 0000000000..ee29e4d397
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_impl.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#include "interconnect.h"
+#include <library/cpp/actors/protos/interconnect.pb.h>
+#include <library/cpp/actors/core/event_pb.h>
+#include <library/cpp/actors/helpers/mon_histogram_helper.h>
+#include <library/cpp/monlib/dynamic_counters/counters.h>
+
+namespace NActors {
// resolve node info
// Request to the nameservice: resolve the address of the given node id.
// An optional deadline bounds the resolution; it is serialized as the raw
// TInstant value and omitted entirely when unbounded.
struct TEvInterconnect::TEvResolveNode: public TEventPB<TEvInterconnect::TEvResolveNode, NActorsInterconnect::TEvResolveNode, TEvInterconnect::EvResolveNode> {
    TEvResolveNode() {
    }

    TEvResolveNode(ui32 nodeId, TInstant deadline = TInstant::Max()) {
        Record.SetNodeId(nodeId);
        // TInstant::Max() means "no deadline" and is not put on the wire
        if (deadline != TInstant::Max()) {
            Record.SetDeadline(deadline.GetValue());
        }
    }
};
+
// node info
// Event carrying a node's address record. NOTE(review): the wire type is
// NActorsInterconnect::TEvNodeInfo while the event is named TEvNodeAddress --
// presumably deliberate reuse of the proto message; confirm before renaming.
struct TEvInterconnect::TEvNodeAddress: public TEventPB<TEvInterconnect::TEvNodeAddress, NActorsInterconnect::TEvNodeInfo, TEvInterconnect::EvNodeAddress> {
    TEvNodeAddress() {
    }

    TEvNodeAddress(ui32 nodeId) {
        Record.SetNodeId(nodeId);
    }
};

// register node
// Payload-free marker event.
struct TEvInterconnect::TEvRegisterNode: public TEventBase<TEvInterconnect::TEvRegisterNode, TEvInterconnect::EvRegisterNode> {
};

// reply on register node
// Payload-free marker event.
struct TEvInterconnect::TEvRegisterNodeResult: public TEventBase<TEvInterconnect::TEvRegisterNodeResult, TEvInterconnect::EvRegisterNodeResult> {
};

// disconnect
// Local-only (never serialized) event; sent to an interconnect proxy, e.g.
// when a node's address changes (see the dynamic nameserver).
struct TEvInterconnect::TEvDisconnect: public TEventLocal<TEvInterconnect::TEvDisconnect, TEvInterconnect::EvDisconnect> {
};
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_mon.cpp b/library/cpp/actors/interconnect/interconnect_mon.cpp
new file mode 100644
index 0000000000..cf924ccbf9
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_mon.cpp
@@ -0,0 +1,276 @@
+#include "interconnect_mon.h"
+#include "interconnect_tcp_proxy.h"
+
+#include <library/cpp/json/json_value.h>
+#include <library/cpp/json/json_writer.h>
+#include <library/cpp/monlib/service/pages/templates.h>
+
+#include <openssl/ssl.h>
+#include <openssl/pem.h>
+
+namespace NInterconnect {
+
+ using namespace NActors;
+
// Monitoring actor: serves the interconnect overview page (HTML or JSON,
// selected by the "fmt" query parameter) and a TLS certificate info
// endpoint ("certinfo" query parameter).
class TInterconnectMonActor : public TActor<TInterconnectMonActor> {
    // One-shot actor spawned per HTTP request: gathers per-peer proxy stats
    // from every interconnect proxy and renders the result for the requester.
    class TQueryProcessor : public TActorBootstrapped<TQueryProcessor> {
        const TActorId Sender;  // originator of the HTTP request
        const bool Json;        // render JSON instead of HTML
        TMap<ui32, TInterconnectProxyTCP::TProxyStats> Stats; // peer node id -> collected stats
        ui32 PendingReplies = 0; // proxies that have not answered yet

    public:
        static constexpr IActor::EActorActivity ActorActivityType() {
            return INTERCONNECT_MONACTOR;
        }

        TQueryProcessor(const TActorId& sender, bool json)
            : Sender(sender)
            , Json(json)
        {}

        void Bootstrap(const TActorContext& ctx) {
            // overall 5-second budget for collecting stats, enforced via wakeup
            Become(&TThis::StateFunc, ctx, TDuration::Seconds(5), new TEvents::TEvWakeup);
            Send(GetNameserviceActorId(), new TEvInterconnect::TEvListNodes);
        }

        // Nameservice returned the node list -- query every proxy for stats;
        // delivery tracking lets us account for proxies that do not exist.
        void Handle(TEvInterconnect::TEvNodesInfo::TPtr ev, const TActorContext& ctx) {
            TActorSystem* const as = ctx.ExecutorThread.ActorSystem;
            for (const auto& node : ev->Get()->Nodes) {
                Send(as->InterconnectProxy(node.NodeId), new TInterconnectProxyTCP::TEvQueryStats, IEventHandle::FlagTrackDelivery);
                ++PendingReplies;
            }
            GenerateResultWhenReady(ctx);
        }

        STRICT_STFUNC(StateFunc,
            HFunc(TEvInterconnect::TEvNodesInfo, Handle)
            HFunc(TInterconnectProxyTCP::TEvStats, Handle)
            CFunc(TEvents::TSystem::Undelivered, HandleUndelivered)
            CFunc(TEvents::TSystem::Wakeup, HandleWakeup)
        )

        // A proxy answered with its stats.
        void Handle(TInterconnectProxyTCP::TEvStats::TPtr& ev, const TActorContext& ctx) {
            auto *msg = ev->Get();
            Stats.emplace(msg->PeerNodeId, std::move(msg->ProxyStats));
            --PendingReplies;
            GenerateResultWhenReady(ctx);
        }

        // A query was undeliverable (no proxy for that node) -- count it done.
        void HandleUndelivered(const TActorContext& ctx) {
            --PendingReplies;
            GenerateResultWhenReady(ctx);
        }

        // Timeout -- render whatever has been collected so far.
        void HandleWakeup(const TActorContext& ctx) {
            PendingReplies = 0;
            GenerateResultWhenReady(ctx);
        }

        // Once all replies are accounted for, send the page and terminate.
        void GenerateResultWhenReady(const TActorContext& ctx) {
            if (!PendingReplies) {
                if (Json) {
                    ctx.Send(Sender, new NMon::TEvHttpInfoRes(GenerateJson(), 0, NMon::IEvHttpInfoRes::EContentType::Custom));
                } else {
                    ctx.Send(Sender, new NMon::TEvHttpInfoRes(GenerateHtml()));
                }
                Die(ctx);
            }
        }

        // Renders the per-peer stats as a sortable HTML table.
        TString GenerateHtml() {
            TStringStream str;
            HTML(str) {
                TABLE_CLASS("table-sortable table") {
                    TABLEHEAD() {
                        TABLER() {
                            TABLEH() { str << "Peer node id"; }
                            TABLEH() { str << "State"; }
                            TABLEH() { str << "Ping"; }
                            TABLEH() { str << "Clock skew"; }
                            TABLEH() { str << "Scope id"; }
                            TABLEH() { str << "Encryption"; }
                            TABLEH() { str << "LastSessionDieTime"; }
                            TABLEH() { str << "TotalOutputQueueSize"; }
                            TABLEH() { str << "Connected"; }
                            TABLEH() { str << "Host"; }
                            TABLEH() { str << "Port"; }
                            TABLEH() { str << "LastErrorTimestamp"; }
                            TABLEH() { str << "LastErrorKind"; }
                            TABLEH() { str << "LastErrorExplanation"; }
                        }
                    }
                    TABLEBODY() {
                        for (const auto& kv : Stats) {
                            TABLER() {
                                // node id links to the per-proxy page
                                TABLED() { str << "<a href='" << kv.second.Path << "'>" << kv.first << "</a>"; }
                                TABLED() { str << kv.second.State; }
                                TABLED() {
                                    if (kv.second.Ping != TDuration::Zero()) {
                                        str << kv.second.Ping;
                                    }
                                }
                                TABLED() {
                                    // ClockSkew is signed microseconds; render with explicit sign
                                    if (kv.second.ClockSkew < 0) {
                                        str << "-" << TDuration::MicroSeconds(-kv.second.ClockSkew);
                                    } else {
                                        str << "+" << TDuration::MicroSeconds(kv.second.ClockSkew);
                                    }
                                }
                                TABLED() { str << ScopeIdToString(kv.second.PeerScopeId); }
                                TABLED() {
                                    const char *color = kv.second.Encryption != "none" ? "green" : "red";
                                    str << "<font color='" << color << "'>" << kv.second.Encryption << "</font>";
                                }
                                TABLED() {
                                    if (kv.second.LastSessionDieTime != TInstant::Zero()) {
                                        str << kv.second.LastSessionDieTime;
                                    }
                                }
                                TABLED() { str << kv.second.TotalOutputQueueSize; }
                                TABLED() { str << (kv.second.Connected ? "yes" : "<strong>no</strong>"); }
                                TABLED() { str << kv.second.Host; }
                                TABLED() { str << kv.second.Port; }
                                TABLED() {
                                    str << "<strong>";
                                    if (kv.second.LastErrorTimestamp != TInstant::Zero()) {
                                        str << kv.second.LastErrorTimestamp;
                                    }
                                    str << "</strong>";
                                }
                                TABLED() { str << "<strong>" << kv.second.LastErrorKind << "</strong>"; }
                                TABLED() { str << "<strong>" << kv.second.LastErrorExplanation << "</strong>"; }
                            }
                        }
                    }
                }
            }
            return str.Str();
        }

        // Renders the per-peer stats as a JSON object keyed by node id.
        TString GenerateJson() {
            NJson::TJsonValue json;
            for (const auto& [nodeId, info] : Stats) {
                NJson::TJsonValue item;
                item["NodeId"] = nodeId;

                auto id = [](const auto& x) { return x; };
                auto toString = [](const auto& x) { return x.ToString(); };

// copies each stats field into the JSON item through the given conversion
#define JSON(NAME, FUN) item[#NAME] = FUN(info.NAME);
                JSON(Path, id)
                JSON(State, id)
                JSON(PeerScopeId, ScopeIdToString)
                JSON(LastSessionDieTime, toString)
                JSON(TotalOutputQueueSize, id)
                JSON(Connected, id)
                JSON(Host, id)
                JSON(Port, id)
                JSON(LastErrorTimestamp, toString)
                JSON(LastErrorKind, id)
                JSON(LastErrorExplanation, id)
                JSON(Ping, toString)
                JSON(ClockSkew, id)
                JSON(Encryption, id)
#undef JSON

                json[ToString(nodeId)] = item;
            }
            TStringStream str(NMonitoring::HTTPOKJSON);
            NJson::WriteJson(&str, &json);
            return str.Str();
        }
    };

private:
    TIntrusivePtr<TInterconnectProxyCommon> Common; // shared proxy configuration (may be null)

public:
    static constexpr IActor::EActorActivity ActorActivityType() {
        return INTERCONNECT_MONACTOR;
    }

    TInterconnectMonActor(TIntrusivePtr<TInterconnectProxyCommon> common)
        : TActor(&TThis::StateFunc)
        , Common(std::move(common))
    {}

    STRICT_STFUNC(StateFunc,
        HFunc(NMon::TEvHttpInfo, Handle)
    )

    // Dispatches the HTTP request: certificate info is answered inline,
    // the stats overview is delegated to a fresh TQueryProcessor.
    void Handle(NMon::TEvHttpInfo::TPtr& ev, const TActorContext& ctx) {
        const auto& params = ev->Get()->Request.GetParams();
        int certinfo = 0;
        if (TryFromString(params.Get("certinfo"), certinfo) && certinfo) {
            ctx.Send(ev->Sender, new NMon::TEvHttpInfoRes(GetCertInfoJson(), ev->Get()->SubRequestId,
                NMon::TEvHttpInfoRes::Custom));
        } else {
            const bool json = params.Has("fmt") && params.Get("fmt") == "json";
            ctx.Register(new TQueryProcessor(ev->Sender, json));
        }
    }

    // Parses the configured PEM certificate (if any) with OpenSSL and returns
    // subject, common name and validity period as a JSON document; on failure
    // the JSON carries an "Error" field instead.
    TString GetCertInfoJson() const {
        NJson::TJsonValue json(NJson::JSON_MAP);
        if (const TString cert = Common ? Common->Settings.Certificate : TString()) {
            struct TEx : yexception {};
            try {
                const auto& cert = Common->Settings.Certificate;
                std::unique_ptr<BIO, void(*)(BIO*)> bio(BIO_new_mem_buf(cert.data(), cert.size()), &BIO_vfree);
                if (!bio) {
                    throw TEx() << "BIO_new_mem_buf failed";
                }
                std::unique_ptr<X509, void(*)(X509*)> x509(PEM_read_bio_X509(bio.get(), nullptr, nullptr, nullptr),
                    &X509_free);
                if (!x509) {
                    throw TEx() << "PEM_read_bio_X509 failed";
                }
                X509_NAME *name = X509_get_subject_name(x509.get());
                if (!name) {
                    throw TEx() << "X509_get_subject_name failed";
                }
                char buffer[4096];
                if (char *p = X509_NAME_oneline(name, buffer, sizeof(buffer))) {
                    json["Subject"] = p;
                }
                if (int loc = X509_NAME_get_index_by_NID(name, NID_commonName, -1); loc >= 0) {
                    if (X509_NAME_ENTRY *entry = X509_NAME_get_entry(name, loc)) {
                        if (ASN1_STRING *data = X509_NAME_ENTRY_get_data(entry)) {
                            unsigned char *cn;
                            if (const int len = ASN1_STRING_to_UTF8(&cn, data); len >= 0) {
                                json["CommonName"] = TString(reinterpret_cast<char*>(cn), len);
                                OPENSSL_free(cn); // ASN1_STRING_to_UTF8 allocates; caller frees
                            }
                        }
                    }
                }
                // formats an ASN1 validity timestamp, throwing on any failure
                auto time = [](const ASN1_TIME *t, const char *name) -> TString {
                    if (t) {
                        struct tm tm;
                        if (ASN1_TIME_to_tm(t, &tm)) {
                            return Strftime("%Y-%m-%dT%H:%M:%S%z", &tm);
                        } else {
                            throw TEx() << "ASN1_TIME_to_tm failed";
                        }
                    } else {
                        throw TEx() << name << " failed";
                    }
                };
                json["NotBefore"] = time(X509_get0_notBefore(x509.get()), "X509_get0_notBefore");
                json["NotAfter"] = time(X509_get0_notAfter(x509.get()), "X509_get0_notAfter");
            } catch (const TEx& ex) {
                json["Error"] = ex.what();
            }
        }
        TStringStream str(NMonitoring::HTTPOKJSON);
        NJson::WriteJson(&str, &json);
        return str.Str();
    }
};
+
// Factory for the interconnect monitoring actor.
IActor *CreateInterconnectMonActor(TIntrusivePtr<TInterconnectProxyCommon> common) {
    return new TInterconnectMonActor(std::move(common));
}
+
+} // NInterconnect
diff --git a/library/cpp/actors/interconnect/interconnect_mon.h b/library/cpp/actors/interconnect/interconnect_mon.h
new file mode 100644
index 0000000000..3fb26053fb
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_mon.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <library/cpp/actors/core/actor.h>
+#include "interconnect_common.h"
+
namespace NInterconnect {

    // Creates the monitoring actor serving the interconnect overview page.
    NActors::IActor *CreateInterconnectMonActor(TIntrusivePtr<NActors::TInterconnectProxyCommon> common = nullptr);

    // Well-known service actor id of the interconnect monitoring actor on the
    // given node: the string "ICOverview" zero-padded to 12 bytes.
    static inline NActors::TActorId MakeInterconnectMonActorId(ui32 nodeId) {
        char s[12] = {'I', 'C', 'O', 'v', 'e', 'r', 'v', 'i', 'e', 'w', 0, 0};
        return NActors::TActorId(nodeId, TStringBuf(s, 12));
    }

} // NInterconnect
diff --git a/library/cpp/actors/interconnect/interconnect_nameserver_base.h b/library/cpp/actors/interconnect/interconnect_nameserver_base.h
new file mode 100644
index 0000000000..df614f6c2b
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_nameserver_base.h
@@ -0,0 +1,83 @@
+#include "interconnect.h"
+#include "interconnect_impl.h"
+#include "interconnect_address.h"
+#include "events_local.h"
+
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/memory_log/memlog.h>
+
+namespace NActors {
+
// CRTP base for table-backed nameserver actors: implements the common
// resolve/list/get handlers over a node table supplied (and owned) by the
// derived class. TDerived provides the state function and may shadow
// HandleMissedNodeId to customize unknown-node behavior.
template<typename TDerived>
class TInterconnectNameserverBase : public TActor<TDerived> {
protected:
    // node id -> address/host record; reference into derived-class storage
    const TMap<ui32, TTableNameserverSetup::TNodeInfo>& NodeTable;

    TInterconnectNameserverBase(void (TDerived::*func)(TAutoPtr<IEventHandle>& ev, const TActorContext& ctx)
        , const TMap<ui32, TTableNameserverSetup::TNodeInfo>& nodeTable)
        : TActor<TDerived>(func)
        , NodeTable(nodeTable)
    {
    }
public:

    // Default reaction to an unknown node id: reply immediately with an
    // empty TEvLocalNodeInfo, which the requester treats as resolve failure.
    void HandleMissedNodeId(TEvInterconnect::TEvResolveNode::TPtr& ev,
                            const TActorContext& ctx,
                            const TInstant&) {
        auto reply = new TEvLocalNodeInfo;
        reply->NodeId = ev->Get()->Record.GetNodeId();
        ctx.Send(ev->Sender, reply);
    }

    // Resolve a node id: look it up in the table and spawn a resolve actor
    // for the found record, or fall back to the derived class's
    // HandleMissedNodeId.
    void Handle(TEvInterconnect::TEvResolveNode::TPtr& ev,
                const TActorContext& ctx) {
        const TEvInterconnect::TEvResolveNode* request = ev->Get();
        auto& record = request->Record;
        const ui32 nodeId = record.GetNodeId();
        const TInstant deadline = record.HasDeadline() ? TInstant::FromValue(record.GetDeadline()) : TInstant::Max();
        auto it = NodeTable.find(nodeId);

        if (it == NodeTable.end()) {
            static_cast<TDerived*>(this)->HandleMissedNodeId(ev, ctx, deadline);
        } else {
            IActor::RegisterWithSameMailbox(
                CreateResolveActor(nodeId, it->second, ev->Sender, this->SelfId(), deadline));
        }
    }

    // Resolve a raw address/port pair (no node table lookup involved).
    void Handle(TEvResolveAddress::TPtr& ev,
                const TActorContext&) {
        const TEvResolveAddress* request = ev->Get();

        IActor::RegisterWithSameMailbox(
            CreateResolveActor(request->Address, request->Port, ev->Sender, this->SelfId(), TInstant::Max()));
    }

    // Return the full node table as a TEvNodesInfo snapshot.
    void Handle(TEvInterconnect::TEvListNodes::TPtr& ev,
                const TActorContext& ctx) {
        THolder<TEvInterconnect::TEvNodesInfo>
            reply(new TEvInterconnect::TEvNodesInfo());
        reply->Nodes.reserve(NodeTable.size());
        for (const auto& pr : NodeTable) {
            reply->Nodes.emplace_back(pr.first,
                pr.second.Address, pr.second.Host, pr.second.ResolveHost,
                pr.second.Port, pr.second.Location);
        }
        ctx.Send(ev->Sender, reply.Release());
    }

    // Return the record of a single node; the reply's Node stays null when
    // the node id is not present in the table.
    void Handle(TEvInterconnect::TEvGetNode::TPtr& ev,
                const TActorContext& ctx) {
        ui32 nodeId = ev->Get()->NodeId;
        THolder<TEvInterconnect::TEvNodeInfo>
            reply(new TEvInterconnect::TEvNodeInfo(nodeId));
        auto it = NodeTable.find(nodeId);
        if (it != NodeTable.end()) {
            reply->Node = MakeHolder<TEvInterconnect::TNodeInfo>(it->first, it->second.Address,
                                                                 it->second.Host, it->second.ResolveHost,
                                                                 it->second.Port, it->second.Location);
        }
        ctx.Send(ev->Sender, reply.Release());
    }
};
+}
diff --git a/library/cpp/actors/interconnect/interconnect_nameserver_dynamic.cpp b/library/cpp/actors/interconnect/interconnect_nameserver_dynamic.cpp
new file mode 100644
index 0000000000..5e48401b14
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_nameserver_dynamic.cpp
@@ -0,0 +1,178 @@
+#include "interconnect.h"
+#include "interconnect_impl.h"
+#include "interconnect_address.h"
+#include "interconnect_nameserver_base.h"
+#include "events_local.h"
+#include "logging.h"
+
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/core/log.h>
+
+namespace NActors {
+
+ class TInterconnectDynamicNameserver
+ : public TInterconnectNameserverBase<TInterconnectDynamicNameserver>
+ , public TInterconnectLoggingBase
+ {
+ struct TPendingRequest {
+ TEvInterconnect::TEvResolveNode::TPtr Request;
+ TInstant Deadline;
+
+ TPendingRequest(TEvInterconnect::TEvResolveNode::TPtr request, const TInstant& deadline)
+ : Request(request), Deadline(deadline)
+ {
+ }
+ };
+
+ TMap<ui32, TTableNameserverSetup::TNodeInfo> NodeTable;
+ TVector<TPendingRequest> PendingRequests;
+ TDuration PendingPeriod;
+
+ void PrintInfo() {
+ TString logMsg = TStringBuilder() << "Table size: " << NodeTable.size();
+ for (const auto& [nodeId, node] : NodeTable) {
+ TString str = TStringBuilder() << "\n > Node " << nodeId << " `" << node.Address << "`:" << node.Port << ", host: " << node.Host << ", resolveHost: " << node.ResolveHost;
+ logMsg += str;
+ }
+ LOG_DEBUG_IC("ICN01", "%s", logMsg.c_str());
+ }
+
+ bool IsNodeUpdated(const ui32 nodeId, const TString& address, const ui32 port) {
+ bool printInfo = false;
+ auto it = NodeTable.find(nodeId);
+ if (it == NodeTable.end()) {
+ LOG_DEBUG_IC("ICN02", "New node %u `%s`: %u",
+ nodeId, address.c_str(), port);
+ printInfo = true;
+ } else if (it->second.Address != address || it->second.Port != port) {
+ LOG_DEBUG_IC("ICN03", "Updated node %u `%s`: %u (from `%s`: %u)",
+ nodeId, address.c_str(), port, it->second.Address.c_str(), it->second.Port);
+ printInfo = true;
+ Send(TActivationContext::InterconnectProxy(nodeId), new TEvInterconnect::TEvDisconnect);
+ }
+ return printInfo;
+ }
+
+ void DiscardTimedOutRequests(const TActorContext& ctx, ui32 compactionCount = 0) {
+
+ auto now = Now();
+
+ for (auto& pending : PendingRequests) {
+ if (pending.Deadline > now) {
+ LOG_ERROR_IC("ICN06", "Unknown nodeId: %u", pending.Request->Get()->Record.GetNodeId());
+ auto reply = new TEvLocalNodeInfo;
+ reply->NodeId = pending.Request->Get()->Record.GetNodeId();
+ ctx.Send(pending.Request->Sender, reply);
+ pending.Request.Reset();
+ compactionCount++;
+ }
+ }
+
+ if (compactionCount) {
+ TVector<TPendingRequest> requests;
+ if (compactionCount < PendingRequests.size()) { // sanity check
+ requests.reserve(PendingRequests.size() - compactionCount);
+ }
+ for (auto& pending : PendingRequests) {
+ if (pending.Request) {
+ requests.emplace_back(pending.Request, pending.Deadline);
+ }
+ }
+ PendingRequests.swap(requests);
+ }
+ }
+
+ void SchedulePeriodic() {
+ Schedule(TDuration::MilliSeconds(200), new TEvents::TEvWakeup());
+ }
+
+ public:
+ static constexpr EActivityType ActorActivityType() {
+ return NAMESERVICE;
+ }
+
+ TInterconnectDynamicNameserver(const TIntrusivePtr<TTableNameserverSetup>& setup, const TDuration& pendingPeriod, ui32 /*resolvePoolId*/ )
+ : TInterconnectNameserverBase<TInterconnectDynamicNameserver>(&TInterconnectDynamicNameserver::StateFunc, NodeTable)
+ , NodeTable(setup->StaticNodeTable)
+ , PendingPeriod(pendingPeriod)
+ {
+ Y_VERIFY(setup->IsEntriesUnique());
+ }
+
+ STFUNC(StateFunc) {
+ try {
+ switch (ev->GetTypeRewrite()) {
+ HFunc(TEvInterconnect::TEvResolveNode, Handle);
+ HFunc(TEvResolveAddress, Handle);
+ HFunc(TEvInterconnect::TEvListNodes, Handle);
+ HFunc(TEvInterconnect::TEvGetNode, Handle);
+ HFunc(TEvInterconnect::TEvNodesInfo, HandleUpdate);
+ CFunc(TEvents::TEvWakeup::EventType, HandlePeriodic);
+ }
+ } catch (...) {
+ LOG_ERROR_IC("ICN09", "%s", CurrentExceptionMessage().c_str());
+ }
+ }
+
+ void HandleMissedNodeId(TEvInterconnect::TEvResolveNode::TPtr& ev,
+ const TActorContext& ctx,
+ const TInstant& deadline) {
+ if (PendingPeriod) {
+ if (PendingRequests.size() == 0) {
+ SchedulePeriodic();
+ }
+ PendingRequests.emplace_back(std::move(ev), Min(deadline, Now() + PendingPeriod));
+ } else {
+ LOG_ERROR_IC("ICN07", "Unknown nodeId: %u", ev->Get()->Record.GetNodeId());
+ TInterconnectNameserverBase::HandleMissedNodeId(ev, ctx, deadline);
+ }
+ }
+
+ void HandleUpdate(TEvInterconnect::TEvNodesInfo::TPtr& ev,
+ const TActorContext& ctx) {
+
+ auto request = ev->Get();
+ LOG_DEBUG_IC("ICN04", "Update TEvNodesInfo with sz: %lu ", request->Nodes.size());
+
+ bool printInfo = false;
+ ui32 compactionCount = 0;
+
+ for (const auto& node : request->Nodes) {
+ printInfo |= IsNodeUpdated(node.NodeId, node.Address, node.Port);
+
+ NodeTable[node.NodeId] = TTableNameserverSetup::TNodeInfo(
+ node.Address, node.Host, node.ResolveHost, node.Port, node.Location);
+
+ for (auto& pending : PendingRequests) {
+ if (pending.Request->Get()->Record.GetNodeId() == node.NodeId) {
+ LOG_DEBUG_IC("ICN05", "Pending nodeId: %u discovered", node.NodeId);
+ RegisterWithSameMailbox(
+ CreateResolveActor(node.NodeId, NodeTable[node.NodeId], pending.Request->Sender, SelfId(), pending.Deadline));
+ pending.Request.Reset();
+ compactionCount++;
+ }
+ }
+ }
+
+ if (printInfo) {
+ PrintInfo();
+ }
+
+ DiscardTimedOutRequests(ctx, compactionCount);
+ }
+
+ void HandlePeriodic(const TActorContext& ctx) {
+ DiscardTimedOutRequests(ctx, 0);
+ if (PendingRequests.size()) {
+ SchedulePeriodic();
+ }
+ }
+ };
+
// Factory for the dynamically updatable nameserver actor.
IActor* CreateDynamicNameserver(const TIntrusivePtr<TTableNameserverSetup>& setup,
                                const TDuration& pendingPeriod,
                                ui32 poolId) {
    return new TInterconnectDynamicNameserver(setup, pendingPeriod, poolId);
}
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_nameserver_table.cpp b/library/cpp/actors/interconnect/interconnect_nameserver_table.cpp
new file mode 100644
index 0000000000..43419bf70d
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_nameserver_table.cpp
@@ -0,0 +1,86 @@
+#include "interconnect.h"
+#include "interconnect_impl.h"
+#include "interconnect_address.h"
+#include "interconnect_nameserver_base.h"
+#include "events_local.h"
+
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/memory_log/memlog.h>
+
+namespace NActors {
+
+    // Static nameserver: serves node resolution from a fixed table supplied at
+    // construction (TTableNameserverSetup); the table never changes afterwards.
+    class TInterconnectNameserverTable: public TInterconnectNameserverBase<TInterconnectNameserverTable> {
+        TIntrusivePtr<TTableNameserverSetup> Config;
+
+    public:
+        static constexpr EActivityType ActorActivityType() {
+            return NAMESERVICE;
+        }
+
+        TInterconnectNameserverTable(const TIntrusivePtr<TTableNameserverSetup>& setup, ui32 /*resolvePoolId*/)
+            : TInterconnectNameserverBase<TInterconnectNameserverTable>(&TInterconnectNameserverTable::StateFunc, setup->StaticNodeTable)
+            , Config(setup)
+        {
+            // Duplicate address:port or host:port entries would make resolution
+            // ambiguous, so refuse to start with such a table.
+            Y_VERIFY(Config->IsEntriesUnique());
+        }
+
+        STFUNC(StateFunc) {
+            try {
+                switch (ev->GetTypeRewrite()) {
+                    HFunc(TEvInterconnect::TEvResolveNode, Handle);
+                    HFunc(TEvResolveAddress, Handle);
+                    HFunc(TEvInterconnect::TEvListNodes, Handle);
+                    HFunc(TEvInterconnect::TEvGetNode, Handle);
+                }
+            } catch (...) {
+                // on error - do nothing
+                // (deliberate best-effort: a malformed request must not kill the
+                // shared nameservice actor)
+            }
+        }
+    };
+
+    // Factory for the static-table nameserver; poolId is currently unused by
+    // the table implementation.
+    IActor* CreateNameserverTable(const TIntrusivePtr<TTableNameserverSetup>& setup, ui32 poolId) {
+        return new TInterconnectNameserverTable(setup, poolId);
+    }
+
+    // Returns true when no two static table entries collide either on
+    // (Address, Port) -- empty addresses are exempt -- or on (ResolveHost, Port).
+    bool TTableNameserverSetup::IsEntriesUnique() const {
+        // Sort pointers rather than the records themselves to avoid copies.
+        TVector<const TNodeInfo*> nodes;
+        nodes.reserve(StaticNodeTable.size());
+        for (const auto& entry : StaticNodeTable) {
+            nodes.push_back(&entry.second);
+        }
+
+        const auto byAddress = [](const TNodeInfo* a, const TNodeInfo* b) {
+            return a->Port == b->Port ? a->Address < b->Address : a->Port < b->Port;
+        };
+        Sort(nodes, byAddress);
+
+        for (size_t i = 1; i < nodes.size(); ++i) {
+            const TNodeInfo* prev = nodes[i - 1];
+            const TNodeInfo* cur = nodes[i];
+            if (prev->Address && prev->Address == cur->Address && prev->Port == cur->Port) {
+                return false;
+            }
+        }
+
+        const auto byResolveHost = [](const TNodeInfo* a, const TNodeInfo* b) {
+            return a->Port == b->Port ? a->ResolveHost < b->ResolveHost : a->Port < b->Port;
+        };
+        Sort(nodes, byResolveHost);
+
+        for (size_t i = 1; i < nodes.size(); ++i) {
+            const TNodeInfo* prev = nodes[i - 1];
+            const TNodeInfo* cur = nodes[i];
+            if (prev->ResolveHost == cur->ResolveHost && prev->Port == cur->Port) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    // Well-known service ID under which the nameservice actor is registered on
+    // every node (node field 0 means "local node" service lookup).
+    TActorId GetNameserviceActorId() {
+        return TActorId(0, "namesvc");
+    }
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_proxy_wrapper.cpp b/library/cpp/actors/interconnect/interconnect_proxy_wrapper.cpp
new file mode 100644
index 0000000000..1c44b4c59b
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_proxy_wrapper.cpp
@@ -0,0 +1,47 @@
+#include "interconnect_proxy_wrapper.h"
+#include "interconnect_tcp_proxy.h"
+#include <library/cpp/actors/interconnect/mock/ic_mock.h>
+
+namespace NActors {
+
+    // Lazily instantiates the real interconnect proxy (TCP or mock) on the first
+    // delivered event and forwards all traffic to it within the same mailbox.
+    class TInterconnectProxyWrapper : public IActor {
+        TIntrusivePtr<TInterconnectProxyCommon> Common;
+        const ui32 NodeId;
+        TInterconnectMock *Mock;
+        IActor *Proxy = nullptr; // set by TInterconnectProxyTCP ctor (via out-param) or assigned for mocks
+
+    public:
+        TInterconnectProxyWrapper(TIntrusivePtr<TInterconnectProxyCommon> common, ui32 nodeId, TInterconnectMock *mock)
+            : IActor(static_cast<TReceiveFunc>(&TInterconnectProxyWrapper::StateFunc), INTERCONNECT_PROXY_WRAPPER)
+            , Common(std::move(common))
+            , NodeId(nodeId)
+            , Mock(mock)
+        {}
+
+        STFUNC(StateFunc) {
+            // Poison before the proxy exists: nothing to forward, just die.
+            if (ev->GetTypeRewrite() == TEvents::TSystem::Poison && !Proxy) {
+                PassAway();
+            } else {
+                if (!Proxy) {
+                    // Note: the TCP proxy publishes itself through the &Proxy
+                    // out-parameter; only the mock path assigns Proxy explicitly.
+                    IActor *actor = Mock
+                        ? Mock->CreateProxyMock(TActivationContext::ActorSystem()->NodeId, NodeId, Common)
+                        : new TInterconnectProxyTCP(NodeId, Common, &Proxy);
+                    RegisterWithSameMailbox(actor);
+                    if (Mock) {
+                        Proxy = actor;
+                    }
+                    Y_VERIFY(Proxy);
+                }
+                // Same-mailbox direct dispatch, bypassing a mailbox round-trip.
+                InvokeOtherActor(*Proxy, &IActor::Receive, ev, ctx);
+            }
+        }
+    };
+
+    // Returns a factory the actor system uses to spawn a (lazy) proxy wrapper
+    // for a peer node on demand, in the given executor pool.
+    TProxyWrapperFactory CreateProxyWrapperFactory(TIntrusivePtr<TInterconnectProxyCommon> common, ui32 poolId,
+                                                   TInterconnectMock *mock) {
+        return [=](TActorSystem *as, ui32 nodeId) -> TActorId {
+            return as->Register(new TInterconnectProxyWrapper(common, nodeId, mock), TMailboxType::HTSwap, poolId);
+        };
+    }
+
+} // NActors
diff --git a/library/cpp/actors/interconnect/interconnect_proxy_wrapper.h b/library/cpp/actors/interconnect/interconnect_proxy_wrapper.h
new file mode 100644
index 0000000000..e5942351a7
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_proxy_wrapper.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "interconnect_common.h"
+
+#include <library/cpp/actors/core/actorsystem.h>
+
+namespace NActors {
+
+ TProxyWrapperFactory CreateProxyWrapperFactory(TIntrusivePtr<TInterconnectProxyCommon> common, ui32 poolId,
+ class TInterconnectMock *mock = nullptr);
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_resolve.cpp b/library/cpp/actors/interconnect/interconnect_resolve.cpp
new file mode 100644
index 0000000000..14296194df
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_resolve.cpp
@@ -0,0 +1,174 @@
+#include "interconnect.h"
+#include "interconnect_address.h"
+#include "events_local.h"
+
+#include <library/cpp/actors/core/actor_bootstrapped.h>
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/dnsresolver/dnsresolver.h>
+
+namespace NActors {
+
+ using namespace NActors::NDnsResolver;
+
+    // One-shot actor: resolves Host (optionally on behalf of node NodeId) into a
+    // socket address and replies to ReplyTo (as if from ReplyFrom) with
+    // TEvLocalNodeInfo / TEvAddressInfo on success or TEvResolveError on failure,
+    // then passes away. A configured numeric DefaultAddress bypasses DNS entirely.
+    class TInterconnectResolveActor : public TActorBootstrapped<TInterconnectResolveActor> {
+    public:
+        TInterconnectResolveActor(
+            const TString& host, ui16 port, ui32 nodeId, const TString& defaultAddress,
+            const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline)
+            : Host(host)
+            , NodeId(nodeId)
+            , Port(port)
+            , DefaultAddress(defaultAddress)
+            , ReplyTo(replyTo)
+            , ReplyFrom(replyFrom)
+            , Deadline(deadline)
+        { }
+
+        TInterconnectResolveActor(
+            const TString& host, ui16 port,
+            const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline)
+            : Host(host)
+            , Port(port)
+            , ReplyTo(replyTo)
+            , ReplyFrom(replyFrom)
+            , Deadline(deadline)
+        { }
+
+        static constexpr EActivityType ActorActivityType() {
+            return NAMESERVICE;
+        }
+
+        void Bootstrap() {
+            TMaybe<TString> errorText;
+            if (auto addr = ExtractDefaultAddr(errorText)) {
+                return SendAddrAndDie(std::move(addr));
+            }
+
+            if (errorText) {
+                // FIX: previously execution fell through here after
+                // SendErrorAndDie(), going on to Send/Schedule/Become on an actor
+                // that had already called PassAway().
+                SendErrorAndDie(*errorText);
+                return;
+            }
+
+            auto now = TActivationContext::Now();
+            if (Deadline < now) {
+                SendErrorAndDie("Deadline");
+                return;
+            }
+
+            // FlagTrackDelivery yields TEvUndelivered if the resolver actor is
+            // absent, instead of hanging forever.
+            Send(MakeDnsResolverActorId(),
+                new TEvDns::TEvGetAddr(Host, AF_UNSPEC),
+                IEventHandle::FlagTrackDelivery);
+
+            if (Deadline != TInstant::Max()) {
+                Schedule(Deadline, new TEvents::TEvWakeup);
+            }
+
+            Become(&TThis::StateWork);
+        }
+
+        STRICT_STFUNC(StateWork, {
+            sFunc(TEvents::TEvWakeup, HandleTimeout);
+            sFunc(TEvents::TEvUndelivered, HandleUndelivered);
+            hFunc(TEvDns::TEvGetAddrResult, Handle);
+        });
+
+        void HandleTimeout() {
+            SendErrorAndDie("Deadline");
+        }
+
+        void HandleUndelivered() {
+            SendErrorAndDie("Dns resolver is unavailable");
+        }
+
+        void Handle(TEvDns::TEvGetAddrResult::TPtr& ev) {
+            if (auto addr = ExtractAddr(ev->Get())) {
+                return SendAddrAndDie(std::move(addr));
+            }
+
+            SendErrorAndDie(ev->Get()->ErrorText);
+        }
+
+        // Reply type depends on whether a concrete node was asked about.
+        void SendAddrAndDie(NAddr::IRemoteAddrPtr addr) {
+            if (NodeId) {
+                auto reply = new TEvLocalNodeInfo;
+                reply->NodeId = *NodeId;
+                reply->Address = std::move(addr);
+                TActivationContext::Send(new IEventHandle(ReplyTo, ReplyFrom, reply));
+            } else {
+                auto reply = new TEvAddressInfo;
+                reply->Address = std::move(addr);
+                TActivationContext::Send(new IEventHandle(ReplyTo, ReplyFrom, reply));
+            }
+            PassAway();
+        }
+
+        void SendErrorAndDie(const TString& errorText) {
+            auto *event = new TEvResolveError;
+            event->Explain = errorText;
+            TActivationContext::Send(new IEventHandle(ReplyTo, ReplyFrom, event));
+            PassAway();
+        }
+
+        // Converts a successful DNS reply into a remote address holder; returns
+        // nullptr when the resolver reported an error (caller forwards ErrorText).
+        NAddr::IRemoteAddrPtr ExtractAddr(TEvDns::TEvGetAddrResult* msg) {
+            if (msg->Status == 0) {
+                if (msg->IsV6()) {
+                    struct sockaddr_in6 sin6;
+                    Zero(sin6);
+                    sin6.sin6_family = AF_INET6;
+                    sin6.sin6_addr = msg->GetAddrV6();
+                    sin6.sin6_port = HostToInet(Port);
+                    return MakeHolder<NAddr::TIPv6Addr>(sin6);
+                }
+
+                if (msg->IsV4()) {
+                    return MakeHolder<NAddr::TIPv4Addr>(TIpAddress(msg->GetAddrV4().s_addr, Port));
+                }
+
+                Y_FAIL("Unexpected result address family");
+            }
+
+            return nullptr;
+        }
+
+        // Parses DefaultAddress (when set) as a numeric IPv4/IPv6 literal; an
+        // unsupported family is reported through errorText.
+        NAddr::IRemoteAddrPtr ExtractDefaultAddr(TMaybe<TString>& errorText) {
+            if (DefaultAddress) {
+                NInterconnect::TAddress address(DefaultAddress.data(), Port);
+
+                switch (address.GetFamily()) {
+                    case AF_INET:
+                        return MakeHolder<NAddr::TIPv4Addr>(*(sockaddr_in*)address.SockAddr());
+                    case AF_INET6:
+                        return MakeHolder<NAddr::TIPv6Addr>(*(sockaddr_in6*)address.SockAddr());
+                    default:
+                        errorText = "Unsupported default address: " + DefaultAddress;
+                        break;
+                }
+            }
+
+            return nullptr;
+        }
+
+    private:
+        const TString Host;
+        const std::optional<ui32> NodeId; // set only by the node-specific ctor
+        const ui16 Port;
+        const TString DefaultAddress;
+        const TActorId ReplyTo;
+        const TActorId ReplyFrom;
+        const TInstant Deadline;
+    };
+
+    // Node-targeted resolve: reply will be TEvLocalNodeInfo (or TEvResolveError).
+    IActor* CreateResolveActor(
+        const TString& host, ui16 port, ui32 nodeId, const TString& defaultAddress,
+        const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline)
+    {
+        return new TInterconnectResolveActor(host, port, nodeId, defaultAddress, replyTo, replyFrom, deadline);
+    }
+
+    // Plain address resolve: reply will be TEvAddressInfo (or TEvResolveError).
+    IActor* CreateResolveActor(
+        const TString& host, ui16 port,
+        const TActorId& replyTo, const TActorId& replyFrom, TInstant deadline)
+    {
+        return new TInterconnectResolveActor(host, port, replyTo, replyFrom, deadline);
+    }
+
+} // namespace NActors
diff --git a/library/cpp/actors/interconnect/interconnect_stream.cpp b/library/cpp/actors/interconnect/interconnect_stream.cpp
new file mode 100644
index 0000000000..158ebc9e1d
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_stream.cpp
@@ -0,0 +1,628 @@
+#include "interconnect_stream.h"
+#include "logging.h"
+#include <library/cpp/openssl/init/init.h>
+#include <util/network/socket.h>
+#include <openssl/ssl.h>
+#include <openssl/err.h>
+#include <openssl/pem.h>
+
+#if defined(_win_)
+#include <util/system/file.h>
+#define SOCK_NONBLOCK 0
+#elif defined(_darwin_)
+#define SOCK_NONBLOCK 0
+#else
+#include <sys/un.h>
+#include <sys/stat.h>
+#endif //_win_
+
+#if !defined(_win_)
+#include <sys/ioctl.h>
+#endif
+
+#include <cerrno>
+
+namespace NInterconnect {
+    namespace {
+        // Portable accessor for the thread-local socket error code:
+        // WSAGetLastError() on Windows, errno elsewhere.
+        inline int
+        LastSocketError() {
+#if defined(_win_)
+            return WSAGetLastError();
+#else
+            return errno;
+#endif
+        }
+    }
+
+    // Takes ownership of an already-created native descriptor (may be INVALID_SOCKET).
+    TSocket::TSocket(SOCKET fd)
+        : Descriptor(fd)
+    {
+    }
+
+    // Closes the owned descriptor. EINTR is deliberately ignored (the descriptor
+    // state after an interrupted close is unspecified); EBADF/EIO and anything
+    // unexpected indicate program bugs and abort.
+    TSocket::~TSocket() {
+        if (Descriptor == INVALID_SOCKET) {
+            return;
+        }
+
+        auto const result = ::closesocket(Descriptor);
+        if (result == 0)
+            return;
+        switch (LastSocketError()) {
+            case EBADF:
+                Y_FAIL("Close bad descriptor");
+            case EINTR:
+                break;
+            case EIO:
+                Y_FAIL("EIO");
+            default:
+                Y_FAIL("It's something unexpected");
+        }
+    }
+
+    // TSharedDescriptor interface: exposes the raw fd to the poller.
+    int TSocket::GetDescriptor() {
+        return Descriptor;
+    }
+
+    // Binds to addr; returns 0 on success, -errno on failure.
+    int
+    TSocket::Bind(const TAddress& addr) const {
+        const auto ret = ::bind(Descriptor, addr.SockAddr(), addr.Size());
+        if (ret < 0)
+            return -LastSocketError();
+
+        return 0;
+    }
+
+    // Shuts down one or both directions (SHUT_RD/SHUT_WR/SHUT_RDWR);
+    // returns 0 on success, -errno on failure.
+    int
+    TSocket::Shutdown(int how) const {
+        const auto ret = ::shutdown(Descriptor, how);
+        if (ret < 0)
+            return -LastSocketError();
+
+        return 0;
+    }
+
+    // Reads SO_ERROR to learn the outcome of a non-blocking connect:
+    // 0 means connected, otherwise an errno value.
+    int TSocket::GetConnectStatus() const {
+        int err = 0;
+        socklen_t len = sizeof(err);
+        if (getsockopt(Descriptor, SOL_SOCKET, SO_ERROR, reinterpret_cast<char*>(&err), &len) == -1) {
+            err = LastSocketError();
+        }
+        return err;
+    }
+
+ /////////////////////////////////////////////////////////////////
+
+    // Creates a non-blocking stream socket in the given address family.
+    // Aborts on fd exhaustion (EMFILE/ENFILE); for other failures the returned
+    // object wraps an invalid (-1) descriptor -- callers must check.
+    TIntrusivePtr<TStreamSocket> TStreamSocket::Make(int domain) {
+        const SOCKET res = ::socket(domain, SOCK_STREAM | SOCK_NONBLOCK, 0);
+        if (res == -1) {
+            const int err = LastSocketError();
+            Y_VERIFY(err != EMFILE && err != ENFILE);
+        }
+        return MakeIntrusive<TStreamSocket>(res);
+    }
+
+    // Wraps an existing stream-socket descriptor.
+    TStreamSocket::TStreamSocket(SOCKET fd)
+        : TSocket(fd)
+    {
+    }
+
+    // Sends up to len bytes; returns bytes sent or -errno (err is unused here,
+    // kept for interface parity with TSecureSocket).
+    ssize_t
+    TStreamSocket::Send(const void* msg, size_t len, TString* /*err*/) const {
+        const auto ret = ::send(Descriptor, static_cast<const char*>(msg), int(len), 0);
+        if (ret < 0)
+            return -LastSocketError();
+
+        return ret;
+    }
+
+    // Receives up to len bytes; returns bytes read (0 = peer closed) or -errno.
+    ssize_t
+    TStreamSocket::Recv(void* buf, size_t len, TString* /*err*/) const {
+        const auto ret = ::recv(Descriptor, static_cast<char*>(buf), int(len), 0);
+        if (ret < 0)
+            return -LastSocketError();
+
+        return ret;
+    }
+
+    // Scatter-gather write; returns bytes written or -errno. Not available on
+    // Windows -- calling it there is a programming error.
+    ssize_t
+    TStreamSocket::WriteV(const struct iovec* iov, int iovcnt) const {
+#ifndef _win_
+        const auto ret = ::writev(Descriptor, iov, iovcnt);
+        if (ret < 0)
+            return -LastSocketError();
+        return ret;
+#else
+        Y_FAIL("WriteV() unsupported on Windows");
+#endif
+    }
+
+    // Scatter-gather read; returns bytes read or -errno. Not available on Windows.
+    ssize_t
+    TStreamSocket::ReadV(const struct iovec* iov, int iovcnt) const {
+#ifndef _win_
+        const auto ret = ::readv(Descriptor, iov, iovcnt);
+        if (ret < 0)
+            return -LastSocketError();
+        return ret;
+#else
+        Y_FAIL("ReadV() unsupported on Windows");
+#endif
+    }
+
+    // Bytes still queued in the kernel send buffer (TIOCOUTQ);
+    // returns -1 where unavailable (Windows) or on ioctl failure.
+    ssize_t TStreamSocket::GetUnsentQueueSize() const {
+        int num = -1;
+#ifndef _win_ // we have no means to determine output queue size on Windows
+        if (ioctl(Descriptor, TIOCOUTQ, &num) == -1) {
+            num = -1;
+        }
+#endif
+        return num;
+    }
+
+    // Initiates a (non-blocking) connect; returns 0 or -errno
+    // (typically -EINPROGRESS for a non-blocking socket).
+    int
+    TStreamSocket::Connect(const TAddress& addr) const {
+        const auto ret = ::connect(Descriptor, addr.SockAddr(), addr.Size());
+        if (ret < 0)
+            return -LastSocketError();
+
+        return ret;
+    }
+
+    // Overload accepting util's IRemoteAddr; same return convention as above.
+    int
+    TStreamSocket::Connect(const NAddr::IRemoteAddr* addr) const {
+        const auto ret = ::connect(Descriptor, addr->Addr(), addr->Len());
+        if (ret < 0)
+            return -LastSocketError();
+
+        return ret;
+    }
+
+    // Marks the socket as passive with the given backlog; returns 0 or -errno.
+    int
+    TStreamSocket::Listen(int backlog) const {
+        const auto ret = ::listen(Descriptor, backlog);
+        if (ret < 0)
+            return -LastSocketError();
+
+        return ret;
+    }
+
+    // Accepts one pending connection, filling acceptedAddr with the peer address
+    // (buffer sized for sockaddr_in6 covers IPv4 too); returns the new fd or -errno.
+    int
+    TStreamSocket::Accept(TAddress& acceptedAddr) const {
+        socklen_t acceptedSize = sizeof(::sockaddr_in6);
+        const auto ret = ::accept(Descriptor, acceptedAddr.SockAddr(), &acceptedSize);
+        if (ret == INVALID_SOCKET)
+            return -LastSocketError();
+
+        return ret;
+    }
+
+    // Best-effort SO_SNDBUF adjustment; failures are deliberately ignored.
+    void
+    TStreamSocket::SetSendBufferSize(i32 len) const {
+        (void)SetSockOpt(Descriptor, SOL_SOCKET, SO_SNDBUF, len);
+    }
+
+    // Reads the effective SO_SNDBUF value (0 if the query fails).
+    ui32 TStreamSocket::GetSendBufferSize() const {
+        ui32 res = 0;
+        CheckedGetSockOpt(Descriptor, SOL_SOCKET, SO_SNDBUF, res, "SO_SNDBUF");
+        return res;
+    }
+
+ //////////////////////////////////////////////////////
+
+    // Creates a datagram (UDP) socket. Aborts on fd exhaustion; other failures
+    // yield an object wrapping an invalid (-1) descriptor.
+    TDatagramSocket::TPtr TDatagramSocket::Make(int domain) {
+        const SOCKET res = ::socket(domain, SOCK_DGRAM, 0);
+        if (res == -1) {
+            const int err = LastSocketError();
+            Y_VERIFY(err != EMFILE && err != ENFILE);
+        }
+        return std::make_shared<TDatagramSocket>(res);
+    }
+
+    // Wraps an existing datagram-socket descriptor.
+    TDatagramSocket::TDatagramSocket(SOCKET fd)
+        : TSocket(fd)
+    {
+    }
+
+    // Sends one datagram to toAddr; returns bytes sent or -errno.
+    ssize_t
+    TDatagramSocket::SendTo(const void* msg, size_t len, const TAddress& toAddr) const {
+        const auto ret = ::sendto(Descriptor, static_cast<const char*>(msg), int(len), 0, toAddr.SockAddr(), toAddr.Size());
+        if (ret < 0)
+            return -LastSocketError();
+
+        return ret;
+    }
+
+    // Receives one datagram, filling fromAddr with the sender address;
+    // returns bytes read or -errno.
+    ssize_t
+    TDatagramSocket::RecvFrom(void* buf, size_t len, TAddress& fromAddr) const {
+        socklen_t fromSize = sizeof(::sockaddr_in6);
+        const auto ret = ::recvfrom(Descriptor, static_cast<char*>(buf), int(len), 0, fromAddr.SockAddr(), &fromSize);
+        if (ret < 0)
+            return -LastSocketError();
+
+        return ret;
+    }
+
+
+    // deleter for SSL objects
+    // Single polymorphic deleter so std::unique_ptr can own any of the OpenSSL
+    // handle types used below without one deleter type per handle.
+    struct TDeleter {
+        void operator ()(BIO *bio) const {
+            BIO_free(bio);
+        }
+
+        void operator ()(X509 *x509) const {
+            X509_free(x509);
+        }
+
+        void operator ()(RSA *rsa) const {
+            RSA_free(rsa);
+        }
+
+        void operator ()(SSL_CTX *ctx) const {
+            SSL_CTX_free(ctx);
+        }
+    };
+
+    // Owns the shared SSL_CTX from which per-connection SSL objects are created.
+    // Kept behind a pimpl so OpenSSL headers stay out of interconnect_stream.h.
+    class TSecureSocketContext::TImpl {
+        std::unique_ptr<SSL_CTX, TDeleter> Ctx;
+
+    public:
+        // certificate/privateKey are PEM blobs (not file paths); caFilePath is a
+        // path to a CA bundle; empty ciphers selects the AES128-GCM-SHA256 default.
+        TImpl(const TString& certificate, const TString& privateKey, const TString& caFilePath,
+                const TString& ciphers) {
+            int ret;
+            InitOpenSSL();
+#if OPENSSL_VERSION_NUMBER < 0x10100000L
+            Ctx.reset(SSL_CTX_new(TLSv1_2_method()));
+            Y_VERIFY(Ctx, "SSL_CTX_new() failed");
+#else
+            // Pin the protocol to exactly TLS 1.2 on modern OpenSSL as well.
+            Ctx.reset(SSL_CTX_new(TLS_method()));
+            Y_VERIFY(Ctx, "SSL_CTX_new() failed");
+            ret = SSL_CTX_set_min_proto_version(Ctx.get(), TLS1_2_VERSION);
+            Y_VERIFY(ret == 1, "failed to set min proto version");
+            ret = SSL_CTX_set_max_proto_version(Ctx.get(), TLS1_2_VERSION);
+            Y_VERIFY(ret == 1, "failed to set max proto version");
+#endif
+            // Mutual authentication: the peer must present a valid certificate;
+            // Verify() records a readable diagnostic into per-SSL ex_data.
+            SSL_CTX_set_verify(Ctx.get(), SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, &Verify);
+            SSL_CTX_set_mode(*this, SSL_MODE_ENABLE_PARTIAL_WRITE | SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER);
+
+            // apply certificates in SSL context
+            if (certificate) {
+                std::unique_ptr<BIO, TDeleter> bio(BIO_new_mem_buf(certificate.data(), certificate.size()));
+                Y_VERIFY(bio);
+
+                // first certificate in the chain is expected to be a leaf
+                std::unique_ptr<X509, TDeleter> cert(PEM_read_bio_X509(bio.get(), nullptr, nullptr, nullptr));
+                Y_VERIFY(cert, "failed to parse certificate");
+                ret = SSL_CTX_use_certificate(Ctx.get(), cert.get());
+                Y_VERIFY(ret == 1);
+
+                // loading additional certificates in the chain, if any
+                while(true) {
+                    X509 *ca = PEM_read_bio_X509(bio.get(), nullptr, nullptr, nullptr);
+                    if (ca == nullptr) {
+                        break;
+                    }
+                    ret = SSL_CTX_add0_chain_cert(Ctx.get(), ca);
+                    Y_VERIFY(ret == 1);
+                    // we must not free memory if certificate was added successfully by SSL_CTX_add0_chain_cert
+                }
+            }
+            if (privateKey) {
+                std::unique_ptr<BIO, TDeleter> bio(BIO_new_mem_buf(privateKey.data(), privateKey.size()));
+                Y_VERIFY(bio);
+                std::unique_ptr<RSA, TDeleter> pkey(PEM_read_bio_RSAPrivateKey(bio.get(), nullptr, nullptr, nullptr));
+                Y_VERIFY(pkey);
+                ret = SSL_CTX_use_RSAPrivateKey(Ctx.get(), pkey.get());
+                Y_VERIFY(ret == 1);
+            }
+            if (caFilePath) {
+                ret = SSL_CTX_load_verify_locations(Ctx.get(), caFilePath.data(), nullptr);
+                Y_VERIFY(ret == 1);
+            }
+
+            int success = SSL_CTX_set_cipher_list(Ctx.get(), ciphers ? ciphers.data() : "AES128-GCM-SHA256");
+            Y_VERIFY(success, "failed to set cipher list");
+        }
+
+        operator SSL_CTX*() const {
+            return Ctx.get();
+        }
+
+        // Process-wide ex_data slot used to attach an error-description string
+        // to each SSL object (see TSecureSocket::TImpl ctor and Verify below).
+        static int GetExIndex() {
+            static int index = SSL_get_ex_new_index(0, nullptr, nullptr, nullptr, nullptr);
+            return index;
+        }
+
+    private:
+        // Certificate verification callback: on failure, formats a diagnostic
+        // (error, depth, subject, and issuer when the issuer is missing) into the
+        // TString attached to the SSL object; the preverify result is unchanged.
+        static int Verify(int preverify, X509_STORE_CTX *ctx) {
+            if (!preverify) {
+                X509 *badCert = X509_STORE_CTX_get_current_cert(ctx);
+                int err = X509_STORE_CTX_get_error(ctx);
+                int depth = X509_STORE_CTX_get_error_depth(ctx);
+                SSL *ssl = static_cast<SSL*>(X509_STORE_CTX_get_ex_data(ctx, SSL_get_ex_data_X509_STORE_CTX_idx()));
+                TString *errp = static_cast<TString*>(SSL_get_ex_data(ssl, GetExIndex()));
+                char buffer[1024];
+                X509_NAME_oneline(X509_get_subject_name(badCert), buffer, sizeof(buffer));
+                TStringBuilder s;
+                s << "Error during certificate validation"
+                  << " error# " << X509_verify_cert_error_string(err)
+                  << " depth# " << depth
+                  << " cert# " << buffer;
+                if (err == X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT) {
+                    X509_NAME_oneline(X509_get_issuer_name(badCert), buffer, sizeof(buffer));
+                    s << " issuer# " << buffer;
+                }
+                *errp = s;
+            }
+            return preverify;
+        }
+    };
+
+    // Forwards all TLS configuration to the pimpl; see TImpl for parameter semantics.
+    TSecureSocketContext::TSecureSocketContext(const TString& certificate, const TString& privateKey,
+                                               const TString& caFilePath, const TString& ciphers)
+        : Impl(new TImpl(certificate, privateKey, caFilePath, ciphers))
+    {}
+
+    // Out-of-line so THolder can destroy the incomplete TImpl type.
+    TSecureSocketContext::~TSecureSocketContext()
+    {}
+
+    // Per-connection TLS engine. Wraps one SSL object over a non-blocking fd and
+    // translates OpenSSL status codes into the "-errno" convention used by the
+    // rest of the interconnect stream layer.
+    class TSecureSocket::TImpl {
+        SSL *Ssl;
+        TString ErrorDescription; // filled by the certificate Verify callback via ex_data
+        bool WantRead_ = false;   // last operation blocked waiting for fd readability
+        bool WantWrite_ = false;  // last operation blocked waiting for fd writability
+
+    public:
+        TImpl(SSL_CTX *ctx, int fd)
+            : Ssl(SSL_new(ctx))
+        {
+            Y_VERIFY(Ssl, "SSL_new() failed");
+            SSL_set_fd(Ssl, fd);
+            // Let the verify callback write its diagnostics into ErrorDescription.
+            SSL_set_ex_data(Ssl, TSecureSocketContext::TImpl::GetExIndex(), &ErrorDescription);
+        }
+
+        ~TImpl() {
+            SSL_free(Ssl);
+        }
+
+        // Human-readable failure text: prefers the verify-callback description,
+        // otherwise drains the OpenSSL error queue into a string.
+        TString GetErrorStack() {
+            if (ErrorDescription) {
+                return ErrorDescription;
+            }
+            std::unique_ptr<BIO, int(*)(BIO*)> mem(BIO_new(BIO_s_mem()), BIO_free);
+            ERR_print_errors(mem.get());
+            char *p = nullptr;
+            auto len = BIO_get_mem_data(mem.get(), &p);
+            return TString(p, len);
+        }
+
+        // Maps an SSL_get_error() code to EStatus, filling err for fatal cases.
+        EStatus ConvertResult(int res, TString& err) {
+            switch (res) {
+                case SSL_ERROR_NONE:
+                    return EStatus::SUCCESS;
+
+                case SSL_ERROR_WANT_READ:
+                    return EStatus::WANT_READ;
+
+                case SSL_ERROR_WANT_WRITE:
+                    return EStatus::WANT_WRITE;
+
+                case SSL_ERROR_SYSCALL:
+                    err = TStringBuilder() << "syscall error: " << strerror(LastSocketError()) << ": " << GetErrorStack();
+                    break;
+
+                case SSL_ERROR_ZERO_RETURN:
+                    err = "TLS negotiation failed";
+                    break;
+
+                case SSL_ERROR_SSL:
+                    err = "SSL error: " + GetErrorStack();
+                    break;
+
+                default:
+                    err = "unknown OpenSSL error";
+                    break;
+            }
+            return EStatus::ERROR;
+        }
+
+        // Handshake progress marker: the handshake is resumable, so Establish()
+        // may be re-entered after WANT_READ/WANT_WRITE and continues where it left off.
+        enum EConnectState {
+            CONNECT,
+            SHUTDOWN,
+            READ,
+        } ConnectState = EConnectState::CONNECT;
+
+        // Drives the TLS handshake. With authOnly the session is torn down again
+        // right after authentication (SHUTDOWN then draining READ), so traffic
+        // continues in plaintext while identities have still been verified.
+        EStatus Establish(bool server, bool authOnly, TString& err) {
+            switch (ConnectState) {
+                case EConnectState::CONNECT: {
+                    auto callback = server ? SSL_accept : SSL_connect;
+                    const EStatus status = ConvertResult(SSL_get_error(Ssl, callback(Ssl)), err);
+                    if (status != EStatus::SUCCESS || !authOnly) {
+                        return status;
+                    }
+                    ConnectState = EConnectState::SHUTDOWN;
+                    [[fallthrough]];
+                }
+
+                case EConnectState::SHUTDOWN: {
+                    const int res = SSL_shutdown(Ssl);
+                    if (res == 1) {
+                        return EStatus::SUCCESS;
+                    } else if (res != 0) {
+                        return ConvertResult(SSL_get_error(Ssl, res), err);
+                    }
+                    // res == 0: our close_notify is out, await the peer's.
+                    ConnectState = EConnectState::READ;
+                    [[fallthrough]];
+                }
+
+                case EConnectState::READ: {
+                    char data[256];
+                    size_t numRead = 0;
+                    const int res = SSL_get_error(Ssl, SSL_read_ex(Ssl, data, sizeof(data), &numRead));
+                    if (res == SSL_ERROR_ZERO_RETURN) {
+                        return EStatus::SUCCESS;
+                    } else if (res != SSL_ERROR_NONE) {
+                        return ConvertResult(res, err);
+                    } else if (numRead) {
+                        // Application data during auth-only teardown is a protocol violation.
+                        err = "non-zero return from SSL_read_ex: " + ToString(numRead);
+                        return EStatus::ERROR;
+                    } else {
+                        return EStatus::SUCCESS;
+                    }
+                }
+            }
+            Y_FAIL();
+        }
+
+        // OpenSSL requires a blocked SSL_write to be retried with the very same
+        // buffer/length; this remembers the blocked call to assert that invariant.
+        std::optional<std::pair<const void*, size_t>> BlockedSend;
+
+        ssize_t Send(const void* msg, size_t len, TString *err) {
+            Y_VERIFY(!BlockedSend || *BlockedSend == std::make_pair(msg, len));
+            const ssize_t res = Operate(msg, len, &SSL_write_ex, err);
+            if (res == -EAGAIN) {
+                BlockedSend.emplace(msg, len);
+            } else {
+                BlockedSend.reset();
+            }
+            return res;
+        }
+
+        // Same retry invariant for reads.
+        std::optional<std::pair<void*, size_t>> BlockedReceive;
+
+        ssize_t Recv(void* msg, size_t len, TString *err) {
+            Y_VERIFY(!BlockedReceive || *BlockedReceive == std::make_pair(msg, len));
+            const ssize_t res = Operate(msg, len, &SSL_read_ex, err);
+            if (res == -EAGAIN) {
+                BlockedReceive.emplace(msg, len);
+            } else {
+                BlockedReceive.reset();
+            }
+            return res;
+        }
+
+        TString GetCipherName() const {
+            return SSL_get_cipher_name(Ssl);
+        }
+
+        int GetCipherBits() const {
+            return SSL_get_cipher_bits(Ssl, nullptr);
+        }
+
+        TString GetProtocolName() const {
+            return SSL_get_cipher_version(Ssl);
+        }
+
+        // CN of the peer certificate subject, or an empty string when the peer
+        // presented no certificate.
+        TString GetPeerCommonName() const {
+            TString res;
+            if (X509 *cert = SSL_get_peer_certificate(Ssl)) {
+                char buffer[256];
+                memset(buffer, 0, sizeof(buffer));
+                if (X509_NAME *name = X509_get_subject_name(cert)) {
+                    X509_NAME_get_text_by_NID(name, NID_commonName, buffer, sizeof(buffer));
+                }
+                X509_free(cert);
+                res = TString(buffer, strnlen(buffer, sizeof(buffer)));
+            }
+            return res;
+        }
+
+        bool WantRead() const {
+            return WantRead_;
+        }
+
+        bool WantWrite() const {
+            return WantWrite_;
+        }
+
+    private:
+        // Runs one SSL_read_ex/SSL_write_ex call and converts the outcome:
+        // >0 bytes processed, 0 clean shutdown, -EAGAIN would-block (with the
+        // matching Want* flag raised), -errno syscall failure, -EPROTO TLS failure.
+        template<typename TBuffer, typename TOp>
+        ssize_t Operate(TBuffer* buffer, size_t len, TOp&& op, TString *err) {
+            WantRead_ = WantWrite_ = false;
+            size_t processed = 0;
+            int ret = op(Ssl, buffer, len, &processed);
+            if (ret == 1) {
+                return processed;
+            }
+            switch (const int status = SSL_get_error(Ssl, ret)) {
+                case SSL_ERROR_ZERO_RETURN:
+                    return 0;
+
+                case SSL_ERROR_WANT_READ:
+                    WantRead_ = true;
+                    return -EAGAIN;
+
+                case SSL_ERROR_WANT_WRITE:
+                    WantWrite_ = true;
+                    return -EAGAIN;
+
+                case SSL_ERROR_SYSCALL:
+                    return -LastSocketError();
+
+                case SSL_ERROR_SSL:
+                    if (err) {
+                        *err = GetErrorStack();
+                    }
+                    return -EPROTO;
+
+                default:
+                    Y_FAIL("unexpected SSL_get_error() status# %d", status);
+            }
+        }
+    };
+
+    // Upgrades a plain stream socket to TLS: steals its descriptor (the donor
+    // socket becomes empty) and binds an SSL engine from the shared context to it.
+    TSecureSocket::TSecureSocket(TStreamSocket& socket, TSecureSocketContext::TPtr context)
+        : TStreamSocket(socket.ReleaseDescriptor())
+        , Context(std::move(context))
+        , Impl(new TImpl(*Context->Impl, Descriptor))
+    {}
+
+    // Out-of-line so THolder can destroy the incomplete TImpl type.
+    TSecureSocket::~TSecureSocket()
+    {}
+
+    // Drives the (resumable) TLS handshake; see TImpl::Establish for semantics.
+    TSecureSocket::EStatus TSecureSocket::Establish(bool server, bool authOnly, TString& err) const {
+        return Impl->Establish(server, authOnly, err);
+    }
+
+    // Hands the raw descriptor back as a plain stream socket (used after an
+    // auth-only handshake); this secure socket is left empty.
+    TIntrusivePtr<TStreamSocket> TSecureSocket::Detach() {
+        return MakeIntrusive<TStreamSocket>(ReleaseDescriptor());
+    }
+
+    // Encrypted send; same return convention as TStreamSocket::Send.
+    ssize_t TSecureSocket::Send(const void* msg, size_t len, TString *err) const {
+        return Impl->Send(msg, len, err);
+    }
+
+    // Encrypted receive; same return convention as TStreamSocket::Recv.
+    ssize_t TSecureSocket::Recv(void* msg, size_t len, TString *err) const {
+        return Impl->Recv(msg, len, err);
+    }
+
+    // Vectored I/O is not implemented over TLS; callers must use Send().
+    ssize_t TSecureSocket::WriteV(const struct iovec* /*iov*/, int /*iovcnt*/) const {
+        Y_FAIL("unsupported on SSL sockets");
+    }
+
+    // Vectored I/O is not implemented over TLS; callers must use Recv().
+    ssize_t TSecureSocket::ReadV(const struct iovec* /*iov*/, int /*iovcnt*/) const {
+        Y_FAIL("unsupported on SSL sockets");
+    }
+
+    // Negotiated cipher name (for monitoring/logging).
+    TString TSecureSocket::GetCipherName() const {
+        return Impl->GetCipherName();
+    }
+
+    // Negotiated cipher strength in bits.
+    int TSecureSocket::GetCipherBits() const {
+        return Impl->GetCipherBits();
+    }
+
+    // Negotiated protocol version string (e.g. for monitoring pages).
+    TString TSecureSocket::GetProtocolName() const {
+        return Impl->GetProtocolName();
+    }
+
+    // CN from the peer certificate, empty if none was presented.
+    TString TSecureSocket::GetPeerCommonName() const {
+        return Impl->GetPeerCommonName();
+    }
+
+    // True when the last blocked TLS operation needs the fd to become readable.
+    bool TSecureSocket::WantRead() const {
+        return Impl->WantRead();
+    }
+
+    // True when the last blocked TLS operation needs the fd to become writable.
+    bool TSecureSocket::WantWrite() const {
+        return Impl->WantWrite();
+    }
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_stream.h b/library/cpp/actors/interconnect/interconnect_stream.h
new file mode 100644
index 0000000000..074adc6e74
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_stream.h
@@ -0,0 +1,131 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <util/generic/noncopyable.h>
+#include <util/network/address.h>
+#include <util/network/init.h>
+#include <util/system/defaults.h>
+
+#include "poller.h"
+
+#include "interconnect_address.h"
+
+#include <memory>
+
+#include <sys/uio.h>
+
+namespace NInterconnect {
+    // Non-copyable owner of a native socket descriptor; derives from
+    // TSharedDescriptor so the actor-system poller can hold a reference too.
+    class TSocket: public NActors::TSharedDescriptor, public TNonCopyable {
+    protected:
+        TSocket(SOCKET fd);
+
+        virtual ~TSocket() override;
+
+        SOCKET Descriptor;
+
+        virtual int GetDescriptor() override;
+
+    private:
+        friend class TSecureSocket;
+
+        // Transfers the descriptor to another owner (used when upgrading to TLS
+        // and when detaching back), leaving this object holding INVALID_SOCKET.
+        SOCKET ReleaseDescriptor() {
+            return std::exchange(Descriptor, INVALID_SOCKET);
+        }
+
+    public:
+        operator SOCKET() const {
+            return Descriptor;
+        }
+
+        int Bind(const TAddress& addr) const;
+        int Shutdown(int how) const;
+        int GetConnectStatus() const;
+    };
+
+    // Connection-oriented (TCP) socket. I/O methods return bytes transferred or
+    // a negative errno; they are virtual so TSecureSocket can layer TLS on top.
+    class TStreamSocket: public TSocket {
+    public:
+        TStreamSocket(SOCKET fd);
+
+        // Creates a non-blocking stream socket; see the .cpp for failure semantics.
+        static TIntrusivePtr<TStreamSocket> Make(int domain);
+
+        virtual ssize_t Send(const void* msg, size_t len, TString *err = nullptr) const;
+        virtual ssize_t Recv(void* buf, size_t len, TString *err = nullptr) const;
+
+        virtual ssize_t WriteV(const struct iovec* iov, int iovcnt) const;
+        virtual ssize_t ReadV(const struct iovec* iov, int iovcnt) const;
+
+        int Connect(const TAddress& addr) const;
+        int Connect(const NAddr::IRemoteAddr* addr) const;
+        int Listen(int backlog) const;
+        int Accept(TAddress& acceptedAddr) const;
+
+        // Kernel send-queue occupancy in bytes, -1 where unsupported.
+        ssize_t GetUnsentQueueSize() const;
+
+        void SetSendBufferSize(i32 len) const;
+        ui32 GetSendBufferSize() const;
+    };
+
+    // Shared, immutable TLS configuration (certificates, keys, ciphers) from
+    // which individual TSecureSocket instances are created; pimpl hides OpenSSL.
+    class TSecureSocketContext {
+        class TImpl;
+        THolder<TImpl> Impl;
+
+        friend class TSecureSocket;
+
+    public:
+        TSecureSocketContext(const TString& certificate, const TString& privateKey, const TString& caFilePath,
+            const TString& ciphers);
+        ~TSecureSocketContext();
+
+    public:
+        using TPtr = std::shared_ptr<TSecureSocketContext>;
+    };
+
+    // TLS-wrapped stream socket. Constructed by stealing the descriptor from an
+    // already-connected TStreamSocket; Establish() runs the handshake, Detach()
+    // returns the raw descriptor as a plain socket again.
+    class TSecureSocket : public TStreamSocket {
+        TSecureSocketContext::TPtr Context;
+
+        class TImpl;
+        THolder<TImpl> Impl;
+
+    public:
+        // Outcome of a handshake step; WANT_* mean "retry once fd is ready".
+        enum class EStatus {
+            SUCCESS,
+            ERROR,
+            WANT_READ,
+            WANT_WRITE,
+        };
+
+    public:
+        TSecureSocket(TStreamSocket& socket, TSecureSocketContext::TPtr context);
+        ~TSecureSocket();
+
+        EStatus Establish(bool server, bool authOnly, TString& err) const;
+        TIntrusivePtr<TStreamSocket> Detach();
+
+        ssize_t Send(const void* msg, size_t len, TString *err) const override;
+        ssize_t Recv(void* msg, size_t len, TString *err) const override;
+
+        // Vectored I/O is unsupported over TLS (both overrides abort).
+        ssize_t WriteV(const struct iovec* iov, int iovcnt) const override;
+        ssize_t ReadV(const struct iovec* iov, int iovcnt) const override;
+
+        TString GetCipherName() const;
+        int GetCipherBits() const;
+        TString GetProtocolName() const;
+        TString GetPeerCommonName() const;
+
+        bool WantRead() const;
+        bool WantWrite() const;
+    };
+
+    // Datagram (UDP) socket; SendTo/RecvFrom return bytes or a negative errno.
+    class TDatagramSocket: public TSocket {
+    public:
+        typedef std::shared_ptr<TDatagramSocket> TPtr;
+
+        TDatagramSocket(SOCKET fd);
+
+        static TPtr Make(int domain);
+
+        ssize_t SendTo(const void* msg, size_t len, const TAddress& toAddr) const;
+        ssize_t RecvFrom(void* buf, size_t len, TAddress& fromAddr) const;
+    };
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_tcp_input_session.cpp b/library/cpp/actors/interconnect/interconnect_tcp_input_session.cpp
new file mode 100644
index 0000000000..0abe9fe659
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_tcp_input_session.cpp
@@ -0,0 +1,476 @@
+#include "interconnect_tcp_session.h"
+#include "interconnect_tcp_proxy.h"
+#include <library/cpp/actors/core/probes.h>
+#include <library/cpp/actors/util/datetime.h>
+
+namespace NActors {
+ LWTRACE_USING(ACTORLIB_PROVIDER);
+
+    // Input (receiving) half of an interconnect TCP session. Reads packets from the
+    // socket, validates framing/checksums, and forwards deserialized events into the
+    // local actor system; progress is reported to the owning session actor (SessionId).
+    TInputSessionTCP::TInputSessionTCP(const TActorId& sessionId, TIntrusivePtr<NInterconnect::TStreamSocket> socket,
+            TIntrusivePtr<TReceiveContext> context, TInterconnectProxyCommon::TPtr common,
+            std::shared_ptr<IInterconnectMetrics> metrics, ui32 nodeId, ui64 lastConfirmed,
+            TDuration deadPeerTimeout, TSessionParams params)
+        : SessionId(sessionId)
+        , Socket(std::move(socket))
+        , Context(std::move(context))
+        , Common(std::move(common))
+        , NodeId(nodeId)
+        , Params(std::move(params))
+        , ConfirmedByInput(lastConfirmed)
+        , Metrics(std::move(metrics))
+        , DeadPeerTimeout(deadPeerTimeout)
+    {
+        Y_VERIFY(Context);
+        Y_VERIFY(Socket);
+        Y_VERIFY(SessionId);
+
+        AtomicSet(Context->PacketsReadFromSocket, 0);
+
+        Metrics->SetClockSkewMicrosec(0);
+
+        Context->UpdateState = EUpdateState::NONE;
+
+        // ensure that we do not spawn new session while the previous one is still alive
+        TAtomicBase sessions = AtomicIncrement(Context->NumInputSessions);
+        Y_VERIFY(sessions == 1, "sessions# %" PRIu64, ui64(sessions));
+    }
+
+    // Actor entry point: arm the dead-peer watchdog (first TEvCheckDeadPeer fires after
+    // DeadPeerTimeout) and immediately start draining the socket.
+    void TInputSessionTCP::Bootstrap() {
+        SetPrefix(Sprintf("InputSession %s [node %" PRIu32 "]", SelfId().ToString().data(), NodeId));
+        Become(&TThis::WorkingState, DeadPeerTimeout, new TEvCheckDeadPeer);
+        LOG_DEBUG_IC_SESSION("ICIS01", "InputSession created");
+        LastReceiveTimestamp = TActivationContext::Now();
+        ReceiveData();
+    }
+
+    // Debug hook: request session closure; the flag is observed inside ReadMore(),
+    // which then tears the connection down as if the socket had failed.
+    void TInputSessionTCP::CloseInputSession() {
+        CloseInputSessionRequested = true;
+        ReceiveData();
+    }
+
+    // Poller readiness notification. Cookie == 0 marks a genuine poller wakeup (used
+    // for wakeup-accounting metrics); for encrypted sessions a read-readiness event may
+    // also unblock the writer, so it is forwarded to the session actor (cookie 1).
+    void TInputSessionTCP::Handle(TEvPollerReady::TPtr ev) {
+        if (Context->ReadPending) {
+            Metrics->IncUsefulReadWakeups();
+        } else if (!ev->Cookie) {
+            Metrics->IncSpuriousReadWakeups();
+        }
+        Context->ReadPending = false;
+        ReceiveData();
+        if (Params.Encryption && Context->WriteBlockedByFullSendBuffer && !ev->Cookie) {
+            Send(SessionId, ev->Release().Release(), 0, 1);
+        }
+    }
+
+    // Store the poller token obtained from registration and retry reading -- data may
+    // have arrived while registration was in flight.
+    void TInputSessionTCP::Handle(TEvPollerRegisterResult::TPtr ev) {
+        PollerToken = std::move(ev->Get()->PollerToken);
+        ReceiveData();
+    }
+
+    // Continuation of a read loop that previously exceeded its per-event time budget
+    // (see TEvResumeReceiveData sent from ReceiveData()).
+    void TInputSessionTCP::HandleResumeReceiveData() {
+        ReceiveData();
+    }
+
+    // Main read loop: drain the socket into IncomingData, alternating between parsing
+    // packet headers and payloads until the socket has no more data or the per-event
+    // time budget is exceeded; finally publish accumulated progress (confirmed serial,
+    // byte count, min ping) to the session actor via TEvUpdateFromInputSession.
+    void TInputSessionTCP::ReceiveData() {
+        TTimeLimit limit(GetMaxCyclesPerEvent());
+        ui64 numDataBytes = 0;
+        const size_t headerLen = Params.UseModernFrame ? sizeof(TTcpPacketHeader_v2) : sizeof(TTcpPacketHeader_v1);
+
+        LOG_DEBUG_IC_SESSION("ICIS02", "ReceiveData called");
+
+        for (int iteration = 0; Socket; ++iteration) {
+            if (iteration && limit.CheckExceeded()) {
+                // we have hit processing time limit for this message, send notification to resume processing a bit later
+                Send(SelfId(), new TEvResumeReceiveData);
+                break;
+            }
+
+            switch (State) {
+                case EState::HEADER:
+                    if (IncomingData.GetSize() < headerLen) {
+                        break; // not enough buffered bytes for a full header yet
+                    } else {
+                        ProcessHeader(headerLen);
+                    }
+                    continue;
+
+                case EState::PAYLOAD:
+                    if (!IncomingData) {
+                        break; // no buffered bytes at all
+                    } else {
+                        ProcessPayload(numDataBytes);
+                    }
+                    continue;
+            }
+
+            // if we have reached this point, it means that we do not have enough data in read buffer; try to obtain some
+            if (!ReadMore()) {
+                // we have no data from socket, so we have some free time to spend -- preallocate buffers using this time
+                PreallocateBuffers();
+                break;
+            }
+        }
+
+        // calculate ping time
+        auto it = std::min_element(PingQ.begin(), PingQ.end());
+        const TDuration ping = it != PingQ.end() ? *it : TDuration::Zero();
+
+        // send update to main session actor if something valuable has changed
+        if (!UpdateFromInputSession) {
+            UpdateFromInputSession = MakeHolder<TEvUpdateFromInputSession>(ConfirmedByInput, numDataBytes, ping);
+        } else {
+            Y_VERIFY(ConfirmedByInput >= UpdateFromInputSession->ConfirmedByInput);
+            UpdateFromInputSession->ConfirmedByInput = ConfirmedByInput;
+            UpdateFromInputSession->NumDataBytes += numDataBytes;
+            UpdateFromInputSession->Ping = Min(UpdateFromInputSession->Ping, ping);
+        }
+
+        // Lock-free protocol with the session actor: keep at most one update event in
+        // flight. If one is already inflight, record INFLIGHT_AND_PENDING and wait for
+        // the session actor to kick us via TEvConfirmUpdate (see HandleConfirmUpdate).
+        for (;;) {
+            EUpdateState state = Context->UpdateState;
+            EUpdateState next;
+
+            // calculate next state
+            switch (state) {
+                case EUpdateState::NONE:
+                case EUpdateState::CONFIRMING:
+                    // we have no inflight messages to session actor, we will issue one a bit later
+                    next = EUpdateState::INFLIGHT;
+                    break;
+
+                case EUpdateState::INFLIGHT:
+                case EUpdateState::INFLIGHT_AND_PENDING:
+                    // we already have inflight message, so we will keep pending message and session actor will issue
+                    // TEvConfirmUpdate to kick processing
+                    next = EUpdateState::INFLIGHT_AND_PENDING;
+                    break;
+            }
+
+            if (Context->UpdateState.compare_exchange_weak(state, next)) {
+                switch (next) {
+                    case EUpdateState::INFLIGHT:
+                        Send(SessionId, UpdateFromInputSession.Release());
+                        break;
+
+                    case EUpdateState::INFLIGHT_AND_PENDING:
+                        Y_VERIFY(UpdateFromInputSession);
+                        break;
+
+                    default:
+                        Y_FAIL("unexpected state");
+                }
+                break;
+            }
+        }
+    }
+
+    // Parse one packet header (v1 or v2 framing), validate its checksum, update the
+    // confirmation watermark, and either switch to PAYLOAD state or handle the special
+    // zero-payload control packets (legacy ping, ping request/response, clock sample).
+    void TInputSessionTCP::ProcessHeader(size_t headerLen) {
+        const bool success = IncomingData.ExtractFrontPlain(Header.Data, headerLen);
+        Y_VERIFY(success); // caller guarantees headerLen bytes are buffered
+        if (Params.UseModernFrame) {
+            PayloadSize = Header.v2.PayloadLength;
+            HeaderSerial = Header.v2.Serial;
+            HeaderConfirm = Header.v2.Confirm;
+            if (!Params.Encryption) {
+                // v2 checksum covers header (with its Checksum field zeroed) plus payload;
+                // for empty payloads it can be validated right away.
+                ChecksumExpected = std::exchange(Header.v2.Checksum, 0);
+                Checksum = Crc32cExtendMSanCompatible(0, &Header.v2, sizeof(Header.v2)); // start calculating checksum now
+                if (!PayloadSize && Checksum != ChecksumExpected) {
+                    LOG_ERROR_IC_SESSION("ICIS10", "payload checksum error");
+                    return ReestablishConnection(TDisconnectReason::ChecksumError());
+                }
+            }
+        } else if (!Header.v1.Check()) {
+            LOG_ERROR_IC_SESSION("ICIS03", "header checksum error");
+            return ReestablishConnection(TDisconnectReason::ChecksumError());
+        } else {
+            // v1 framing: separate header and payload checksums
+            PayloadSize = Header.v1.DataSize;
+            HeaderSerial = Header.v1.Serial;
+            HeaderConfirm = Header.v1.Confirm;
+            ChecksumExpected = Header.v1.PayloadCRC32;
+            Checksum = 0;
+        }
+        if (PayloadSize >= 65536) {
+            LOG_CRIT_IC_SESSION("ICIS07", "payload is way too big");
+            return DestroySession(TDisconnectReason::FormatError());
+        }
+        if (ConfirmedByInput < HeaderConfirm) {
+            ConfirmedByInput = HeaderConfirm;
+            // legacy ping measurement: the peer has confirmed the marked control packet,
+            // so round-trip time can be derived from its recorded send timestamp
+            if (AtomicGet(Context->ControlPacketId) <= HeaderConfirm && !NewPingProtocol) {
+                ui64 sendTime = AtomicGet(Context->ControlPacketSendTimer);
+                TDuration duration = CyclesToDuration(GetCycleCountFast() - sendTime);
+                const auto durationUs = duration.MicroSeconds();
+                Metrics->UpdateLegacyPingTimeHist(durationUs);
+                PingQ.push_back(duration);
+                if (PingQ.size() > 16) {
+                    PingQ.pop_front();
+                }
+                AtomicSet(Context->ControlPacketId, 0ULL);
+            }
+        }
+        if (PayloadSize) {
+            const ui64 expected = Context->GetLastProcessedPacketSerial() + 1;
+            if (HeaderSerial == 0 || HeaderSerial > expected) {
+                LOG_CRIT_IC_SESSION("ICIS06", "packet serial %" PRIu64 ", but %" PRIu64 " expected", HeaderSerial, expected);
+                return DestroySession(TDisconnectReason::FormatError());
+            }
+            // an already-processed (retransmitted) packet is consumed but its events dropped
+            IgnorePayload = HeaderSerial != expected;
+            State = EState::PAYLOAD;
+        } else if (HeaderSerial & TTcpPacketBuf::PingRequestMask) {
+            // empty-payload control packets encode their role in the serial's mask bits
+            Send(SessionId, new TEvProcessPingRequest(HeaderSerial & ~TTcpPacketBuf::PingRequestMask));
+        } else if (HeaderSerial & TTcpPacketBuf::PingResponseMask) {
+            const ui64 sent = HeaderSerial & ~TTcpPacketBuf::PingResponseMask;
+            const ui64 received = GetCycleCountFast();
+            HandlePingResponse(CyclesToDuration(received - sent));
+        } else if (HeaderSerial & TTcpPacketBuf::ClockMask) {
+            HandleClock(TInstant::MicroSeconds(HeaderSerial & ~TTcpPacketBuf::ClockMask));
+        }
+    }
+
+    // Accumulate payload bytes for the current packet; once complete, validate the
+    // payload checksum (unless encryption already guarantees integrity) and split the
+    // payload into per-channel event parts, dispatching fully assembled events.
+    void TInputSessionTCP::ProcessPayload(ui64& numDataBytes) {
+        const size_t numBytes = Min(PayloadSize, IncomingData.GetSize());
+        IncomingData.ExtractFront(numBytes, &Payload);
+        numDataBytes += numBytes;
+        PayloadSize -= numBytes;
+        if (PayloadSize) {
+            return; // there is still some data to receive in the Payload rope
+        }
+        State = EState::HEADER; // we'll continue with header next time
+        if (!Params.UseModernFrame || !Params.Encryption) { // see if we are checksumming packet body
+            for (const auto&& [data, size] : Payload) {
+                Checksum = Crc32cExtendMSanCompatible(Checksum, data, size);
+            }
+            if (Checksum != ChecksumExpected) { // validate payload checksum
+                LOG_ERROR_IC_SESSION("ICIS04", "payload checksum error");
+                return ReestablishConnection(TDisconnectReason::ChecksumError());
+            }
+        }
+        if (Y_UNLIKELY(IgnorePayload)) {
+            return; // retransmitted packet -- already processed, drop its contents
+        }
+        if (!Context->AdvanceLastProcessedPacketSerial()) {
+            return DestroySession(TDisconnectReason::NewSession());
+        }
+
+        while (Payload && Socket) {
+            // extract channel part header from the payload stream
+            TChannelPart part;
+            if (!Payload.ExtractFrontPlain(&part, sizeof(part))) {
+                LOG_CRIT_IC_SESSION("ICIS14", "missing TChannelPart header in payload");
+                return DestroySession(TDisconnectReason::FormatError());
+            }
+            if (!part.Size) { // bogus frame
+                continue;
+            } else if (Payload.GetSize() < part.Size) {
+                LOG_CRIT_IC_SESSION("ICIS08", "payload format error ChannelPart# %s", part.ToString().data());
+                return DestroySession(TDisconnectReason::FormatError());
+            }
+
+            // per-channel reassembly buffer: small channel ids use the flat array,
+            // larger ones fall back to the map
+            const ui16 channel = part.Channel & ~TChannelPart::LastPartFlag;
+            TRope *eventData = channel < Context->ChannelArray.size()
+                ? &Context->ChannelArray[channel]
+                : &Context->ChannelMap[channel];
+
+            Metrics->AddInputChannelsIncomingTraffic(channel, sizeof(part) + part.Size);
+
+            TEventDescr descr;
+            if (~part.Channel & TChannelPart::LastPartFlag) {
+                // intermediate part: just append bytes to the channel's rope
+                Payload.ExtractFront(part.Size, eventData);
+            } else if (part.Size != sizeof(descr)) {
+                LOG_CRIT_IC_SESSION("ICIS11", "incorrect last part of an event");
+                return DestroySession(TDisconnectReason::FormatError());
+            } else if (Payload.ExtractFrontPlain(&descr, sizeof(descr))) {
+                // final part carries the event descriptor -- event is now complete
+                Metrics->IncInputChannelsIncomingEvents(channel);
+                ProcessEvent(*eventData, descr);
+                *eventData = TRope();
+            } else {
+                Y_FAIL(); // size was checked above, extraction cannot fail
+            }
+        }
+    }
+
+    // Validate a fully reassembled event (per-event checksum where applicable), wrap it
+    // into an IEventHandle, filter it by scope policy, and inject it into the local
+    // actor system.
+    void TInputSessionTCP::ProcessEvent(TRope& data, TEventDescr& descr) {
+        // modern frames carry a per-event checksum only when it is non-zero;
+        // legacy frames always checksum event bodies
+        if (!Params.UseModernFrame || descr.Checksum) {
+            ui32 checksum = 0;
+            for (const auto&& [data, size] : data) {
+                checksum = Crc32cExtendMSanCompatible(checksum, data, size);
+            }
+            if (checksum != descr.Checksum) {
+                LOG_CRIT_IC_SESSION("ICIS05", "event checksum error");
+                return ReestablishConnection(TDisconnectReason::ChecksumError());
+            }
+        }
+        auto ev = std::make_unique<IEventHandle>(SessionId,
+            descr.Type,
+            descr.Flags & ~IEventHandle::FlagExtendedFormat,
+            descr.Recipient,
+            descr.Sender,
+            MakeIntrusive<TEventSerializedData>(std::move(data), bool(descr.Flags & IEventHandle::FlagExtendedFormat)),
+            descr.Cookie,
+            Params.PeerScopeId,
+            NWilson::TTraceId(descr.TraceId));
+        // scope filter may reject cross-scope events (e.g. tenant isolation)
+        if (Common->EventFilter && !Common->EventFilter->CheckIncomingEvent(*ev, Common->LocalScopeId)) {
+            LOG_CRIT_IC_SESSION("ICIC03", "Event dropped due to scope error LocalScopeId# %s PeerScopeId# %s Type# 0x%08" PRIx32,
+                ScopeIdToString(Common->LocalScopeId).data(), ScopeIdToString(Params.PeerScopeId).data(), descr.Type);
+            ev.reset();
+        }
+        if (ev) {
+            TActivationContext::Send(ev.release());
+        }
+    }
+
+    // Session actor acknowledged the previous update and moved UpdateState to
+    // CONFIRMING; publish the pending update (if we win the CAS race). See the state
+    // machine in ReceiveData() for the counterpart transitions.
+    void TInputSessionTCP::HandleConfirmUpdate() {
+        for (;;) {
+            switch (EUpdateState state = Context->UpdateState) {
+                case EUpdateState::NONE:
+                case EUpdateState::INFLIGHT:
+                case EUpdateState::INFLIGHT_AND_PENDING:
+                    // here we may have a race
+                    return;
+
+                case EUpdateState::CONFIRMING:
+                    Y_VERIFY(UpdateFromInputSession);
+                    if (Context->UpdateState.compare_exchange_weak(state, EUpdateState::INFLIGHT)) {
+                        Send(SessionId, UpdateFromInputSession.Release());
+                        return;
+                    }
+                    // CAS failed (weak/concurrent change) -- re-read state and retry
+            }
+        }
+    }
+
+    // Read from the socket into preallocated aligned buffers and append the received
+    // bytes to IncomingData. Returns true when at least one byte was read; on EAGAIN,
+    // arms the poller and returns false; on error or requested closure, tears the
+    // connection down and returns false.
+    bool TInputSessionTCP::ReadMore() {
+        PreallocateBuffers();
+
+        TStackVec<TIoVec, NumPreallocatedBuffers> buffs;
+        for (const auto& item : Buffers) {
+            TIoVec iov{item->GetBuffer(), item->GetCapacity()};
+            buffs.push_back(iov);
+            if (Params.Encryption) {
+                break; // do not put more than one buffer in queue to prevent using ReadV
+            }
+        }
+
+        const struct iovec* iovec = reinterpret_cast<const struct iovec*>(buffs.data());
+        int iovcnt = buffs.size();
+
+        ssize_t recvres = 0;
+        TString err;
+        LWPROBE_IF_TOO_LONG(SlowICReadFromSocket, ms) {
+            do {
+#ifndef _win_
+                recvres = iovcnt == 1 ? Socket->Recv(iovec->iov_base, iovec->iov_len, &err) : Socket->ReadV(iovec, iovcnt);
+#else
+                recvres = Socket->Recv(iovec[0].iov_base, iovec[0].iov_len, &err);
+#endif
+                Metrics->IncRecvSyscalls();
+            } while (recvres == -EINTR); // Recv/ReadV return -errno on failure; retry interrupted calls
+        }
+
+        LOG_DEBUG_IC_SESSION("ICIS12", "ReadMore recvres# %zd iovcnt# %d err# %s", recvres, iovcnt, err.data());
+
+        if (recvres <= 0 || CloseInputSessionRequested) {
+            if ((-recvres != EAGAIN && -recvres != EWOULDBLOCK) || CloseInputSessionRequested) {
+                // hard failure, peer closed the stream, or debug-requested closure
+                TString message = CloseInputSessionRequested ? "connection closed by debug command"
+                    : recvres == 0 ? "connection closed by peer"
+                    : err ? err
+                    : Sprintf("socket: %s", strerror(-recvres));
+                LOG_NOTICE_NET(NodeId, "%s", message.data());
+                ReestablishConnection(CloseInputSessionRequested ? TDisconnectReason::Debug() :
+                    recvres == 0 ? TDisconnectReason::EndOfStream() : TDisconnectReason::FromErrno(-recvres));
+            } else if (PollerToken && !std::exchange(Context->ReadPending, true)) {
+                // would-block: request poller notification; for TLS ask for the
+                // direction(s) the encryption layer actually needs
+                if (Params.Encryption) {
+                    auto *secure = static_cast<NInterconnect::TSecureSocket*>(Socket.Get());
+                    const bool wantRead = secure->WantRead(), wantWrite = secure->WantWrite();
+                    Y_VERIFY_DEBUG(wantRead || wantWrite);
+                    PollerToken->Request(wantRead, wantWrite);
+                } else {
+                    PollerToken->Request(true, false);
+                }
+            }
+            return false;
+        }
+
+        Y_VERIFY(recvres > 0);
+        Metrics->AddTotalBytesRead(recvres);
+        // distribute the received bytes over the preallocated buffers in order,
+        // moving each filled buffer into the IncomingData rope
+        TDeque<TIntrusivePtr<TRopeAlignedBuffer>>::iterator it;
+        for (it = Buffers.begin(); recvres; ++it) {
+            Y_VERIFY(it != Buffers.end());
+            const size_t bytesFromFrontBuffer = Min<size_t>(recvres, (*it)->GetCapacity());
+            (*it)->AdjustSize(bytesFromFrontBuffer);
+            IncomingData.Insert(IncomingData.End(), TRope(std::move(*it)));
+            recvres -= bytesFromFrontBuffer;
+        }
+        Buffers.erase(Buffers.begin(), it);
+
+        LastReceiveTimestamp = TActivationContext::Now();
+
+        return true;
+    }
+
+    // Keep the receive queue topped up with fixed-size aligned packet buffers.
+    // Encrypted sessions use exactly one buffer, since ReadMore() never issues ReadV
+    // against a TLS socket.
+    void TInputSessionTCP::PreallocateBuffers() {
+        LWPROBE_IF_TOO_LONG(SlowICReadLoopAdjustSize, ms) {
+            const ui32 desired = Params.Encryption ? 1 : NumPreallocatedBuffers;
+            for (size_t have = Buffers.size(); have < desired; ++have) {
+                Buffers.emplace_back(TRopeAlignedBuffer::Allocate(sizeof(TTcpPacketBuf)));
+            }
+        }
+    }
+
+    // Recoverable disconnect (e.g. checksum mismatch, socket error): notify the session
+    // actor so it can restart the handshake from the last confirmed position, then die.
+    void TInputSessionTCP::ReestablishConnection(TDisconnectReason reason) {
+        LOG_DEBUG_IC_SESSION("ICIS09", "ReestablishConnection, reason# %s", reason.ToString().data());
+        AtomicDecrement(Context->NumInputSessions); // release the single-session slot (see ctor)
+        Send(SessionId, new TEvSocketDisconnect(std::move(reason)));
+        PassAway();
+        Socket.Reset();
+    }
+
+    // Fatal error (format violation, superseded session, dead peer): terminate the
+    // whole session rather than attempting reconnection.
+    void TInputSessionTCP::DestroySession(TDisconnectReason reason) {
+        LOG_DEBUG_IC_SESSION("ICIS13", "DestroySession, reason# %s", reason.ToString().data());
+        AtomicDecrement(Context->NumInputSessions); // release the single-session slot (see ctor)
+        Send(SessionId, TInterconnectSessionTCP::NewEvTerminate(std::move(reason)));
+        PassAway();
+        Socket.Reset();
+    }
+
+    // Periodic dead-peer watchdog: if nothing has been received for DeadPeerTimeout,
+    // give the socket one last chance (ReceiveData refreshes LastReceiveTimestamp on
+    // success) and terminate the session otherwise.
+    void TInputSessionTCP::HandleCheckDeadPeer() {
+        const TInstant now = TActivationContext::Now();
+        if (now >= LastReceiveTimestamp + DeadPeerTimeout) {
+            ReceiveData();
+            if (Socket && now >= LastReceiveTimestamp + DeadPeerTimeout) {
+                // nothing has changed, terminate session
+                DestroySession(TDisconnectReason::DeadPeer());
+            }
+        }
+        // NOTE(review): Schedule() is reached even after DestroySession()/PassAway()
+        // above; relies on scheduled events to a dead actor being dropped -- confirm.
+        Schedule(LastReceiveTimestamp + DeadPeerTimeout - now, new TEvCheckDeadPeer);
+    }
+
+    // New-protocol ping sample: keep a sliding window of the last 16 round-trip
+    // measurements and publish the minimum as the session RTT.
+    void TInputSessionTCP::HandlePingResponse(TDuration passed) {
+        PingQ.push_back(passed);
+        while (PingQ.size() > 16) {
+            PingQ.pop_front();
+        }
+        const TDuration best = *std::min_element(PingQ.begin(), PingQ.end());
+        const auto bestUs = best.MicroSeconds();
+        Context->PingRTT_us = bestUs;
+        NewPingProtocol = true; // suppress the legacy ping path in ProcessHeader
+        Metrics->UpdateLegacyPingTimeHist(bestUs);
+    }
+
+    // Clock sample from the peer: estimate skew by shifting the received wall-clock
+    // value forward by half the measured RTT, then publish the smallest-magnitude
+    // sample from the last 16 (robust against transient network jitter).
+    void TInputSessionTCP::HandleClock(TInstant clock) {
+        const TInstant localNow = TInstant::Now(); // wall clock on this side
+        const TInstant peerNow = clock + TDuration::MicroSeconds(Context->PingRTT_us / 2);
+        SkewQ.push_back(peerNow.MicroSeconds() - localNow.MicroSeconds());
+        if (SkewQ.size() > 16) {
+            SkewQ.pop_front();
+        }
+        i64 bestSkew = SkewQ.front();
+        for (i64 candidate : SkewQ) {
+            if (abs(candidate) < abs(bestSkew)) {
+                bestSkew = candidate;
+            }
+        }
+        Context->ClockSkew_us = bestSkew;
+        Metrics->SetClockSkewMicrosec(bestSkew);
+    }
+
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_tcp_proxy.cpp b/library/cpp/actors/interconnect/interconnect_tcp_proxy.cpp
new file mode 100644
index 0000000000..7e2d8ccb94
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_tcp_proxy.cpp
@@ -0,0 +1,936 @@
+#include "interconnect_tcp_proxy.h"
+#include "interconnect_handshake.h"
+#include "interconnect_tcp_session.h"
+#include <library/cpp/actors/core/log.h>
+#include <library/cpp/actors/protos/services_common.pb.h>
+#include <library/cpp/monlib/service/pages/templates.h>
+#include <util/system/getpid.h>
+
+namespace NActors {
+    // How long to wait for the nameservice to answer TEvGetNode before erroring out.
+    static constexpr TDuration GetNodeRequestTimeout = TDuration::Seconds(5);
+
+    // Exponential backoff for the error-hold state: start at 10ms, multiply by 4 each
+    // consecutive failure, cap at 10s.
+    static constexpr TDuration FirstErrorSleep = TDuration::MilliSeconds(10);
+    static constexpr TDuration MaxErrorSleep = TDuration::Seconds(10);
+    static constexpr ui32 SleepRetryMultiplier = 4;
+
+    // Human-readable peer label "<node>:<short-host>:<port>", where short-host is the
+    // first dot-separated component of longName (or the full longName when it has no
+    // leading component).
+    static TString PeerNameForHuman(ui32 nodeNum, const TString& longName, ui16 port) {
+        TStringBuf firstLabel;
+        TStringBuf(longName).NextTok('.', firstLabel);
+        const TString host = firstLabel.size() > 0 ? TString(firstLabel) : longName;
+        return ToString<ui32>(nodeNum) + ":" + host + ":" + ToString<ui16>(port);
+    }
+
+    // Proxy actor for a single peer node: owns handshake lifecycle and the session
+    // actor. When created dynamically, 'dynamicPtr' lets the creator obtain a raw
+    // pointer to this actor (bootstrap is then driven by the creator, see Registered()).
+    TInterconnectProxyTCP::TInterconnectProxyTCP(const ui32 node, TInterconnectProxyCommon::TPtr common,
+            IActor **dynamicPtr)
+        : TActor(&TThis::StateInit)
+        , PeerNodeId(node)
+        , DynamicPtr(dynamicPtr)
+        , Common(std::move(common))
+        , SecureContext(new NInterconnect::TSecureSocketContext(Common->Settings.Certificate, Common->Settings.PrivateKey,
+            Common->Settings.CaFilePath, Common->Settings.CipherList))
+    {
+        Y_VERIFY(Common);
+        Y_VERIFY(Common->NameserviceId);
+        if (DynamicPtr) {
+            Y_VERIFY(!*DynamicPtr);
+            *DynamicPtr = this;
+        }
+    }
+
+    // Actor bootstrap: enter the idle initial state and arm the pass-away deadline
+    // used by dynamic proxies to self-destruct when unused.
+    void TInterconnectProxyTCP::Bootstrap() {
+        SetPrefix(Sprintf("Proxy %s [node %" PRIu32 "]", SelfId().ToString().data(), PeerNodeId));
+
+        SwitchToInitialState();
+        PassAwayTimestamp = TActivationContext::Now() + TDuration::Seconds(15);
+
+        LOG_INFO_IC("ICP01", "ready to work");
+    }
+
+    // Registration hook: static proxies self-bootstrap here; dynamic proxies are
+    // bootstrapped externally via DynamicPtr. Also registers the monitoring page.
+    void TInterconnectProxyTCP::Registered(TActorSystem* sys, const TActorId& owner) {
+        if (!DynamicPtr) {
+            // perform usual bootstrap for static nodes
+            sys->Send(new IEventHandle(TEvents::TSystem::Bootstrap, 0, SelfId(), owner, nullptr, 0));
+        }
+        if (const auto& mon = Common->RegisterMonPage) {
+            TString path = Sprintf("peer%04" PRIu32, PeerNodeId);
+            TString title = Sprintf("Peer #%04" PRIu32, PeerNodeId);
+            mon(path, title, sys, SelfId());
+        }
+    }
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // PendingActivation
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // First session event arrived while idle: queue it and resolve the peer's address
+    // via the nameservice before starting a handshake.
+    void TInterconnectProxyTCP::RequestNodeInfo(STATEFN_SIG) {
+        ICPROXY_PROFILED;
+
+        Y_VERIFY(!IncomingHandshakeActor && !OutgoingHandshakeActor && !PendingIncomingHandshakeEvents && !PendingSessionEvents);
+        EnqueueSessionEvent(ev);
+        StartConfiguring();
+    }
+
+    // Incoming handshake arrived while idle: queue it and resolve the peer's address
+    // first (ignored entirely when the proxy is already terminated).
+    void TInterconnectProxyTCP::RequestNodeInfoForIncomingHandshake(STATEFN_SIG) {
+        ICPROXY_PROFILED;
+
+        if (!Terminated) {
+            Y_VERIFY(!IncomingHandshakeActor && !OutgoingHandshakeActor && !PendingIncomingHandshakeEvents && !PendingSessionEvents);
+            EnqueueIncomingHandshakeEvent(ev);
+            StartConfiguring();
+        }
+    }
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // PendingNodeInfo
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Ask the nameservice for the peer node's address and switch to PendingNodeInfo
+    // with a timeout guard.
+    void TInterconnectProxyTCP::StartConfiguring() {
+        ICPROXY_PROFILED;
+
+        Y_VERIFY(!IncomingHandshakeActor && !OutgoingHandshakeActor);
+
+        // issue node info request
+        Send(Common->NameserviceId, new TEvInterconnect::TEvGetNode(PeerNodeId));
+
+        // arm configure timer; store pointer to event to ensure that we will handle correct one if there were any other
+        // wakeup events in flight
+        SwitchToState(__LINE__, "PendingNodeInfo", &TThis::PendingNodeInfo, GetNodeRequestTimeout,
+            ConfigureTimeoutCookie = new TEvents::TEvWakeup);
+    }
+
+    // Nameservice reply: on success, record the peer host name, set up metrics and
+    // proceed with queued work; on failure, enter the error-hold state.
+    void TInterconnectProxyTCP::Configure(TEvInterconnect::TEvNodeInfo::TPtr& ev) {
+        ICPROXY_PROFILED;
+
+        Y_VERIFY(!IncomingHandshakeActor && !OutgoingHandshakeActor && !Session);
+
+        if (!ev->Get()->Node) {
+            TransitToErrorState("cannot get node info");
+        } else {
+            auto& info = *ev->Get()->Node;
+            TString name = PeerNameForHuman(PeerNodeId, info.Host, info.Port);
+            TechnicalPeerHostName = info.Host;
+            // metrics backend is selected once: monlib metrics vs. legacy counters
+            if (!Metrics) {
+                Metrics = Common->Metrics ? CreateInterconnectMetrics(Common) : CreateInterconnectCounters(Common);
+            }
+            Metrics->SetPeerInfo(name, info.Location.GetDataCenterId());
+
+            LOG_DEBUG_IC("ICP02", "configured for host %s", name.data());
+
+            ProcessConfigured();
+        }
+    }
+
+    // Nameservice request timed out. The cookie comparison guards against stale wakeup
+    // events from previous configure attempts.
+    void TInterconnectProxyTCP::ConfigureTimeout(TEvents::TEvWakeup::TPtr& ev) {
+        ICPROXY_PROFILED;
+
+        if (ev->Get() == ConfigureTimeoutCookie) {
+            TransitToErrorState("timed out while waiting for node info");
+        }
+    }
+
+    // Node info is now known: kick off an outgoing handshake if session events are
+    // queued, replay any queued incoming handshake requests, and fall back to the
+    // initial state when nothing came out of either.
+    void TInterconnectProxyTCP::ProcessConfigured() {
+        ICPROXY_PROFILED;
+
+        // if the request was initiated by some activity involving Interconnect, then we are expected to start handshake
+        if (PendingSessionEvents) {
+            StartInitialHandshake();
+        }
+
+        // process incoming handshake requests; all failures were ejected from the queue along with the matching initiation requests
+        for (THolder<IEventHandle>& ev : PendingIncomingHandshakeEvents) {
+            TAutoPtr<IEventHandle> x(ev.Release());
+            IncomingHandshake(x);
+        }
+        PendingIncomingHandshakeEvents.clear();
+
+        // possible situation -- incoming handshake arrives, but actually it is not satisfied and rejected; in this case
+        // we are going to return to initial state as we have nothing to do
+        if (!IncomingHandshakeActor && !OutgoingHandshakeActor) {
+            SwitchToInitialState();
+        }
+    }
+
+    // Begin a brand-new session handshake (empty peer virtual id, zero input counter),
+    // dropping any handshakes currently in flight.
+    void TInterconnectProxyTCP::StartInitialHandshake() {
+        ICPROXY_PROFILED;
+
+        // since we are starting initial handshake for some reason, we'll drop any existing handshakes, if any
+        DropHandshakes();
+
+        // create and register handshake actor
+        OutgoingHandshakeActor = Register(CreateOutgoingHandshakeActor(Common, GenerateSessionVirtualId(),
+            TActorId(), PeerNodeId, 0, TechnicalPeerHostName, TSessionParams()), TMailboxType::ReadAsFilled);
+        OutgoingHandshakeActorCreated = TActivationContext::Now();
+
+        // prepare for new handshake
+        PrepareNewSessionHandshake();
+    }
+
+    // Begin a continuation handshake for an existing session, resuming transmission
+    // from 'inputCounter' (the last packet serial confirmed by our input side).
+    void TInterconnectProxyTCP::StartResumeHandshake(ui64 inputCounter) {
+        ICPROXY_PROFILED;
+
+        // drop outgoing handshake if we have one; keep incoming handshakes as they may be useful
+        DropOutgoingHandshake();
+
+        // ensure that we have session
+        Y_VERIFY(Session);
+
+        // ensure that we have both virtual ids
+        Y_VERIFY(SessionVirtualId);
+        Y_VERIFY(RemoteSessionVirtualId);
+
+        // create and register handshake actor
+        OutgoingHandshakeActor = Register(CreateOutgoingHandshakeActor(Common, SessionVirtualId,
+            RemoteSessionVirtualId, PeerNodeId, inputCounter, TechnicalPeerHostName, Session->Params),
+            TMailboxType::ReadAsFilled);
+        OutgoingHandshakeActorCreated = TActivationContext::Now();
+    }
+
+    // Record the incoming handshake actor and either send the reply immediately or,
+    // when an outgoing handshake races with it and we are the lower-numbered node,
+    // hold the reply until the outgoing handshake completes or fails.
+    void TInterconnectProxyTCP::IssueIncomingHandshakeReply(const TActorId& handshakeId, ui64 peerLocalId,
+            THolder<IEventBase> event) {
+        ICPROXY_PROFILED;
+
+        Y_VERIFY(!IncomingHandshakeActor);
+        IncomingHandshakeActor = handshakeId;
+        IncomingHandshakeActorFilledIn = TActivationContext::Now();
+        Y_VERIFY(!LastSerialFromIncomingHandshake || *LastSerialFromIncomingHandshake <= peerLocalId);
+        LastSerialFromIncomingHandshake = peerLocalId;
+
+        if (OutgoingHandshakeActor && SelfId().NodeId() < PeerNodeId) {
+            // Both outgoing and incoming handshake are in progress. To prevent race condition during simultaneous handshake
+            // incoming handshake must be held till outgoing handshake is complete or failed
+            LOG_DEBUG_IC("ICP06", "reply for incoming handshake (actor %s) is held", IncomingHandshakeActor.ToString().data());
+            HeldHandshakeReply = std::move(event);
+
+            // Check that we are in one of acceptable states that would properly handle handshake statuses.
+            const auto state = CurrentStateFunc();
+            Y_VERIFY(state == &TThis::PendingConnection || state == &TThis::StateWork, "invalid handshake request in state# %s", State);
+        } else {
+            LOG_DEBUG_IC("ICP07", "issued incoming handshake reply");
+
+            // No race, so we can send reply immediately.
+            Y_VERIFY(!HeldHandshakeReply);
+            Send(IncomingHandshakeActor, event.Release());
+
+            // Start waiting for handshake reply, if not yet started; also, if session is already created, then we don't
+            // switch from working state.
+            if (!Session) {
+                LOG_INFO_IC("ICP08", "No active sessions, becoming PendingConnection");
+                SwitchToState(__LINE__, "PendingConnection", &TThis::PendingConnection);
+            } else {
+                Y_VERIFY(CurrentStateFunc() == &TThis::StateWork);
+            }
+        }
+    }
+
+    // Continuation handshake request (TEvHandshakeAsk): valid only when an open session
+    // exists and both virtual ids match; otherwise the peer is NAK'ed.
+    void TInterconnectProxyTCP::IncomingHandshake(TEvHandshakeAsk::TPtr& ev) {
+        ICPROXY_PROFILED;
+
+        TEvHandshakeAsk *msg = ev->Get();
+
+        // TEvHandshakeAsk is only applicable for continuation requests
+        LOG_DEBUG_IC("ICP09", "(actor %s) from: %s for: %s", ev->Sender.ToString().data(),
+            ev->Get()->Self.ToString().data(), ev->Get()->Peer.ToString().data());
+
+        if (!Session) {
+            // if there is no open session, report error -- continuation request works only with open sessions
+            LOG_NOTICE_IC("ICP12", "(actor %s) peer tries to resume nonexistent session Self# %s Peer# %s",
+                ev->Sender.ToString().data(), msg->Self.ToString().data(), msg->Peer.ToString().data());
+        } else if (SessionVirtualId != ev->Get()->Peer || RemoteSessionVirtualId != ev->Get()->Self) {
+            // check session virtual ids for continuation
+            LOG_NOTICE_IC("ICP13", "(actor %s) virtual id mismatch with existing session (Peer: %s Self: %s"
+                " SessionVirtualId: %s RemoteSessionVirtualId: %s)", ev->Sender.ToString().data(),
+                ev->Get()->Peer.ToString().data(), ev->Get()->Self.ToString().data(), SessionVirtualId.ToString().data(),
+                RemoteSessionVirtualId.ToString().data());
+        } else {
+            // if we already have incoming handshake, then terminate existing one
+            DropIncomingHandshake();
+
+            // issue reply to the sender, possibly holding it while outgoing handshake is at race
+            THolder<IEventBase> reply = IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::ProcessHandshakeRequest, ev);
+            return IssueIncomingHandshakeReply(ev->Sender, RemoteSessionVirtualId.LocalId(), std::move(reply));
+        }
+
+        // error case -- report error to the handshake actor
+        Send(ev->Sender, new TEvHandshakeNak);
+    }
+
+    // Initial handshake request (TEvHandshakeRequest): deduplicate by peer program
+    // identity (PID + start time + serial), validate the peer host name, and reply
+    // with a freshly generated session virtual id on success.
+    void TInterconnectProxyTCP::IncomingHandshake(TEvHandshakeRequest::TPtr& ev) {
+        ICPROXY_PROFILED;
+
+        LOG_DEBUG_IC("ICP17", "incoming handshake (actor %s)", ev->Sender.ToString().data());
+
+        const auto& record = ev->Get()->Record;
+        ui64 remotePID = record.GetProgramPID();
+        ui64 remoteStartTime = record.GetProgramStartTime();
+        ui64 remoteSerial = record.GetSerial();
+
+        // same peer program instance: reject handshakes older than the newest seen serial
+        if (RemoteProgramInfo && remotePID == RemoteProgramInfo->PID && remoteStartTime == RemoteProgramInfo->StartTime) {
+            if (remoteSerial < RemoteProgramInfo->Serial) {
+                LOG_INFO_IC("ICP18", "handshake (actor %s) is too old", ev->Sender.ToString().data());
+                Send(ev->Sender, new TEvents::TEvPoisonPill);
+                return;
+            } else {
+                RemoteProgramInfo->Serial = remoteSerial;
+            }
+        } else {
+            // new (or restarted) peer program instance -- record its identity
+            const auto ptr = new TProgramInfo;
+            ptr->PID = remotePID;
+            ptr->StartTime = remoteStartTime;
+            ptr->Serial = remoteSerial;
+            RemoteProgramInfo.Reset(ptr);
+        }
+
+        /* Let's check peer technical hostname */
+        if (record.HasSenderHostName() && TechnicalPeerHostName != record.GetSenderHostName()) {
+            Send(ev->Sender, new TEvHandshakeReplyError("host name mismatch"));
+            return;
+        }
+
+        // check sender actor id and check if it is not very old
+        if (LastSerialFromIncomingHandshake) {
+            const ui64 serial = record.GetSerial();
+            if (serial < *LastSerialFromIncomingHandshake) {
+                LOG_NOTICE_IC("ICP15", "Handshake# %s has duplicate serial# %" PRIu64
+                    " LastSerialFromIncomingHandshake# %" PRIu64, ev->Sender.ToString().data(),
+                    serial, *LastSerialFromIncomingHandshake);
+                Send(ev->Sender, new TEvHandshakeReplyError("duplicate serial"));
+                return;
+            } else if (serial == *LastSerialFromIncomingHandshake) {
+                LOG_NOTICE_IC("ICP15", "Handshake# %s is obsolete, serial# %" PRIu64
+                    " LastSerialFromIncomingHandshake# %" PRIu64, ev->Sender.ToString().data(),
+                    serial, *LastSerialFromIncomingHandshake);
+                Send(ev->Sender, new TEvents::TEvPoisonPill);
+                return;
+            }
+        }
+
+        // drop incoming handshake as this is definitely more recent
+        DropIncomingHandshake();
+
+        // prepare for new session
+        PrepareNewSessionHandshake();
+
+        // success reply: advertise protocol version, our new session virtual id and
+        // our own program identity for the peer's symmetric dedup logic
+        auto event = MakeHolder<TEvHandshakeReplyOK>();
+        auto* pb = event->Record.MutableSuccess();
+        const TActorId virtualId = GenerateSessionVirtualId();
+        pb->SetProtocol(INTERCONNECT_PROTOCOL_VERSION);
+        pb->SetSenderActorId(virtualId.ToString());
+        pb->SetProgramPID(GetPID());
+        pb->SetProgramStartTime(Common->StartTime);
+        pb->SetSerial(virtualId.LocalId());
+
+        IssueIncomingHandshakeReply(ev->Sender, 0, std::move(event));
+    }
+
+    // A handshake (incoming or outgoing) succeeded: tear down the competing handshake,
+    // create the session actor if this was an initial handshake, hand the new
+    // connection to the session, and flush queued session events.
+    void TInterconnectProxyTCP::HandleHandshakeStatus(TEvHandshakeDone::TPtr& ev) {
+        ICPROXY_PROFILED;
+
+        TEvHandshakeDone *msg = ev->Get();
+
+        // Terminate handshake actor working in opposite direction, if set up.
+        if (ev->Sender == IncomingHandshakeActor) {
+            LOG_INFO_IC("ICP19", "incoming handshake succeeded");
+            DropIncomingHandshake(false);
+            DropOutgoingHandshake();
+        } else if (ev->Sender == OutgoingHandshakeActor) {
+            LOG_INFO_IC("ICP20", "outgoing handshake succeeded");
+            DropIncomingHandshake();
+            DropOutgoingHandshake(false);
+        } else {
+            /* It seems to be an old handshake. */
+            return;
+        }
+
+        Y_VERIFY(!IncomingHandshakeActor && !OutgoingHandshakeActor);
+        SwitchToState(__LINE__, "StateWork", &TThis::StateWork);
+
+        if (Session) {
+            // this is continuation request, check that virtual ids match
+            Y_VERIFY(SessionVirtualId == msg->Self && RemoteSessionVirtualId == msg->Peer);
+        } else {
+            // this is initial request, check that we have virtual ids not filled in
+            Y_VERIFY(!SessionVirtualId && !RemoteSessionVirtualId);
+        }
+
+        auto error = [&](const char* description) {
+            TransitToErrorState(description);
+        };
+
+        // If session is not created, then create new one.
+        if (!Session) {
+            RemoteProgramInfo = std::move(msg->ProgramInfo);
+            if (!RemoteProgramInfo) {
+                // we have received resume handshake, but session was closed concurrently while handshaking
+                return error("Session continuation race");
+            }
+
+            // Create new session actor.
+            SessionID = RegisterWithSameMailbox(Session = new TInterconnectSessionTCP(this, msg->Params));
+            IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::Init);
+            SessionVirtualId = msg->Self;
+            RemoteSessionVirtualId = msg->Peer;
+            LOG_INFO_IC("ICP22", "created new session: %s", SessionID.ToString().data());
+        }
+
+        // ensure that we have session local/peer virtual ids
+        Y_VERIFY(Session && SessionVirtualId && RemoteSessionVirtualId);
+
+        // Set up new connection for the session.
+        IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::SetNewConnection, ev);
+
+        // Reset retry timer
+        HoldByErrorWakeupDuration = TDuration::Zero();
+
+        /* Forward all held events */
+        ProcessPendingSessionEvents();
+    }
+
+ // Handle a failure report from a handshake actor. Reports from actors we no
+ // longer track are obsolete and ignored; a report is inconclusive while the
+ // other handshake is still in flight. Once no handshake remains running, the
+ // failure kind decides the next step: transient -> retry/reestablish,
+ // session mismatch -> fresh initial handshake, permanent -> HoldByError.
+ void TInterconnectProxyTCP::HandleHandshakeStatus(TEvHandshakeFail::TPtr& ev) {
+ ICPROXY_PROFILED;
+
+ // update error state log; this fail is inconclusive unless this is the last pending handshake
+ const bool inconclusive = (ev->Sender != IncomingHandshakeActor && ev->Sender != OutgoingHandshakeActor) ||
+ (IncomingHandshakeActor && OutgoingHandshakeActor);
+ LogHandshakeFail(ev, inconclusive);
+
+ if (ev->Sender == IncomingHandshakeActor) {
+ LOG_NOTICE_IC("ICP24", "incoming handshake failed, temporary: %" PRIu32 " explanation: %s outgoing: %s",
+ ui32(ev->Get()->Temporary), ev->Get()->Explanation.data(), OutgoingHandshakeActor.ToString().data());
+ DropIncomingHandshake(false);
+ } else if (ev->Sender == OutgoingHandshakeActor) {
+ LOG_NOTICE_IC("ICP25", "outgoing handshake failed, temporary: %" PRIu32 " explanation: %s incoming: %s held: %s",
+ ui32(ev->Get()->Temporary), ev->Get()->Explanation.data(), IncomingHandshakeActor.ToString().data(),
+ HeldHandshakeReply ? "yes" : "no");
+ DropOutgoingHandshake(false);
+
+ // release the reply we held back for the incoming handshake (if any) now
+ // that the competing outgoing handshake has failed
+ if (IEventBase* reply = HeldHandshakeReply.Release()) {
+ Y_VERIFY(IncomingHandshakeActor);
+ LOG_DEBUG_IC("ICP26", "sent held handshake reply to %s", IncomingHandshakeActor.ToString().data());
+ Send(IncomingHandshakeActor, reply);
+ }
+
+ // if we have no current session, then we have to drop all pending events as the outgoing handshake has failed
+ ProcessPendingSessionEvents();
+ } else {
+ /* It seems to be an old fail, just ignore it */
+ LOG_NOTICE_IC("ICP27", "obsolete handshake fail ignored");
+ return;
+ }
+
+ if (Metrics) {
+ Metrics->IncHandshakeFails();
+ }
+
+ if (IncomingHandshakeActor || OutgoingHandshakeActor) {
+ // one of handshakes is still going on
+ LOG_DEBUG_IC("ICP28", "other handshake is still going on");
+ return;
+ }
+
+ // both handshakes are gone now -- the failure is conclusive, act on its kind
+ switch (ev->Get()->Temporary) {
+ case TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT:
+ if (!Session) {
+ if (PendingSessionEvents) {
+ // try to start outgoing handshake as we have some events enqueued
+ StartInitialHandshake();
+ } else {
+ // return back to initial state as we have no session and no pending handshakes
+ SwitchToInitialState();
+ }
+ } else if (Session->Socket) {
+ // try to reestablish connection -- meaning restart handshake from the last known position
+ IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::ReestablishConnectionWithHandshake,
+ TDisconnectReason::HandshakeFailTransient());
+ } else {
+ // we have no active connection in that session, so just restart handshake from last known position
+ IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::StartHandshake);
+ }
+ break;
+
+ case TEvHandshakeFail::HANDSHAKE_FAIL_SESSION_MISMATCH:
+ StartInitialHandshake();
+ break;
+
+ case TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT:
+ TString timeExplanation = " LastSessionDieTime# " + LastSessionDieTime.ToString();
+ if (Session) {
+ InvokeOtherActor(*Session, &TInterconnectSessionTCP::Terminate,
+ TDisconnectReason::HandshakeFailPermanent());
+ }
+ TransitToErrorState(ev->Get()->Explanation + timeExplanation, false);
+ break;
+ }
+ }
+
+ // Record a handshake failure in the error-state log, classifying the entry by
+ // failure kind; 'inconclusive' marks reports that do not (yet) determine the
+ // fate of the proxy (e.g. the other handshake is still running).
+ void TInterconnectProxyTCP::LogHandshakeFail(TEvHandshakeFail::TPtr& ev, bool inconclusive) {
+ ICPROXY_PROFILED;
+
+ TString kind = "unknown";
+ switch (ev->Get()->Temporary) {
+ case TEvHandshakeFail::HANDSHAKE_FAIL_TRANSIENT:
+ kind = Session ? "transient w/session" : "transient w/o session";
+ break;
+
+ case TEvHandshakeFail::HANDSHAKE_FAIL_SESSION_MISMATCH:
+ kind = "session_mismatch";
+ break;
+
+ case TEvHandshakeFail::HANDSHAKE_FAIL_PERMANENT:
+ kind = "permanent";
+ break;
+ }
+ if (inconclusive) {
+ kind += " inconclusive";
+ }
+ UpdateErrorStateLog(TActivationContext::Now(), kind, ev->Get()->Explanation);
+ }
+
+ // Drain the pending-session-event queue: each queued event is either
+ // forwarded to the established session or dropped with the appropriate
+ // notification, depending on whether a session currently exists.
+ void TInterconnectProxyTCP::ProcessPendingSessionEvents() {
+ ICPROXY_PROFILED;
+
+ while (PendingSessionEvents) {
+ TPendingSessionEvent ev = std::move(PendingSessionEvents.front());
+ PendingSessionEventsSize -= ev.Size;
+ TAutoPtr<IEventHandle> event(ev.Event.Release());
+ PendingSessionEvents.pop_front();
+
+ if (Session) {
+ ForwardSessionEventToSession(event);
+ } else {
+ DropSessionEvent(event);
+ }
+ }
+ }
+
+ // Drop a single held session event, notifying interested parties: forwarded
+ // events get a nondelivery notification (plus TEvNodeDisconnected for session
+ // subscribers), connect/subscribe requests get TEvNodeDisconnected, and
+ // unsubscribe requests are silently discarded. Any other type is a bug.
+ void TInterconnectProxyTCP::DropSessionEvent(STATEFN_SIG) {
+ ICPROXY_PROFILED;
+
+ ValidateEvent(ev, "DropSessionEvent");
+ switch (ev->GetTypeRewrite()) {
+ case TEvInterconnect::EvForward:
+ if (ev->Flags & IEventHandle::FlagSubscribeOnSession) {
+ Send(ev->Sender, new TEvInterconnect::TEvNodeDisconnected(PeerNodeId), 0, ev->Cookie);
+ }
+ TActivationContext::Send(ev->ForwardOnNondelivery(TEvents::TEvUndelivered::Disconnected));
+ break;
+
+ case TEvInterconnect::TEvConnectNode::EventType:
+ case TEvents::TEvSubscribe::EventType:
+ Send(ev->Sender, new TEvInterconnect::TEvNodeDisconnected(PeerNodeId), 0, ev->Cookie);
+ break;
+
+ case TEvents::TEvUnsubscribe::EventType:
+ /* Do nothing */
+ break;
+
+ default:
+ Y_FAIL("Unexpected type of event in held event queue");
+ }
+ }
+
+ // Detach the given (current) session from the proxy: clear session pointers
+ // and virtual ids, drop all pending events, bump death metrics/timestamp,
+ // then either continue with a pending handshake or return to initial state.
+ void TInterconnectProxyTCP::UnregisterSession(TInterconnectSessionTCP* session) {
+ ICPROXY_PROFILED;
+
+ Y_VERIFY(Session && Session == session && SessionID);
+
+ LOG_INFO_IC("ICP30", "unregister session Session# %s VirtualId# %s", SessionID.ToString().data(),
+ SessionVirtualId.ToString().data());
+
+ Session = nullptr;
+ SessionID = TActorId();
+
+ // drop all pending events as we are closed
+ ProcessPendingSessionEvents();
+
+ // reset virtual ids as this session is terminated
+ SessionVirtualId = TActorId();
+ RemoteSessionVirtualId = TActorId();
+
+ if (Metrics) {
+ Metrics->IncSessionDeaths();
+ }
+ LastSessionDieTime = TActivationContext::Now();
+
+ if (IncomingHandshakeActor || OutgoingHandshakeActor) {
+ PrepareNewSessionHandshake();
+ } else {
+ SwitchToInitialState();
+ }
+ }
+
+ // Hold a session event until a connection is established. The event gets a
+ // deadline of now + MessagePendingTimeout; the cleanup timer is armed for
+ // deadline-based expiry and CleanupEventQueue() is also invoked right away
+ // to enforce the MessagePendingSize budget.
+ void TInterconnectProxyTCP::EnqueueSessionEvent(STATEFN_SIG) {
+ ICPROXY_PROFILED;
+
+ ValidateEvent(ev, "EnqueueSessionEvent");
+ const ui32 size = ev->GetSize();
+ PendingSessionEventsSize += size;
+ PendingSessionEvents.emplace_back(TActivationContext::Now() + Common->Settings.MessagePendingTimeout, size, ev);
+ ScheduleCleanupEventQueue();
+ CleanupEventQueue();
+ }
+
+ void TInterconnectProxyTCP::EnqueueIncomingHandshakeEvent(STATEFN_SIG) {
+ ICPROXY_PROFILED;
+
+ // enqueue handshake request; it will be processed once node info arrives
+ // (removed stray `Y_UNUSED();` -- a no-op invoked with zero arguments)
+ PendingIncomingHandshakeEvents.emplace_back(ev);
+ }
+
+ // Intentionally empty: TEvHandshakeDone can't get into the queue, because we
+ // have to process the handshake request first; this may be a race with the
+ // previous handshakes, so simply ignore it.
+ void TInterconnectProxyTCP::EnqueueIncomingHandshakeEvent(TEvHandshakeDone::TPtr& /*ev*/) {
+ ICPROXY_PROFILED;
+ }
+
+ // A failure from a handshake whose request is still queued cancels that
+ // queued request: find the pending event from the same sender and remove it.
+ void TInterconnectProxyTCP::EnqueueIncomingHandshakeEvent(TEvHandshakeFail::TPtr& ev) {
+ ICPROXY_PROFILED;
+
+ for (auto it = PendingIncomingHandshakeEvents.begin(); it != PendingIncomingHandshakeEvents.end(); ++it) {
+ THolder<IEventHandle>& pendingEvent = *it;
+ if (pendingEvent->Sender == ev->Sender) {
+ // we have found cancellation request for the pending handshake request; so simply remove it from the
+ // deque, as we are not interested in failure reason; most likely it happens because of handshake timeout
+ if (pendingEvent->GetTypeRewrite() == TEvHandshakeFail::EventType) {
+ TEvHandshakeFail::TPtr tmp(static_cast<TEventHandle<TEvHandshakeFail>*>(pendingEvent.Release()));
+ LogHandshakeFail(tmp, true);
+ }
+ PendingIncomingHandshakeEvents.erase(it);
+ break;
+ }
+ }
+ }
+
+ // Deliver a session-bound event directly to the session actor, invoking its
+ // Receive in the session's own actor context (same-mailbox fast path).
+ void TInterconnectProxyTCP::ForwardSessionEventToSession(STATEFN_SIG) {
+ ICPROXY_PROFILED;
+
+ Y_VERIFY(Session && SessionID);
+ ValidateEvent(ev, "ForwardSessionEventToSession");
+ InvokeOtherActor(*Session, &TInterconnectSessionTCP::Receive, ev, TActivationContext::ActorContextFor(SessionID));
+ }
+
+ // Render the monitoring page for this proxy: a table of proxy state sensors,
+ // the recent error log (entries newer than one minute in bold), and the
+ // session's own HTTP info if a session exists. The result is sent back to
+ // the requester as TEvHttpInfoRes.
+ void TInterconnectProxyTCP::GenerateHttpInfo(NMon::TEvHttpInfo::TPtr& ev) {
+ ICPROXY_PROFILED;
+
+ LOG_INFO_IC("ICP31", "proxy http called");
+
+ TStringStream str;
+
+ HTML(str) {
+ DIV_CLASS("panel panel-info") {
+ DIV_CLASS("panel-heading") {
+ str << "Proxy";
+ }
+ DIV_CLASS("panel-body") {
+ TABLE_CLASS("table") {
+ TABLEHEAD() {
+ TABLER() {
+ TABLEH() {
+ str << "Sensor";
+ }
+ TABLEH() {
+ str << "Value";
+ }
+ }
+ }
+#define MON_VAR(NAME) \
+ TABLER() { \
+ TABLED() { \
+ str << #NAME; \
+ } \
+ TABLED() { \
+ str << NAME; \
+ } \
+ }
+
+ TABLEBODY() {
+ MON_VAR(TActivationContext::Now())
+ MON_VAR(SessionID)
+ MON_VAR(LastSessionDieTime)
+ MON_VAR(IncomingHandshakeActor)
+ MON_VAR(IncomingHandshakeActorFilledIn)
+ MON_VAR(IncomingHandshakeActorReset)
+ MON_VAR(OutgoingHandshakeActor)
+ MON_VAR(OutgoingHandshakeActorCreated)
+ MON_VAR(OutgoingHandshakeActorReset)
+ MON_VAR(State)
+ MON_VAR(StateSwitchTime)
+ }
+ }
+ }
+ }
+
+ DIV_CLASS("panel panel-info") {
+ DIV_CLASS("panel-heading") {
+ str << "Error Log";
+ }
+ DIV_CLASS("panel-body") {
+ TABLE_CLASS("table") {
+ TABLEHEAD() {
+ TABLER() {
+ TABLEH() {
+ str << "Timestamp";
+ }
+ TABLEH() {
+ str << "Elapsed";
+ }
+ TABLEH() {
+ str << "Kind";
+ }
+ TABLEH() {
+ str << "Explanation";
+ }
+ }
+ }
+ TABLEBODY() {
+ const TInstant now = TActivationContext::Now();
+ // entries newer than this barrier are highlighted in bold
+ const TInstant barrier = now - TDuration::Minutes(1);
+ for (auto it = ErrorStateLog.rbegin(); it != ErrorStateLog.rend(); ++it) {
+ auto wrapper = [&](const auto& lambda) {
+ if (std::get<0>(*it) > barrier) {
+ str << "<strong>";
+ lambda();
+ str << "</strong>";
+ } else {
+ lambda();
+ }
+ };
+ TABLER() {
+ TABLED() {
+ wrapper([&] {
+ str << std::get<0>(*it);
+ });
+ }
+ TABLED() {
+ wrapper([&] {
+ str << now - std::get<0>(*it);
+ });
+ }
+ TABLED() {
+ wrapper([&] {
+ str << std::get<1>(*it);
+ });
+ }
+ TABLED() {
+ wrapper([&] {
+ str << std::get<2>(*it);
+ });
+
+ // show repetition counter for collapsed duplicate entries
+ ui32 rep = std::get<3>(*it);
+ if (rep != 1) {
+ str << " <strong>x" << rep << "</strong>";
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (Session != nullptr) {
+ Session->GenerateHttpInfo(str);
+ }
+
+ Send(ev->Sender, new NMon::TEvHttpInfoRes(str.Str()));
+ }
+
+ // Enter the HoldByError state after a fatal condition: optionally log the
+ // error, compute an exponentially growing wakeup delay (capped at
+ // MaxErrorSleep), drop all pending events and handshake activity, and arm a
+ // wakeup timer -- unless the proxy is Terminated, in which case we stay in
+ // HoldByError permanently.
+ void TInterconnectProxyTCP::TransitToErrorState(TString explanation, bool updateErrorLog) {
+ ICPROXY_PROFILED;
+
+ LOG_NOTICE_IC("ICP32", "transit to hold-by-error state Explanation# %s", explanation.data());
+ LOG_INFO(*TlsActivationContext, NActorsServices::INTERCONNECT_STATUS, "[%u] error state: %s", PeerNodeId, explanation.data());
+
+ if (updateErrorLog) {
+ UpdateErrorStateLog(TActivationContext::Now(), "permanent conclusive", explanation);
+ }
+
+ Y_VERIFY(Session == nullptr);
+ Y_VERIFY(!SessionID);
+
+ // recalculate wakeup timeout -- if this is the first failure, then we sleep for default timeout; otherwise we
+ // sleep N times longer than the previous try, but not longer than desired number of seconds
+ HoldByErrorWakeupDuration = HoldByErrorWakeupDuration != TDuration::Zero()
+ ? Min(HoldByErrorWakeupDuration * SleepRetryMultiplier, MaxErrorSleep)
+ : FirstErrorSleep;
+
+ // transit to required state and arm wakeup timer
+ if (Terminated) {
+ // switch to this state permanently
+ SwitchToState(__LINE__, "HoldByError", &TThis::HoldByError);
+ HoldByErrorWakeupCookie = nullptr;
+ } else {
+ SwitchToState(__LINE__, "HoldByError", &TThis::HoldByError, HoldByErrorWakeupDuration,
+ HoldByErrorWakeupCookie = new TEvents::TEvWakeup);
+ }
+
+ /* Process all pending events. */
+ ProcessPendingSessionEvents();
+
+ /* Terminate handshakes */
+ DropHandshakes();
+
+ /* Terminate pending incoming handshake requests. */
+ for (auto& ev : PendingIncomingHandshakeEvents) {
+ Send(ev->Sender, new TEvents::TEvPoisonPill);
+ if (ev->GetTypeRewrite() == TEvHandshakeFail::EventType) {
+ TEvHandshakeFail::TPtr tmp(static_cast<TEventHandle<TEvHandshakeFail>*>(ev.Release()));
+ LogHandshakeFail(tmp, true);
+ }
+ }
+ PendingIncomingHandshakeEvents.clear();
+ }
+
+ // Leave HoldByError when the armed wakeup fires. The cookie comparison
+ // guards against stale wakeup events from earlier error periods.
+ void TInterconnectProxyTCP::WakeupFromErrorState(TEvents::TEvWakeup::TPtr& ev) {
+ ICPROXY_PROFILED;
+
+ LOG_INFO_IC("ICP33", "wake up from error state");
+
+ if (ev->Get() == HoldByErrorWakeupCookie) {
+ SwitchToInitialState();
+ }
+ }
+
+ // User-requested disconnect: kill any running handshakes, terminate the
+ // session if one exists, otherwise go straight to the error-hold state.
+ void TInterconnectProxyTCP::Disconnect() {
+ ICPROXY_PROFILED;
+
+ // terminate handshakes (if any)
+ DropHandshakes();
+
+ if (Session) {
+ IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::Terminate, TDisconnectReason::UserRequest());
+ } else {
+ TransitToErrorState("forced disconnect");
+ }
+ }
+
+ // Arm the pending-queue cleanup timer for the earliest deadline, but no
+ // sooner than 50 ms from now (batches expirations); no-op if already armed
+ // or the queue is empty.
+ void TInterconnectProxyTCP::ScheduleCleanupEventQueue() {
+ ICPROXY_PROFILED;
+
+ if (!CleanupEventQueueScheduled && PendingSessionEvents) {
+ // apply batching at 50 ms granularity
+ Schedule(Max(TDuration::MilliSeconds(50), PendingSessionEvents.front().Deadline - TActivationContext::Now()), new TEvCleanupEventQueue);
+ CleanupEventQueueScheduled = true;
+ }
+ }
+
+ // Timer callback: run one cleanup pass over the pending queue and re-arm
+ // the timer if entries remain.
+ void TInterconnectProxyTCP::HandleCleanupEventQueue() {
+ ICPROXY_PROFILED;
+
+ Y_VERIFY(CleanupEventQueueScheduled);
+ CleanupEventQueueScheduled = false;
+ CleanupEventQueue();
+ ScheduleCleanupEventQueue();
+ }
+
+ // Drop pending session events from the queue head while they are past their
+ // deadline, or while the total queued size exceeds the MessagePendingSize
+ // budget; stop at the first event that satisfies neither condition.
+ void TInterconnectProxyTCP::CleanupEventQueue() {
+ ICPROXY_PROFILED;
+
+ const TInstant now = TActivationContext::Now();
+ while (PendingSessionEvents) {
+ TPendingSessionEvent& ev = PendingSessionEvents.front();
+ if (now >= ev.Deadline || PendingSessionEventsSize > Common->Settings.MessagePendingSize) {
+ TAutoPtr<IEventHandle> event(ev.Event.Release());
+ PendingSessionEventsSize -= ev.Size;
+ DropSessionEvent(event);
+ PendingSessionEvents.pop_front();
+ } else {
+ break;
+ }
+ }
+ }
+
+ // Debug command: shut down the session's connected socket in both
+ // directions (no-op without a session or socket).
+ void TInterconnectProxyTCP::HandleClosePeerSocket() {
+ ICPROXY_PROFILED;
+
+ if (Session && Session->Socket) {
+ LOG_INFO_IC("ICP34", "closed connection by debug command");
+ Session->Socket->Shutdown(SHUT_RDWR);
+ }
+ }
+
+ // Debug command: ask the session to close its input side (no-op without a
+ // session).
+ void TInterconnectProxyTCP::HandleCloseInputSession() {
+ ICPROXY_PROFILED;
+
+ if (Session) {
+ IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::CloseInputSession);
+ }
+ }
+
+ // Debug command: terminate the current session with a Debug disconnect
+ // reason (no-op without a session).
+ void TInterconnectProxyTCP::HandlePoisonSession() {
+ ICPROXY_PROFILED;
+
+ if (Session) {
+ IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::Terminate, TDisconnectReason::Debug());
+ }
+ }
+
+ // Report the session's current output-queue size back to the requester
+ // (zero when no session exists).
+ void TInterconnectProxyTCP::HandleSessionBufferSizeRequest(TEvSessionBufferSizeRequest::TPtr& ev) {
+ ICPROXY_PROFILED;
+
+ ui64 bufSize = 0;
+ if (Session) {
+ bufSize = Session->TotalOutputQueueSize;
+ }
+
+ Send(ev->Sender, new TEvSessionBufferSizeResponse(SessionID, bufSize));
+ }
+
+ // Collect a snapshot of proxy/session statistics (state, queue size, last
+ // error, ping, clock skew, encryption) and send it to the requester as
+ // TEvStats.
+ void TInterconnectProxyTCP::Handle(TEvQueryStats::TPtr& ev) {
+ ICPROXY_PROFILED;
+
+ TProxyStats stats;
+ stats.Path = Sprintf("peer%04" PRIu32, PeerNodeId);
+ stats.State = State;
+ stats.PeerScopeId = Session ? Session->Params.PeerScopeId : TScopeId();
+ stats.LastSessionDieTime = LastSessionDieTime;
+ stats.TotalOutputQueueSize = Session ? Session->TotalOutputQueueSize : 0;
+ stats.Connected = Session ? (bool)Session->Socket : false;
+ stats.Host = TechnicalPeerHostName;
+ stats.Port = 0;
+ // pick the most recent error-log entry, if any; append the repetition
+ // counter for collapsed duplicates
+ ui32 rep = 0;
+ std::tie(stats.LastErrorTimestamp, stats.LastErrorKind, stats.LastErrorExplanation, rep) = ErrorStateLog
+ ? ErrorStateLog.back()
+ : std::make_tuple(TInstant(), TString(), TString(), 1U);
+ if (rep != 1) {
+ stats.LastErrorExplanation += Sprintf(" x%" PRIu32, rep);
+ }
+ stats.Ping = Session ? Session->GetPingRTT() : TDuration::Zero();
+ stats.ClockSkew = Session ? Session->GetClockSkew() : 0;
+ if (Session) {
+ if (auto *x = dynamic_cast<NInterconnect::TSecureSocket*>(Session->Socket.Get())) {
+ stats.Encryption = Sprintf("%s/%u", x->GetCipherName().data(), x->GetCipherBits());
+ } else {
+ stats.Encryption = "none";
+ }
+ }
+
+ auto response = MakeHolder<TEvStats>();
+ response->PeerNodeId = PeerNodeId;
+ response->ProxyStats = std::move(stats);
+ Send(ev->Sender, response.Release());
+ }
+
+ // Permanently terminate the proxy: kill the session (if any) and enter the
+ // error-hold state with Terminated set, from which the proxy never leaves.
+ void TInterconnectProxyTCP::HandleTerminate() {
+ ICPROXY_PROFILED;
+
+ if (Session) {
+ IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::Terminate, TDisconnectReason());
+ }
+ Terminated = true;
+ TransitToErrorState("terminated");
+ }
+
+ // Actor destruction: terminate the session (if any) and clear the dynamic
+ // back-pointer so the owner knows this proxy no longer exists.
+ void TInterconnectProxyTCP::PassAway() {
+ if (Session) {
+ IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::Terminate, TDisconnectReason());
+ }
+ if (DynamicPtr) {
+ Y_VERIFY(*DynamicPtr == this);
+ *DynamicPtr = nullptr;
+ }
+ // TODO: unregister actor mon page
+ TActor::PassAway();
+ }
+}
diff --git a/library/cpp/actors/interconnect/interconnect_tcp_proxy.h b/library/cpp/actors/interconnect/interconnect_tcp_proxy.h
new file mode 100644
index 0000000000..023e5bd1ee
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_tcp_proxy.h
@@ -0,0 +1,537 @@
+#pragma once
+
+#include <library/cpp/actors/core/actor_bootstrapped.h>
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/core/event_pb.h>
+#include <library/cpp/actors/core/events.h>
+#include <library/cpp/monlib/dynamic_counters/counters.h>
+
+#include "interconnect_common.h"
+#include "interconnect_counters.h"
+#include "interconnect_tcp_session.h"
+#include "profiler.h"
+
+#define ICPROXY_PROFILED TFunction func(*this, __func__, __LINE__)
+
+namespace NActors {
+
+
+ /* WARNING: all proxy actors should be alive during actorsystem activity */
+ class TInterconnectProxyTCP
+ : public TActor<TInterconnectProxyTCP>
+ , public TInterconnectLoggingBase
+ , public TProfiled
+ {
+ enum {
+ EvCleanupEventQueue = EventSpaceBegin(TEvents::ES_PRIVATE),
+ EvQueryStats,
+ EvStats,
+ EvPassAwayIfNeeded,
+ };
+
+ struct TEvCleanupEventQueue : TEventLocal<TEvCleanupEventQueue, EvCleanupEventQueue> {};
+
+ public:
+ struct TEvQueryStats : TEventLocal<TEvQueryStats, EvQueryStats> {};
+
+ struct TProxyStats {
+ TString Path;
+ TString State;
+ TScopeId PeerScopeId;
+ TInstant LastSessionDieTime;
+ ui64 TotalOutputQueueSize;
+ bool Connected;
+ TString Host;
+ ui16 Port;
+ TInstant LastErrorTimestamp;
+ TString LastErrorKind;
+ TString LastErrorExplanation;
+ TDuration Ping;
+ i64 ClockSkew;
+ TString Encryption;
+ };
+
+ struct TEvStats : TEventLocal<TEvStats, EvStats> {
+ ui32 PeerNodeId;
+ TProxyStats ProxyStats;
+ };
+
+ static constexpr EActivityType ActorActivityType() {
+ return INTERCONNECT_PROXY_TCP;
+ }
+
+ TInterconnectProxyTCP(const ui32 node, TInterconnectProxyCommon::TPtr common, IActor **dynamicPtr = nullptr);
+
+ STFUNC(StateInit) {
+ Bootstrap();
+ if (ev->Type != TEvents::TSystem::Bootstrap) { // for dynamic nodes we do not receive Bootstrap event
+ Receive(ev, ctx);
+ }
+ }
+
+ void Bootstrap();
+ void Registered(TActorSystem* sys, const TActorId& owner) override;
+
+ private:
+ friend class TInterconnectSessionTCP;
+ friend class TInterconnectSessionTCPv0;
+ friend class THandshake;
+ friend class TInputSessionTCP;
+
+ void UnregisterSession(TInterconnectSessionTCP* session);
+
+#define SESSION_EVENTS(HANDLER) \
+ fFunc(TEvInterconnect::EvForward, HANDLER) \
+ fFunc(TEvInterconnect::TEvConnectNode::EventType, HANDLER) \
+ fFunc(TEvents::TEvSubscribe::EventType, HANDLER) \
+ fFunc(TEvents::TEvUnsubscribe::EventType, HANDLER)
+
+#define INCOMING_HANDSHAKE_EVENTS(HANDLER) \
+ fFunc(TEvHandshakeAsk::EventType, HANDLER) \
+ fFunc(TEvHandshakeRequest::EventType, HANDLER)
+
+#define HANDSHAKE_STATUS_EVENTS(HANDLER) \
+ hFunc(TEvHandshakeDone, HANDLER) \
+ hFunc(TEvHandshakeFail, HANDLER)
+
+#define PROXY_STFUNC(STATE, SESSION_HANDLER, INCOMING_HANDSHAKE_HANDLER, \
+ HANDSHAKE_STATUS_HANDLER, DISCONNECT_HANDLER, \
+ WAKEUP_HANDLER, NODE_INFO_HANDLER) \
+ STATEFN(STATE) { \
+ const ui32 type = ev->GetTypeRewrite(); \
+ const bool profiled = type != TEvInterconnect::EvForward \
+ && type != TEvInterconnect::EvConnectNode \
+ && type != TEvents::TSystem::Subscribe \
+ && type != TEvents::TSystem::Unsubscribe; \
+ if (profiled) { \
+ TProfiled::Start(); \
+ } \
+ { \
+ TProfiled::TFunction func(*this, __func__, __LINE__); \
+ switch (type) { \
+ SESSION_EVENTS(SESSION_HANDLER) \
+ INCOMING_HANDSHAKE_EVENTS(INCOMING_HANDSHAKE_HANDLER) \
+ HANDSHAKE_STATUS_EVENTS(HANDSHAKE_STATUS_HANDLER) \
+ cFunc(TEvInterconnect::EvDisconnect, DISCONNECT_HANDLER) \
+ hFunc(TEvents::TEvWakeup, WAKEUP_HANDLER) \
+ hFunc(TEvGetSecureSocket, Handle) \
+ hFunc(NMon::TEvHttpInfo, GenerateHttpInfo) \
+ cFunc(EvCleanupEventQueue, HandleCleanupEventQueue) \
+ hFunc(TEvInterconnect::TEvNodeInfo, NODE_INFO_HANDLER) \
+ cFunc(TEvInterconnect::EvClosePeerSocket, HandleClosePeerSocket) \
+ cFunc(TEvInterconnect::EvCloseInputSession, HandleCloseInputSession) \
+ cFunc(TEvInterconnect::EvPoisonSession, HandlePoisonSession) \
+ hFunc(TEvSessionBufferSizeRequest, HandleSessionBufferSizeRequest) \
+ hFunc(TEvQueryStats, Handle) \
+ cFunc(TEvInterconnect::EvTerminate, HandleTerminate) \
+ cFunc(EvPassAwayIfNeeded, HandlePassAwayIfNeeded) \
+ default: \
+ Y_FAIL("unexpected event Type# 0x%08" PRIx32, type); \
+ } \
+ } \
+ if (profiled) { \
+ if (TProfiled::Duration() >= TDuration::MilliSeconds(16)) { \
+ const TString report = TProfiled::Format(); \
+ LOG_ERROR_IC("ICP35", "event processing took too much time %s", report.data()); \
+ } \
+ TProfiled::Finish(); \
+ } \
+ }
+
+ template <typename T>
+ void Ignore(T& /*ev*/) {
+ ICPROXY_PROFILED;
+ }
+
+ void Ignore() {
+ ICPROXY_PROFILED;
+ }
+
+ void Ignore(TEvHandshakeDone::TPtr& ev) {
+ ICPROXY_PROFILED;
+
+ Y_VERIFY(ev->Sender != IncomingHandshakeActor);
+ Y_VERIFY(ev->Sender != OutgoingHandshakeActor);
+ }
+
+ void Ignore(TEvHandshakeFail::TPtr& ev) {
+ ICPROXY_PROFILED;
+
+ Y_VERIFY(ev->Sender != IncomingHandshakeActor);
+ Y_VERIFY(ev->Sender != OutgoingHandshakeActor);
+ LogHandshakeFail(ev, true);
+ }
+
+ const char* State = nullptr;
+ TInstant StateSwitchTime;
+
+ template <typename... TArgs>
+ void SwitchToState(int line, const char* name, TArgs&&... args) {
+ ICPROXY_PROFILED;
+
+ LOG_DEBUG_IC("ICP77", "@%d %s -> %s", line, State, name);
+ State = name;
+ StateSwitchTime = TActivationContext::Now();
+ Become(std::forward<TArgs>(args)...);
+ Y_VERIFY(!Terminated || CurrentStateFunc() == &TThis::HoldByError); // ensure we never escape this state
+ if (CurrentStateFunc() != &TThis::PendingActivation) {
+ PassAwayTimestamp = TInstant::Max();
+ }
+ }
+
+ TInstant PassAwayTimestamp;
+ bool PassAwayScheduled = false;
+
+ void SwitchToInitialState() {
+ ICPROXY_PROFILED;
+
+ Y_VERIFY(!PendingSessionEvents && !PendingIncomingHandshakeEvents, "%s PendingSessionEvents# %zu"
+ " PendingIncomingHandshakeEvents# %zu State# %s", LogPrefix.data(), PendingSessionEvents.size(),
+ PendingIncomingHandshakeEvents.size(), State);
+ SwitchToState(__LINE__, "PendingActivation", &TThis::PendingActivation);
+ if (DynamicPtr && !PassAwayScheduled && PassAwayTimestamp != TInstant::Max()) {
+ TActivationContext::Schedule(PassAwayTimestamp, new IEventHandle(EvPassAwayIfNeeded, 0, SelfId(),
+ {}, nullptr, 0));
+ PassAwayScheduled = true;
+ }
+ }
+
+ void HandlePassAwayIfNeeded() {
+ Y_VERIFY(PassAwayScheduled);
+ if (PassAwayTimestamp != TInstant::Max()) {
+ PassAway();
+ }
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // PendingActivation
+ //
+ // In this state we are just waiting for some activities, which may include:
+ // * an external Session event
+ // * incoming handshake request
+ //
+ // Upon receiving such event, we put it to corresponding queue and initiate start up by calling IssueGetNodeRequest,
+ // which, as the name says, issued TEvGetNode to the nameservice and arms timer to handle timeout (which should not
+ // occur, but we want to be sure we don't hang on this), and then switches to PendingNodeInfo state.
+
+ PROXY_STFUNC(PendingActivation,
+ RequestNodeInfo, // Session events
+ RequestNodeInfoForIncomingHandshake, // Incoming handshake requests
+ Ignore, // Handshake status
+ Ignore, // Disconnect request
+ Ignore, // Wakeup
+ Ignore // Node info
+ )
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // PendingNodeInfo
+ //
+ // This state is entered when we asked nameserver to provide description for peer node we are working with. All
+ // external Session events and incoming handshake requests are enqueued into their respective queues, TEvNodeInfo
+ // is main event that triggers processing. On success, we try to initiate outgoing handshake if needed, or process
+ // incoming handshakes. On error, we enter HoldByError state.
+ //
+ // NOTE: handshake status events are also enqueued as the handshake actor may have generated failure event due to
+ // timeout or some other reason without waiting for acknowledge, and it must be processed correctly to prevent
+ // session hang
+
+ PROXY_STFUNC(PendingNodeInfo,
+ EnqueueSessionEvent, // Session events
+ EnqueueIncomingHandshakeEvent, // Incoming handshake requests
+ EnqueueIncomingHandshakeEvent, // Handshake status
+ Disconnect, // Disconnect request
+ ConfigureTimeout, // Wakeup
+ Configure // Node info
+ )
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // PendingConnection
+ //
+ // Here we have issued outgoing handshake or have accepted (or may be both) incoming handshake and we are waiting for
+ // the status of the handshake. When one of the handshakes finishes, we use this status to establish connection (or to
+ // go to error state). When one handshake terminates with error while other is running, we will still wait for the
+ // second one to finish.
+
+ PROXY_STFUNC(PendingConnection,
+ EnqueueSessionEvent, // Session events
+ IncomingHandshake, // Incoming handshake requests
+ HandleHandshakeStatus, // Handshake status
+ Disconnect, // Disconnect request
+ Ignore, // Wakeup
+ Ignore // Node info
+ )
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // StateWork
+ //
+ // We have accepted session and process any incoming messages with the session. Incoming handshakes are accepted
+ // concurrently and applied when finished.
+
+ PROXY_STFUNC(StateWork,
+ ForwardSessionEventToSession, // Session events
+ IncomingHandshake, // Incoming handshake requests
+ HandleHandshakeStatus, // Handshake status
+ Disconnect, // Disconnect request
+ Ignore, // Wakeup
+ Ignore // Node info
+ )
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // HoldByError
+ //
+ // When something bad happens with the connection, we sleep in this state. After wake up we go back to
+ // PendingActivation.
+
+ PROXY_STFUNC(HoldByError,
+ DropSessionEvent, // Session events
+ RequestNodeInfoForIncomingHandshake, // Incoming handshake requests
+ Ignore, // Handshake status
+ Ignore, // Disconnect request
+ WakeupFromErrorState, // Wakeup
+ Ignore // Node info
+ )
+
+#undef SESSION_EVENTS
+#undef INCOMING_HANDSHAKE_EVENTS
+#undef HANDSHAKE_STATUS_EVENTS
+#undef PROXY_STFUNC
+
+ void ForwardSessionEventToSession(STATEFN_SIG);
+ void EnqueueSessionEvent(STATEFN_SIG);
+
+ // Incoming handshake handlers, including special wrapper when the IncomingHandshake is used as fFunc
+ void IncomingHandshake(STATEFN_SIG) {
+ switch (ev->GetTypeRewrite()) {
+ hFunc(TEvHandshakeAsk, IncomingHandshake);
+ hFunc(TEvHandshakeRequest, IncomingHandshake);
+ default:
+ Y_FAIL();
+ }
+ }
+ void IncomingHandshake(TEvHandshakeAsk::TPtr& ev);
+ void IncomingHandshake(TEvHandshakeRequest::TPtr& ev);
+
+ void RequestNodeInfo(STATEFN_SIG);
+ void RequestNodeInfoForIncomingHandshake(STATEFN_SIG);
+
+ void StartInitialHandshake();
+ void StartResumeHandshake(ui64 inputCounter);
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // Incoming handshake event queue processing
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ void EnqueueIncomingHandshakeEvent(STATEFN_SIG);
+ void EnqueueIncomingHandshakeEvent(TEvHandshakeDone::TPtr& ev);
+ void EnqueueIncomingHandshakeEvent(TEvHandshakeFail::TPtr& ev);
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // PendingNodeInfo
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ IEventBase* ConfigureTimeoutCookie; // pointer to the scheduled event used to match sent and received events
+
+ void StartConfiguring();
+ void Configure(TEvInterconnect::TEvNodeInfo::TPtr& ev);
+ void ConfigureTimeout(TEvents::TEvWakeup::TPtr& ev);
+ void ProcessConfigured();
+
+ void HandleHandshakeStatus(TEvHandshakeDone::TPtr& ev);
+ void HandleHandshakeStatus(TEvHandshakeFail::TPtr& ev);
+
+ void TransitToErrorState(TString Explanation, bool updateErrorLog = true);
+ void WakeupFromErrorState(TEvents::TEvWakeup::TPtr& ev);
+ void Disconnect();
+
+ const ui32 PeerNodeId;
+ IActor **DynamicPtr;
+
+ void ValidateEvent(TAutoPtr<IEventHandle>& ev, const char* func) {
+ if (SelfId().NodeId() == PeerNodeId) {
+ TString msg = Sprintf("Event Type# 0x%08" PRIx32 " TypeRewrite# 0x%08" PRIx32
+ " from Sender# %s sent to the proxy for the node itself via Interconnect;"
+ " THIS IS NOT A BUG IN INTERCONNECT, check the event sender instead",
+ ev->Type, ev->GetTypeRewrite(), ev->Sender.ToString().data());
+ LOG_ERROR_IC("ICP03", "%s", msg.data());
+ Y_VERIFY_DEBUG(false, "%s", msg.data());
+ }
+
+ Y_VERIFY(ev->GetTypeRewrite() != TEvInterconnect::EvForward || ev->Recipient.NodeId() == PeerNodeId,
+ "Recipient/Proxy NodeId mismatch Recipient# %s Type# 0x%08" PRIx32 " PeerNodeId# %" PRIu32 " Func# %s",
+ ev->Recipient.ToString().data(), ev->Type, PeerNodeId, func);
+ }
+
+ // Common with helpers
+ // All proxy actors share the same information in the object
+ // read only
+ TInterconnectProxyCommon::TPtr const Common;
+
+ const TActorId& GetNameserviceId() const {
+ return Common->NameserviceId;
+ }
+
+ TString TechnicalPeerHostName;
+
+ std::shared_ptr<IInterconnectMetrics> Metrics;
+
+ void HandleClosePeerSocket();
+ void HandleCloseInputSession();
+ void HandlePoisonSession();
+
+ void HandleSessionBufferSizeRequest(TEvSessionBufferSizeRequest::TPtr& ev);
+
+ bool CleanupEventQueueScheduled = false;
+ void ScheduleCleanupEventQueue();
+ void HandleCleanupEventQueue();
+ void CleanupEventQueue();
+
+ // hold all events before connection is established
+ struct TPendingSessionEvent {
+ TInstant Deadline;
+ ui32 Size;
+ THolder<IEventHandle> Event;
+
+ TPendingSessionEvent(TInstant deadline, ui32 size, TAutoPtr<IEventHandle> event)
+ : Deadline(deadline)
+ , Size(size)
+ , Event(event)
+ {}
+ };
+ TDeque<TPendingSessionEvent> PendingSessionEvents;
+ ui64 PendingSessionEventsSize = 0;
+ void ProcessPendingSessionEvents();
+ void DropSessionEvent(STATEFN_SIG);
+
+ TInterconnectSessionTCP* Session = nullptr;
+ TActorId SessionID;
+
+ // virtual ids used during handshake to check if it is the connection
+ // for the same session or to find out the latest handshake;
+ // it's virtual because the session actor appears after a successful handshake
+ TActorId SessionVirtualId;
+ TActorId RemoteSessionVirtualId;
+
+ TActorId GenerateSessionVirtualId() {
+ ICPROXY_PROFILED;
+
+ const ui64 localId = TlsActivationContext->ExecutorThread.ActorSystem->AllocateIDSpace(1);
+ return NActors::TActorId(SelfId().NodeId(), 0, localId, 0);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ TActorId IncomingHandshakeActor;
+ TInstant IncomingHandshakeActorFilledIn;
+ TInstant IncomingHandshakeActorReset;
+ TMaybe<ui64> LastSerialFromIncomingHandshake;
+ THolder<IEventBase> HeldHandshakeReply;
+
+ // Forget the current incoming handshake actor (optionally poisoning it),
+ // clear the serial/held-reply state tied to it, and stamp the reset time
+ // for monitoring. No-op when no incoming handshake is tracked.
+ void DropIncomingHandshake(bool poison = true) {
+ ICPROXY_PROFILED;
+
+ if (const TActorId& actorId = std::exchange(IncomingHandshakeActor, TActorId())) {
+ LOG_DEBUG_IC("ICP111", "dropped incoming handshake: %s poison: %s", actorId.ToString().data(),
+ poison ? "true" : "false");
+ if (poison) {
+ Send(actorId, new TEvents::TEvPoisonPill);
+ }
+ LastSerialFromIncomingHandshake.Clear();
+ HeldHandshakeReply.Reset();
+ IncomingHandshakeActorReset = TActivationContext::Now();
+ }
+ }
+
+ // Forget the current outgoing handshake actor (optionally poisoning it) and
+ // stamp the reset time for monitoring. No-op when no outgoing handshake is
+ // tracked.
+ void DropOutgoingHandshake(bool poison = true) {
+ ICPROXY_PROFILED;
+
+ if (const TActorId& actorId = std::exchange(OutgoingHandshakeActor, TActorId())) {
+ LOG_DEBUG_IC("ICP112", "dropped outgoing handshake: %s poison: %s", actorId.ToString().data(),
+ poison ? "true" : "false");
+ if (poison) {
+ Send(actorId, new TEvents::TEvPoisonPill);
+ }
+ OutgoingHandshakeActorReset = TActivationContext::Now();
+ }
+ }
+
+ void DropHandshakes() {
+ ICPROXY_PROFILED;
+
+ DropIncomingHandshake();
+ DropOutgoingHandshake();
+ }
+
+ // Prepare to negotiate a brand-new session: terminate the existing session
+ // (if any) and switch to the PendingConnection state to await handshakes.
+ void PrepareNewSessionHandshake() {
+ ICPROXY_PROFILED;
+
+ // drop existing session if we have one
+ if (Session) {
+ LOG_INFO_IC("ICP04", "terminating current session as we are negotiating a new one");
+ IActor::InvokeOtherActor(*Session, &TInterconnectSessionTCP::Terminate, TDisconnectReason::NewSession());
+ }
+
+ // ensure we have no current session
+ Y_VERIFY(!Session);
+
+ // switch to pending connection state -- we wait for handshakes, we want more handshakes!
+ SwitchToState(__LINE__, "PendingConnection", &TThis::PendingConnection);
+ }
+
+ void IssueIncomingHandshakeReply(const TActorId& handshakeId, ui64 peerLocalId,
+ THolder<IEventBase> event);
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ TActorId OutgoingHandshakeActor;
+ TInstant OutgoingHandshakeActorCreated;
+ TInstant OutgoingHandshakeActorReset;
+
+ TInstant LastSessionDieTime;
+
+ void GenerateHttpInfo(NMon::TEvHttpInfo::TPtr& ev);
+
+ void Handle(TEvQueryStats::TPtr& ev);
+
+ TDuration HoldByErrorWakeupDuration = TDuration::Zero();
+ TEvents::TEvWakeup* HoldByErrorWakeupCookie;
+
+ THolder<TProgramInfo> RemoteProgramInfo;
+ NInterconnect::TSecureSocketContext::TPtr SecureContext;
+
+ void Handle(TEvGetSecureSocket::TPtr ev) {
+ auto socket = MakeIntrusive<NInterconnect::TSecureSocket>(*ev->Get()->Socket, SecureContext);
+ Send(ev->Sender, new TEvSecureSocket(std::move(socket)));
+ }
+
+ TDeque<THolder<IEventHandle>> PendingIncomingHandshakeEvents;
+
+ TDeque<std::tuple<TInstant, TString, TString, ui32>> ErrorStateLog;
+
+ void UpdateErrorStateLog(TInstant now, TString kind, TString explanation) {
+ ICPROXY_PROFILED;
+
+ if (ErrorStateLog) {
+ auto& back = ErrorStateLog.back();
+ TString lastKind, lastExpl;
+ if (kind == std::get<1>(back) && explanation == std::get<2>(back)) {
+ std::get<0>(back) = now;
+ ++std::get<3>(back);
+ return;
+ }
+ }
+
+ ErrorStateLog.emplace_back(now, std::move(kind), std::move(explanation), 1);
+ if (ErrorStateLog.size() > 20) {
+ ErrorStateLog.pop_front();
+ }
+ }
+
+ void LogHandshakeFail(TEvHandshakeFail::TPtr& ev, bool inconclusive);
+
+ bool Terminated = false;
+ void HandleTerminate();
+
+ void PassAway() override;
+ };
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_tcp_server.cpp b/library/cpp/actors/interconnect/interconnect_tcp_server.cpp
new file mode 100644
index 0000000000..b95c994598
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_tcp_server.cpp
@@ -0,0 +1,117 @@
+#include "interconnect_tcp_server.h"
+#include "interconnect_handshake.h"
+
+#include <library/cpp/actors/core/log.h>
+#include <library/cpp/actors/protos/services_common.pb.h>
+
+#include "interconnect_common.h"
+
+namespace NActors {
+ // Listener actor: binds/listens on the configured address and spawns an
+ // incoming-handshake actor for every accepted connection. An externally
+ // supplied SOCKET (if any) is adopted as-is and only switched to
+ // non-blocking mode.
+ TInterconnectListenerTCP::TInterconnectListenerTCP(const TString& address, ui16 port, TInterconnectProxyCommon::TPtr common, const TMaybe<SOCKET>& socket)
+ : TActor(&TThis::Initial)
+ , TInterconnectLoggingBase(Sprintf("ICListener: %s", SelfId().ToString().data()))
+ , Address(address.c_str(), port)
+ , Listener(
+ socket
+ ? new NInterconnect::TStreamSocket(*socket)
+ : nullptr)
+ , ExternalSocket(!!Listener)
+ , ProxyCommonCtx(std::move(common))
+ {
+ if (ExternalSocket) {
+ SetNonBlock(*Listener);
+ }
+ }
+
+ // Self-bootstrap: deliver TEvBootstrap to ourselves right after registration.
+ TAutoPtr<IEventHandle> TInterconnectListenerTCP::AfterRegister(const TActorId& self, const TActorId& parentId) {
+ return new IEventHandle(self, parentId, new TEvents::TEvBootstrap, 0);
+ }
+
+ void TInterconnectListenerTCP::Die(const TActorContext& ctx) {
+ LOG_DEBUG_IC("ICL08", "Dying");
+ TActor::Die(ctx);
+ }
+
+ // Creates, configures, binds and puts the listening socket into listen state.
+ // Returns 0 on success or a positive errno value on failure.
+ int TInterconnectListenerTCP::Bind() {
+ NInterconnect::TAddress addr = Address;
+
+ // Optionally override the configured address with the wildcard address
+ // of the matching family (0.0.0.0 / ::).
+ if (ProxyCommonCtx->Settings.BindOnAllAddresses) {
+ switch (addr.GetFamily()) {
+ case AF_INET: {
+ auto *sa = reinterpret_cast<sockaddr_in*>(addr.SockAddr());
+ sa->sin_addr = {INADDR_ANY};
+ break;
+ }
+
+ case AF_INET6: {
+ auto *sa = reinterpret_cast<sockaddr_in6*>(addr.SockAddr());
+ sa->sin6_addr = in6addr_any;
+ break;
+ }
+
+ default:
+ Y_FAIL("Unsupported address family");
+ }
+ }
+
+ Listener = NInterconnect::TStreamSocket::Make(addr.GetFamily());
+ if (*Listener == -1) {
+ return errno;
+ }
+ SetNonBlock(*Listener);
+ Listener->SetSendBufferSize(ProxyCommonCtx->Settings.GetSendBufferSize()); // TODO(alexvru): WTF?
+ SetSockOpt(*Listener, SOL_SOCKET, SO_REUSEADDR, 1);
+ // Bind/Listen return negative error codes; convert back to positive errno.
+ if (const auto e = -Listener->Bind(addr)) {
+ return e;
+ } else if (const auto e = -Listener->Listen(SOMAXCONN)) {
+ return e;
+ } else {
+ return 0;
+ }
+ }
+
+ // Bind (unless an external socket was supplied), register in the poller and
+ // switch to the Listen state. On bind failure, retry bootstrap in 1 second.
+ void TInterconnectListenerTCP::Bootstrap(const TActorContext& ctx) {
+ if (!Listener) {
+ if (const int err = Bind()) {
+ LOG_ERROR_IC("ICL01", "Bind failed: %s (%s)", strerror(err), Address.ToString().data());
+ Listener.Reset();
+ Become(&TThis::Initial, TDuration::Seconds(1), new TEvents::TEvBootstrap);
+ return;
+ }
+ }
+ if (const auto& callback = ProxyCommonCtx->InitWhiteboard) {
+ callback(Address.GetPort(), TlsActivationContext->ExecutorThread.ActorSystem);
+ }
+ const bool success = ctx.Send(MakePollerActorId(), new TEvPollerRegister(Listener, SelfId(), {}));
+ Y_VERIFY(success);
+ Become(&TThis::Listen);
+ }
+
+ void TInterconnectListenerTCP::Handle(TEvPollerRegisterResult::TPtr ev, const TActorContext& ctx) {
+ PollerToken = std::move(ev->Get()->PollerToken);
+ Process(ctx);
+ }
+
+ // Accept loop: drain all pending connections; on EAGAIN/EWOULDBLOCK re-arm
+ // the poller for read readiness; on any other error drop the socket and
+ // re-bootstrap after 1 second.
+ void TInterconnectListenerTCP::Process(const TActorContext& ctx) {
+ for (;;) {
+ NInterconnect::TAddress address;
+ const int r = Listener->Accept(address);
+ if (r >= 0) {
+ LOG_DEBUG_IC("ICL04", "Accepted from: %s", address.ToString().data());
+ auto socket = MakeIntrusive<NInterconnect::TStreamSocket>(static_cast<SOCKET>(r));
+ ctx.Register(CreateIncomingHandshakeActor(ProxyCommonCtx, std::move(socket)));
+ continue;
+ } else if (-r != EAGAIN && -r != EWOULDBLOCK) {
+ Y_VERIFY(-r != ENFILE && -r != EMFILE && !ExternalSocket);
+ LOG_ERROR_IC("ICL06", "Listen failed: %s (%s)", strerror(-r), Address.ToString().data());
+ Listener.Reset();
+ PollerToken.Reset();
+ Become(&TThis::Initial, TDuration::Seconds(1), new TEvents::TEvBootstrap);
+ } else if (PollerToken) {
+ PollerToken->Request(true, false);
+ }
+ break;
+ }
+ }
+
+}
diff --git a/library/cpp/actors/interconnect/interconnect_tcp_server.h b/library/cpp/actors/interconnect/interconnect_tcp_server.h
new file mode 100644
index 0000000000..fc71073c2d
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_tcp_server.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/core/event_pb.h>
+#include <library/cpp/actors/core/events.h>
+
+#include "interconnect_common.h"
+#include "poller_actor.h"
+#include "events_local.h"
+
+namespace NActors {
+ // Actor that listens for incoming interconnect TCP connections and spawns
+ // an incoming-handshake actor per accepted socket (see the .cpp for logic).
+ class TInterconnectListenerTCP: public TActor<TInterconnectListenerTCP>, public TInterconnectLoggingBase {
+ public:
+ static constexpr EActivityType ActorActivityType() {
+ return INTERCONNECT_COMMON;
+ }
+
+ // 'socket' may carry a pre-opened listening socket to adopt; otherwise
+ // the actor creates and binds one itself during Bootstrap.
+ TInterconnectListenerTCP(const TString& address, ui16 port, TInterconnectProxyCommon::TPtr common, const TMaybe<SOCKET>& socket = Nothing());
+ int Bind();
+
+ private:
+ // State before a working listening socket exists; retries Bootstrap.
+ STFUNC(Initial) {
+ switch (ev->GetTypeRewrite()) {
+ CFunc(TEvents::TEvBootstrap::EventType, Bootstrap);
+ CFunc(TEvents::TEvPoisonPill::EventType, Die);
+ }
+ }
+
+ // Steady state: socket registered with the poller, accepting connections.
+ STFUNC(Listen) {
+ switch (ev->GetTypeRewrite()) {
+ CFunc(TEvents::TEvPoisonPill::EventType, Die);
+ HFunc(TEvPollerRegisterResult, Handle);
+ CFunc(TEvPollerReady::EventType, Process);
+ }
+ }
+
+ TAutoPtr<IEventHandle> AfterRegister(const TActorId& self, const TActorId& parentId) override;
+
+ void Die(const TActorContext& ctx) override;
+
+ void Bootstrap(const TActorContext& ctx);
+ void Handle(TEvPollerRegisterResult::TPtr ev, const TActorContext& ctx);
+
+ void Process(const TActorContext& ctx);
+
+ const NInterconnect::TAddress Address;
+ TIntrusivePtr<NInterconnect::TStreamSocket> Listener;
+ const bool ExternalSocket; // true when the socket was supplied externally
+ TPollerToken::TPtr PollerToken;
+ TInterconnectProxyCommon::TPtr const ProxyCommonCtx;
+ };
+
+ // Well-known service actor id of the listener; 'dynamic' selects the
+ // dynamic-node ('D') vs static ('S') instance.
+ static inline TActorId MakeInterconnectListenerActorId(bool dynamic) {
+ char x[12] = {'I', 'C', 'L', 'i', 's', 't', 'e', 'n', 'e', 'r', '/', dynamic ? 'D' : 'S'};
+ return TActorId(0, TStringBuf(x, 12));
+ }
+}
diff --git a/library/cpp/actors/interconnect/interconnect_tcp_session.cpp b/library/cpp/actors/interconnect/interconnect_tcp_session.cpp
new file mode 100644
index 0000000000..2ded7f9f53
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_tcp_session.cpp
@@ -0,0 +1,1228 @@
+#include "interconnect_tcp_proxy.h"
+#include "interconnect_tcp_session.h"
+#include "interconnect_handshake.h"
+
+#include <library/cpp/actors/core/probes.h>
+#include <library/cpp/actors/core/log.h>
+#include <library/cpp/actors/core/interconnect.h>
+#include <library/cpp/actors/util/datetime.h>
+#include <library/cpp/actors/protos/services_common.pb.h>
+#include <library/cpp/monlib/service/pages/templates.h>
+
+namespace NActors {
+ LWTRACE_USING(ACTORLIB_PROVIDER);
+
+ DECLARE_WILSON_EVENT(OutputQueuePush, (ui32, QueueSizeInEvents), (ui64, QueueSizeInBytes));
+
+ // Coalesce(x): single-argument base case — returns x unchanged.
+ template<typename T>
+ T Coalesce(T&& x) {
+ return x;
+ }
+
+ // Coalesce(first, mid, rest...): returns the first argument that compares
+ // unequal to a value-initialized ("empty") instance of its type, recursing
+ // over the remaining arguments otherwise.
+ template<typename T, typename T2, typename... TRest>
+ typename std::common_type<T, T2, TRest...>::type Coalesce(T&& first, T2&& mid, TRest&&... rest) {
+ if (first != typename std::remove_reference<T>::type()) {
+ return first;
+ } else {
+ return Coalesce(std::forward<T2>(mid), std::forward<TRest>(rest)...);
+ }
+ }
+
+ // Session actor constructor: starts in StateFunc, arms nothing yet; the
+ // "connected" gauge is reset to 0 until a socket is attached via
+ // SetNewConnection.
+ TInterconnectSessionTCP::TInterconnectSessionTCP(TInterconnectProxyTCP* const proxy, TSessionParams params)
+ : TActor(&TInterconnectSessionTCP::StateFunc)
+ , Created(TInstant::Now())
+ , Proxy(proxy)
+ , CloseOnIdleWatchdog(GetCloseOnIdleTimeout(), std::bind(&TThis::OnCloseOnIdleTimerHit, this))
+ , LostConnectionWatchdog(GetLostConnectionTimeout(), std::bind(&TThis::OnLostConnectionTimerHit, this))
+ , Params(std::move(params))
+ , TotalOutputQueueSize(0)
+ , OutputStuckFlag(false)
+ , OutputQueueUtilization(16)
+ , OutputCounter(0ULL)
+ {
+ Proxy->Metrics->SetConnected(0);
+ ReceiveContext.Reset(new TReceiveContext);
+ }
+
+ TInterconnectSessionTCP::~TInterconnectSessionTCP() {
+ // close socket ASAP when actor system is being shut down; only shutdown
+ // is issued here — the intrusive pointer releases the descriptor itself
+ if (Socket) {
+ Socket->Shutdown(SHUT_RDWR);
+ }
+ }
+
+ // Deferred initialization: builds the event-holder pool (with a destructor
+ // callback that ships freed items to the destructor actor) and the channel
+ // scheduler, then publishes initial state to the whiteboard.
+ void TInterconnectSessionTCP::Init() {
+ auto destroyCallback = [as = TlsActivationContext->ExecutorThread.ActorSystem, id = Proxy->Common->DestructorId](THolder<IEventBase> event) {
+ as->Send(id, event.Release());
+ };
+ Pool.ConstructInPlace(Proxy->Common, std::move(destroyCallback));
+ ChannelScheduler.ConstructInPlace(Proxy->PeerNodeId, Proxy->Common->ChannelsConfig, Proxy->Metrics, *Pool,
+ Proxy->Common->Settings.MaxSerializedEventSize, Params);
+
+ LOG_INFO(*TlsActivationContext, NActorsServices::INTERCONNECT_STATUS, "[%u] session created", Proxy->PeerNodeId);
+ SetPrefix(Sprintf("Session %s [node %" PRIu32 "]", SelfId().ToString().data(), Proxy->PeerNodeId));
+ SendUpdateToWhiteboard();
+ }
+
+ // Asks the input session actor (receiver) to close its side of the session.
+ void TInterconnectSessionTCP::CloseInputSession() {
+ Send(ReceiverId, new TEvInterconnect::TEvCloseInputSession);
+ }
+
+ // External termination request: terminate with the reason carried by the event.
+ void TInterconnectSessionTCP::Handle(TEvTerminate::TPtr& ev) {
+ Terminate(ev->Get()->Reason);
+ }
+
+ // Poison pill: terminate with a default (unspecified) disconnect reason.
+ void TInterconnectSessionTCP::HandlePoison() {
+ Terminate(TDisconnectReason());
+ }
+
+ // Tears the session down completely: unregisters from the proxy, shuts the
+ // socket, notifies subscribers of disconnection, reports queued events as
+ // undelivered, poisons the receiver and adjusts all session metrics before
+ // destroying the actor.
+ void TInterconnectSessionTCP::Terminate(TDisconnectReason reason) {
+ LOG_INFO_IC_SESSION("ICS01", "socket: %" PRIi64, (Socket ? i64(*Socket) : -1));
+
+ IActor::InvokeOtherActor(*Proxy, &TInterconnectProxyTCP::UnregisterSession, this);
+ ShutdownSocket(std::move(reason));
+
+ // notify every subscriber that the peer node is now disconnected
+ for (const auto& kv : Subscribers) {
+ Send(kv.first, new TEvInterconnect::TEvNodeDisconnected(Proxy->PeerNodeId), 0, kv.second);
+ }
+ Proxy->Metrics->SubSubscribersCount(Subscribers.size());
+ Subscribers.clear();
+
+ // all events still sitting in output channels are lost — report them
+ ChannelScheduler->ForEach([&](TEventOutputChannel& channel) {
+ channel.NotifyUndelivered();
+ });
+
+ if (ReceiverId) {
+ Send(ReceiverId, new TEvents::TEvPoisonPill);
+ }
+
+ SendUpdateToWhiteboard(false);
+
+ Proxy->Metrics->SubOutputBuffersTotalSize(TotalOutputQueueSize);
+ Proxy->Metrics->SubInflightDataAmount(InflightDataAmount);
+
+ LOG_INFO(*TlsActivationContext, NActorsServices::INTERCONNECT_STATUS, "[%u] session destroyed", Proxy->PeerNodeId);
+
+ // NOTE(review): Subscribers was cleared above, so this branch never fires —
+ // dead code, candidate for removal
+ if (!Subscribers.empty()) {
+ Proxy->Metrics->SubSubscribersCount(Subscribers.size());
+ }
+
+ TActor::PassAway();
+ }
+
+ // Direct PassAway is forbidden — the session must die through Terminate(),
+ // which performs the full cleanup and then calls TActor::PassAway itself.
+ void TInterconnectSessionTCP::PassAway() {
+ Y_FAIL("TInterconnectSessionTCP::PassAway() can't be called directly");
+ }
+
+ // Accepts an event for forwarding to the peer node: pushes it into the
+ // per-channel output queue, updates queue metrics, enforces queue-size
+ // limits, and either generates traffic immediately or schedules a batched
+ // TEvRam wakeup depending on queue size and in-flight state.
+ void TInterconnectSessionTCP::Forward(STATEFN_SIG) {
+ Proxy->ValidateEvent(ev, "Forward");
+
+ LOG_DEBUG_IC_SESSION("ICS02", "send event from: %s to: %s", ev->Sender.ToString().data(), ev->Recipient.ToString().data());
+ ++MessagesGot;
+
+ if (ev->Flags & IEventHandle::FlagSubscribeOnSession) {
+ Subscribe(ev);
+ }
+
+ ui16 evChannel = ev->GetChannel();
+ auto& oChannel = ChannelScheduler->GetOutputChannel(evChannel);
+ const bool wasWorking = oChannel.IsWorking();
+
+ const auto [dataSize, event] = oChannel.Push(*ev);
+ LWTRACK(ForwardEvent, event->Orbit, Proxy->PeerNodeId, event->Descr.Type, event->Descr.Flags, LWACTORID(event->Descr.Recipient), LWACTORID(event->Descr.Sender), event->Descr.Cookie, event->EventSerializedSize);
+
+ TotalOutputQueueSize += dataSize;
+ Proxy->Metrics->AddOutputBuffersTotalSize(dataSize);
+ if (!wasWorking) {
+ // this channel has returned to work -- it was empty and this we have just put first event in the queue
+ ChannelScheduler->AddToHeap(oChannel, EqualizeCounter);
+ }
+
+ SetOutputStuckFlag(true);
+ ++NumEventsInReadyChannels;
+
+ LWTRACK(EnqueueEvent, event->Orbit, Proxy->PeerNodeId, NumEventsInReadyChannels, GetWriteBlockedTotal(), evChannel, oChannel.GetQueueSize(), oChannel.GetBufferedAmountOfData());
+ WILSON_TRACE(*TlsActivationContext, &ev->TraceId, OutputQueuePush,
+ QueueSizeInEvents = oChannel.GetQueueSize(),
+ QueueSizeInBytes = oChannel.GetBufferedAmountOfData());
+
+ // check for overloaded queues -- per-session limit terminates the session
+ ui64 sendBufferDieLimit = Proxy->Common->Settings.SendBufferDieLimitInMB * ui64(1 << 20);
+ if (sendBufferDieLimit != 0 && TotalOutputQueueSize > sendBufferDieLimit) {
+ LOG_ERROR_IC_SESSION("ICS03", "socket: %" PRIi64 " output queue is overloaded, actual %" PRIu64 " bytes, limit is %" PRIu64,
+ Socket ? i64(*Socket) : -1, TotalOutputQueueSize, sendBufferDieLimit);
+ return Terminate(TDisconnectReason::QueueOverload());
+ }
+
+ // global (all-sessions) limit triggers the session-killing actor instead
+ ui64 outputBuffersTotalSizeLimit = Proxy->Common->Settings.OutputBuffersTotalSizeLimitInMB * ui64(1 << 20);
+ if (outputBuffersTotalSizeLimit != 0 && static_cast<ui64>(Proxy->Metrics->GetOutputBuffersTotalSize()) > outputBuffersTotalSizeLimit) {
+ LOG_ERROR_IC_SESSION("ICS77", "Exceeded total limit on output buffers size");
+ if (AtomicTryLock(&Proxy->Common->StartedSessionKiller)) {
+ CreateSessionKillingActor(Proxy->Common);
+ }
+ }
+
+ if (RamInQueue && !RamInQueue->Batching) {
+ // we have pending TEvRam, so GenerateTraffic will be called no matter what
+ } else if (InflightDataAmount >= GetTotalInflightAmountOfData() || !Socket || ReceiveContext->WriteBlockedByFullSendBuffer) {
+ // we can't issue more traffic now; GenerateTraffic will be called upon unblocking
+ } else if (TotalOutputQueueSize >= 64 * 1024) {
+ // output queue size is quite big to issue some traffic
+ GenerateTraffic();
+ } else if (!RamInQueue) {
+ // small queue: start batching -- defer traffic generation by BatchPeriod
+ Y_VERIFY_DEBUG(NumEventsInReadyChannels == 1);
+ RamInQueue = new TEvRam(true);
+ auto *ev = new IEventHandle(SelfId(), {}, RamInQueue);
+ const TDuration batchPeriod = Proxy->Common->Settings.BatchPeriod;
+ if (batchPeriod != TDuration()) {
+ TActivationContext::Schedule(batchPeriod, ev);
+ } else {
+ TActivationContext::Send(ev);
+ }
+ LWPROBE(StartBatching, Proxy->PeerNodeId, batchPeriod.MillisecondsFloat());
+ LOG_DEBUG_IC_SESSION("ICS17", "batching started");
+ }
+ }
+
+ // Registers (or refreshes the cookie of) a subscriber interested in session
+ // state and immediately confirms connectivity with TEvNodeConnected.
+ void TInterconnectSessionTCP::Subscribe(STATEFN_SIG) {
+ LOG_DEBUG_IC_SESSION("ICS04", "subscribe for session state for %s", ev->Sender.ToString().data());
+ const auto [it, inserted] = Subscribers.emplace(ev->Sender, ev->Cookie);
+ if (inserted) {
+ Proxy->Metrics->IncSubscribersCount();
+ } else {
+ it->second = ev->Cookie; // refresh cookie of an existing subscriber
+ }
+ Send(ev->Sender, new TEvInterconnect::TEvNodeConnected(Proxy->PeerNodeId), 0, ev->Cookie);
+ }
+
+ // Removes a subscriber; erase() returns 0 or 1, which directly adjusts the
+ // subscriber-count metric.
+ void TInterconnectSessionTCP::Unsubscribe(STATEFN_SIG) {
+ LOG_DEBUG_IC_SESSION("ICS05", "unsubscribe for session state for %s", ev->Sender.ToString().data());
+ Proxy->Metrics->SubSubscribersCount( Subscribers.erase(ev->Sender));
+ }
+
+ // Incoming handshake while a session exists: drop the current input session
+ // (if any), freeze the last-processed packet serial and acknowledge with it
+ // so the peer knows where to resume.
+ THolder<TEvHandshakeAck> TInterconnectSessionTCP::ProcessHandshakeRequest(TEvHandshakeAsk::TPtr& ev) {
+ TEvHandshakeAsk *msg = ev->Get();
+
+ // close existing input session, if any, and do nothing upon its destruction
+ ReestablishConnection({}, false, TDisconnectReason::NewSession());
+ const ui64 lastInputSerial = ReceiveContext->LockLastProcessedPacketSerial();
+
+ LOG_INFO_IC_SESSION("ICS08", "incoming handshake Self# %s Peer# %s Counter# %" PRIu64 " LastInputSerial# %" PRIu64,
+ msg->Self.ToString().data(), msg->Peer.ToString().data(), msg->Counter, lastInputSerial);
+
+ return MakeHolder<TEvHandshakeAck>(msg->Peer, lastInputSerial, Params);
+ }
+
+ // Attaches a freshly handshaken socket to the session: spawns a new input
+ // session actor, registers the socket in the poller, rearms watchdogs and
+ // rewinds/reinitializes the send queue so unconfirmed packets get resent.
+ // Must only run when no receiver actor exists; otherwise the old one is
+ // torn down first and this is re-invoked from its destruction path.
+ void TInterconnectSessionTCP::SetNewConnection(TEvHandshakeDone::TPtr& ev) {
+ if (ReceiverId) {
+ // upon destruction of input session actor invoke this callback again
+ ReestablishConnection(std::move(ev), false, TDisconnectReason::NewSession());
+ return;
+ }
+
+ LOG_INFO_IC_SESSION("ICS09", "handshake done sender: %s self: %s peer: %s socket: %" PRIi64,
+ ev->Sender.ToString().data(), ev->Get()->Self.ToString().data(), ev->Get()->Peer.ToString().data(),
+ i64(*ev->Get()->Socket));
+
+ NewConnectionSet = TActivationContext::Now();
+ PacketsWrittenToSocket = 0;
+
+ SendBufferSize = ev->Get()->Socket->GetSendBufferSize();
+ Socket = std::move(ev->Get()->Socket);
+
+ // there may be a race
+ const ui64 nextPacket = Max(LastConfirmed, ev->Get()->NextPacket);
+
+ // arm watchdogs
+ CloseOnIdleWatchdog.Arm(SelfId());
+
+ // reset activity timestamps
+ LastInputActivityTimestamp = LastPayloadActivityTimestamp = TActivationContext::Now();
+
+ LOG_INFO_IC_SESSION("ICS10", "traffic start");
+
+ // create input session actor; with encryption it shares our mailbox so SSL
+ // state is accessed from a single thread
+ auto actor = MakeHolder<TInputSessionTCP>(SelfId(), Socket, ReceiveContext, Proxy->Common,
+ Proxy->Metrics, Proxy->PeerNodeId, nextPacket, GetDeadPeerTimeout(), Params);
+ ReceiveContext->UnlockLastProcessedPacketSerial();
+ ReceiverId = Params.Encryption ? RegisterWithSameMailbox(actor.Release()) : Register(actor.Release(), TMailboxType::ReadAsFilled);
+
+ // register our socket in poller actor
+ LOG_DEBUG_IC_SESSION("ICS11", "registering socket in PollerActor");
+ const bool success = Send(MakePollerActorId(), new TEvPollerRegister(Socket, ReceiverId, SelfId()));
+ Y_VERIFY(success);
+ ReceiveContext->WriteBlockedByFullSendBuffer = false;
+
+ LostConnectionWatchdog.Disarm();
+ Proxy->Metrics->SetConnected(1);
+ LOG_INFO(*TlsActivationContext, NActorsServices::INTERCONNECT_STATUS, "[%u] connected", Proxy->PeerNodeId);
+
+ // arm pinger timer
+ ResetFlushLogic();
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // REINITIALIZE SEND QUEUE
+ //
+ // scan through send queue and leave only those packets who have data -- we will simply resend them; drop all other
+ // auxiliary packets; also reset packet metrics to zero to start sending from the beginning
+ // also reset SendQueuePos
+
+ // drop confirmed packets first as we do not need unwanted retransmissions
+ SendQueuePos = SendQueue.end();
+ DropConfirmed(nextPacket);
+
+ for (TSendQueue::iterator it = SendQueue.begin(); it != SendQueue.end(); ) {
+ const TSendQueue::iterator next = std::next(it);
+ if (it->IsEmpty()) {
+ // data-less (auxiliary) packet -- recycle it into the cache
+ SendQueueCache.splice(SendQueueCache.begin(), SendQueue, it);
+ } else {
+ it->ResetBufs();
+ }
+ it = next;
+ }
+ TrimSendQueueCache();
+ SendQueuePos = SendQueue.begin();
+
+ // find the serial of the last data packet remaining in the queue (if any)
+ TMaybe<ui64> s;
+ for (auto it = SendQueuePos; it != SendQueue.end(); ++it) {
+ if (!it->IsEmpty()) {
+ s = it->GetSerial();
+ }
+ }
+ const ui64 serial = s.GetOrElse(Max<ui64>());
+
+ Y_VERIFY(serial > LastConfirmed, "%s serial# %" PRIu64 " LastConfirmed# %" PRIu64, LogPrefix.data(), serial, LastConfirmed);
+ LOG_DEBUG_IC_SESSION("ICS06", "rewind SendQueue size# %zu LastConfirmed# %" PRIu64 " SendQueuePos.Serial# %" PRIu64 "\n",
+ SendQueue.size(), LastConfirmed, serial);
+
+ // recompute unwritten byte counter from scratch for the new connection
+ BytesUnwritten = 0;
+ for (const auto& packet : SendQueue) {
+ BytesUnwritten += (Params.UseModernFrame ? sizeof(TTcpPacketHeader_v2) : sizeof(TTcpPacketHeader_v1)) +
+ packet.GetDataSize();
+ }
+
+ SwitchStuckPeriod();
+
+ LastHandshakeDone = TActivationContext::Now();
+
+ RamInQueue = nullptr;
+ GenerateTraffic();
+ }
+
+ // Processes a status update from our own input session actor: refreshes
+ // ping/activity, drops confirmed packets (possibly unblocking traffic),
+ // forces a confirmation packet when enough unconfirmed bytes accumulated,
+ // and finally settles the lock-free UpdateState handshake with the sender.
+ void TInterconnectSessionTCP::Handle(TEvUpdateFromInputSession::TPtr& ev) {
+ if (ev->Sender == ReceiverId) {
+ TEvUpdateFromInputSession& msg = *ev->Get();
+
+ // update ping time
+ Ping = msg.Ping;
+ LWPROBE(UpdateFromInputSession, Proxy->PeerNodeId, Ping.MillisecondsFloat());
+
+ bool needConfirm = false;
+
+ // update activity timer for dead peer checker
+ LastInputActivityTimestamp = TActivationContext::Now();
+
+ if (msg.NumDataBytes) {
+ UnconfirmedBytes += msg.NumDataBytes;
+ // confirm eagerly once a quarter of the in-flight budget is unconfirmed
+ if (UnconfirmedBytes >= GetTotalInflightAmountOfData() / 4) {
+ needConfirm = true;
+ } else {
+ SetForcePacketTimestamp(Proxy->Common->Settings.ForceConfirmPeriod);
+ }
+
+ // reset payload watchdog that controls close-on-idle behaviour
+ LastPayloadActivityTimestamp = TActivationContext::Now();
+ CloseOnIdleWatchdog.Reset();
+ }
+
+ bool unblockedSomething = false;
+ LWPROBE_IF_TOO_LONG(SlowICDropConfirmed, Proxy->PeerNodeId, ms) {
+ unblockedSomething = DropConfirmed(msg.ConfirmedByInput);
+ }
+
+ // generate more traffic if we have unblocked state now
+ if (unblockedSomething) {
+ LWPROBE(UnblockByDropConfirmed, Proxy->PeerNodeId, NHPTimer::GetSeconds(GetCycleCountFast() - ev->SendTime) * 1000.0);
+ GenerateTraffic();
+ }
+
+ // if we haven't generated any packets, then make a lone Flush packet without any data
+ if (needConfirm && Socket) {
+ ++ConfirmPacketsForcedBySize;
+ MakePacket(false);
+ }
+
+ // CAS loop against the input session's atomic update state
+ for (;;) {
+ switch (EUpdateState state = ReceiveContext->UpdateState) {
+ case EUpdateState::NONE:
+ case EUpdateState::CONFIRMING:
+ Y_FAIL("unexpected state");
+
+ case EUpdateState::INFLIGHT:
+ // this message we are processing was the only one in flight, so we can reset state to NONE here
+ if (ReceiveContext->UpdateState.compare_exchange_weak(state, EUpdateState::NONE)) {
+ return;
+ }
+ break;
+
+ case EUpdateState::INFLIGHT_AND_PENDING:
+ // there is more messages pending from the input session actor, so we have to inform it to release
+ // that message
+ if (ReceiveContext->UpdateState.compare_exchange_weak(state, EUpdateState::CONFIRMING)) {
+ Send(ev->Sender, new TEvConfirmUpdate);
+ return;
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ // Deferred traffic-generation wakeup; only the most recently queued TEvRam
+ // is honored (stale ones are ignored by the pointer comparison).
+ void TInterconnectSessionTCP::HandleRam(TEvRam::TPtr& ev) {
+ if (ev->Get() == RamInQueue) {
+ LWPROBE(FinishRam, Proxy->PeerNodeId, NHPTimer::GetSeconds(GetCycleCountFast() - ev->SendTime) * 1000.0);
+ RamInQueue = nullptr;
+ GenerateTraffic();
+ }
+ }
+
+ // Drains ready channels into data packets (up to a per-call byte budget) and
+ // writes them to the socket. If the budget is exhausted, schedules a TEvRam
+ // continuation; if event serialization overflows the limit, the session is
+ // terminated.
+ void TInterconnectSessionTCP::GenerateTraffic() {
+ // generate ping request, if needed
+ IssuePingRequest();
+
+ if (RamInQueue && !RamInQueue->Batching) {
+ LWPROBE(SkipGenerateTraffic, Proxy->PeerNodeId, NHPTimer::GetSeconds(GetCycleCountFast() - RamStartedCycles) * 1000.0);
+ return; // we'll do it a bit later
+ } else {
+ RamInQueue = nullptr;
+ }
+
+ LOG_DEBUG_IC_SESSION("ICS19", "GenerateTraffic");
+
+ // There is a tradeoff between fairness and efficiency.
+ // The less traffic is generated here, the less buffering is after fair scheduler,
+ // the more fair system is, the less latency is present.
+ // The more traffic is generated here, the less syscalls and actor-system overhead occurs,
+ // the less cpu is consumed.
+ static const ui64 generateLimit = 64 * 1024;
+
+ const ui64 sizeBefore = TotalOutputQueueSize;
+ ui32 generatedPackets = 0;
+ ui64 generatedBytes = 0;
+ ui64 generateStarted = GetCycleCountFast();
+
+ // apply traffic changes
+ auto accountTraffic = [&] { ChannelScheduler->ForEach([](TEventOutputChannel& channel) { channel.AccountTraffic(); }); };
+
+ // first, we create as many data packets as we can generate under certain conditions; they include presence
+ // of events in channels queues and in flight fitting into requested limit; after we hit one of these conditions
+ // we exit cycle
+ while (Socket && NumEventsInReadyChannels && InflightDataAmount < GetTotalInflightAmountOfData() && !ReceiveContext->WriteBlockedByFullSendBuffer) {
+ if (generatedBytes >= generateLimit) {
+ // resume later but ensure that we have issued at least one packet
+ RamInQueue = new TEvRam(false);
+ Send(SelfId(), RamInQueue);
+ RamStartedCycles = GetCycleCountFast();
+ LWPROBE(StartRam, Proxy->PeerNodeId);
+ break;
+ }
+
+ try {
+ generatedBytes += MakePacket(true);
+ ++generatedPackets;
+ } catch (const TExSerializedEventTooLarge& ex) {
+ // terminate session if the event can't be serialized properly
+ accountTraffic();
+ LOG_CRIT_IC("ICS31", "serialized event Type# 0x%08" PRIx32 " is too large", ex.Type);
+ return Terminate(TDisconnectReason::EventTooLarge());
+ }
+ }
+
+ if (Socket) {
+ WriteData();
+ }
+
+ LWPROBE(GenerateTraffic, Proxy->PeerNodeId, NHPTimer::GetSeconds(GetCycleCountFast() - generateStarted) * 1000.0, sizeBefore - TotalOutputQueueSize, generatedPackets, generatedBytes);
+
+ accountTraffic();
+ EqualizeCounter += ChannelScheduler->Equalize();
+ }
+
+ // Asks the proxy to start a resume handshake, freezing and passing the
+ // last-processed input packet serial so the peer can resume correctly.
+ void TInterconnectSessionTCP::StartHandshake() {
+ LOG_INFO_IC_SESSION("ICS15", "start handshake");
+ IActor::InvokeOtherActor(*Proxy, &TInterconnectProxyTCP::StartResumeHandshake, ReceiveContext->LockLastProcessedPacketSerial());
+ }
+
+ // Convenience wrapper: drop the current connection and start a new handshake
+ // once the input session has terminated.
+ void TInterconnectSessionTCP::ReestablishConnectionWithHandshake(TDisconnectReason reason) {
+ ReestablishConnection({}, true, std::move(reason));
+ }
+
+ // Shuts down the current socket and records what to do next: either start a
+ // handshake or attach a pending TEvHandshakeDone. If no receiver actor is
+ // alive, the follow-up executes immediately; otherwise it runs when the
+ // receiver reports its disconnect (see OnDisconnect).
+ void TInterconnectSessionTCP::ReestablishConnection(TEvHandshakeDone::TPtr&& ev, bool startHandshakeOnSessionClose,
+ TDisconnectReason reason) {
+ if (Socket) {
+ LOG_INFO_IC_SESSION("ICS13", "reestablish connection");
+ ShutdownSocket(std::move(reason)); // stop sending/receiving on socket
+ PendingHandshakeDoneEvent = std::move(ev);
+ StartHandshakeOnSessionClose = startHandshakeOnSessionClose;
+ if (!ReceiverId) {
+ ReestablishConnectionExecute();
+ }
+ }
+ }
+
+ // Input session actor reported socket disconnect. If the disconnect was
+ // unexpected (socket still considered connected), shut it down and restart
+ // the handshake; otherwise execute the follow-up action previously stored
+ // by ReestablishConnection.
+ void TInterconnectSessionTCP::OnDisconnect(TEvSocketDisconnect::TPtr& ev) {
+ if (ev->Sender == ReceiverId) {
+ const bool wasConnected(Socket);
+ LOG_INFO_IC_SESSION("ICS07", "socket disconnect %" PRIi64 " reason# %s", Socket ? i64(*Socket) : -1, ev->Get()->Reason.ToString().data());
+ ReceiverId = TActorId(); // reset receiver actor id as we have no more receiver yet
+ if (wasConnected) {
+ // we were sucessfully connected and did not expect failure, so it arrived from the input side; we should
+ // restart handshake process, closing our part of socket first
+ ShutdownSocket(ev->Get()->Reason);
+ StartHandshake();
+ } else {
+ ReestablishConnectionExecute();
+ }
+ }
+ }
+
+ // Closes the current socket (if any): records the disconnect reason in
+ // metrics and the proxy's error log, disarms the idle watchdog, arms the
+ // lost-connection watchdog and flips the "connected" gauge to 0.
+ void TInterconnectSessionTCP::ShutdownSocket(TDisconnectReason reason) {
+ if (Socket) {
+ if (const TString& s = reason.ToString()) {
+ Proxy->Metrics->IncDisconnectByReason(s);
+ }
+
+ LOG_INFO_IC_SESSION("ICS25", "shutdown socket, reason# %s", reason.ToString().data());
+ Proxy->UpdateErrorStateLog(TActivationContext::Now(), "close_socket", reason.ToString().data());
+ Socket->Shutdown(SHUT_RDWR);
+ Socket.Reset();
+ Proxy->Metrics->IncDisconnections();
+ CloseOnIdleWatchdog.Disarm();
+ LostConnectionWatchdog.Arm(SelfId());
+ Proxy->Metrics->SetConnected(0);
+ LOG_INFO(*TlsActivationContext, NActorsServices::INTERCONNECT_STATUS, "[%u] disconnected", Proxy->PeerNodeId);
+ }
+ }
+
+ // Executes the follow-up stored by ReestablishConnection, consuming both
+ // the flag and the pending event so a second invocation is a no-op.
+ void TInterconnectSessionTCP::ReestablishConnectionExecute() {
+ bool startHandshakeOnSessionClose = std::exchange(StartHandshakeOnSessionClose, false);
+ TEvHandshakeDone::TPtr ev = std::move(PendingHandshakeDoneEvent);
+
+ if (startHandshakeOnSessionClose) {
+ StartHandshake();
+ } else if (ev) {
+ SetNewConnection(ev);
+ }
+ }
+
+ // Poller signaled readiness. If writes were blocked by a full send buffer,
+ // clear the flag, account the blocked time and resume traffic generation;
+ // otherwise count a spurious wakeup. With encryption, a pending read may
+ // also need to be forwarded to the receiver actor (cookie 1 marks the relay).
+ void TInterconnectSessionTCP::Handle(TEvPollerReady::TPtr& ev) {
+ LOG_DEBUG_IC_SESSION("ICS29", "HandleReadyWrite WriteBlockedByFullSendBuffer# %s",
+ ReceiveContext->WriteBlockedByFullSendBuffer ? "true" : "false");
+ if (std::exchange(ReceiveContext->WriteBlockedByFullSendBuffer, false)) {
+ Proxy->Metrics->IncUsefulWriteWakeups();
+ ui64 nowCycles = GetCycleCountFast();
+ double blockedUs = NHPTimer::GetSeconds(nowCycles - WriteBlockedCycles) * 1000000.0;
+ LWPROBE(ReadyWrite, Proxy->PeerNodeId, NHPTimer::GetSeconds(nowCycles - ev->SendTime) * 1000.0, blockedUs / 1000.0);
+ WriteBlockedTotal += TDuration::MicroSeconds(blockedUs);
+ GenerateTraffic();
+ } else if (!ev->Cookie) {
+ Proxy->Metrics->IncSpuriousWriteWakeups();
+ }
+ if (Params.Encryption && ReceiveContext->ReadPending && !ev->Cookie) {
+ Send(ReceiverId, ev->Release().Release(), 0, 1);
+ }
+ }
+
+ // Stores the poller token; if writes are currently blocked, immediately
+ // requests the proper readiness mask (SSL may need read readiness to make
+ // write progress — WantRead/WantWrite decide).
+ void TInterconnectSessionTCP::Handle(TEvPollerRegisterResult::TPtr ev) {
+ PollerToken = std::move(ev->Get()->PollerToken);
+ if (ReceiveContext->WriteBlockedByFullSendBuffer) {
+ if (Params.Encryption) {
+ auto *secure = static_cast<NInterconnect::TSecureSocket*>(Socket.Get());
+ PollerToken->Request(secure->WantRead(), secure->WantWrite());
+ } else {
+ PollerToken->Request(false, true);
+ }
+ }
+ }
+
+ // Writes as much of the send queue to the socket as the kernel accepts,
+ // using writev-style gathered I/O (single-buffer sends for SSL). Handles
+ // partial writes, EAGAIN (sets WriteBlockedByFullSendBuffer and re-arms the
+ // poller) and hard errors (reestablishes the connection with a handshake).
+ void TInterconnectSessionTCP::WriteData() {
+ ui64 written = 0;
+
+ Y_VERIFY(Socket); // ensure that socket wasn't closed
+
+ LWPROBE_IF_TOO_LONG(SlowICWriteData, Proxy->PeerNodeId, ms) {
+ constexpr ui32 iovLimit = 256;
+#ifdef _linux_
+ ui32 maxElementsInIOV = Min<ui32>(iovLimit, sysconf(_SC_IOV_MAX));
+#else
+ ui32 maxElementsInIOV = 64;
+#endif
+ // SSL_write takes a single contiguous buffer, so no vectored I/O here
+ if (Params.Encryption) {
+ maxElementsInIOV = 1;
+ }
+
+ // vector of write buffers with preallocated stack space
+ TStackVec<TConstIoVec, iovLimit> wbuffers;
+
+ LOG_DEBUG_IC_SESSION("ICS30", "WriteData WriteBlockedByFullSendBuffer# %s SendQueue.size# %zu",
+ ReceiveContext->WriteBlockedByFullSendBuffer ? "true" : "false", SendQueue.size());
+
+ // update last confirmed packet number if it has changed
+ if (SendQueuePos != SendQueue.end()) {
+ SendQueuePos->UpdateConfirmIfPossible(ReceiveContext->GetLastProcessedPacketSerial());
+ }
+
+ while (SendQueuePos != SendQueue.end() && !ReceiveContext->WriteBlockedByFullSendBuffer) {
+ // gather up to maxElementsInIOV buffers from consecutive packets
+ for (auto it = SendQueuePos; it != SendQueue.end() && wbuffers.size() < maxElementsInIOV; ++it) {
+ it->AppendToIoVector(wbuffers, maxElementsInIOV);
+ }
+
+ const struct iovec* iovec = reinterpret_cast<const struct iovec*>(wbuffers.data());
+ int iovcnt = wbuffers.size();
+
+ Y_VERIFY(iovcnt > 0);
+ Y_VERIFY(iovec->iov_len > 0);
+
+ TString err;
+ ssize_t r = 0;
+ do {
+#ifndef _win_
+ r = iovcnt == 1 ? Socket->Send(iovec[0].iov_base, iovec[0].iov_len, &err) : Socket->WriteV(iovec, iovcnt);
+#else
+ r = Socket->Send(iovec[0].iov_base, iovec[0].iov_len, &err);
+#endif
+ Proxy->Metrics->IncSendSyscalls();
+ } while (r == -EINTR);
+
+ LOG_DEBUG_IC_SESSION("ICS16", "written# %zd iovcnt# %d err# %s", r, iovcnt, err.data());
+
+ wbuffers.clear();
+
+ if (r > 0) {
+ Y_VERIFY(static_cast<size_t>(r) <= BytesUnwritten);
+ BytesUnwritten -= r;
+ written += r;
+ ui64 packets = 0;
+
+ // advance SendQueuePos to eat all processed items
+ for (size_t amount = r; amount && SendQueuePos->DropBufs(amount); ++SendQueuePos) {
+ if (!SendQueuePos->IsEmpty()) {
+ LastSentSerial = Max(LastSentSerial, SendQueuePos->GetSerial());
+ }
+ ++PacketsWrittenToSocket;
+ ++packets;
+ LWTRACK(PacketWrittenToSocket, SendQueuePos->Orbit, Proxy->PeerNodeId, PacketsWrittenToSocket, SendQueuePos->TriedWriting, SendQueuePos->GetDataSize(), BytesUnwritten, GetWriteBlockedTotal(), (SOCKET)*Socket);
+ }
+
+ LWPROBE(WriteToSocket, Proxy->PeerNodeId, r, packets, PacketsWrittenToSocket, BytesUnwritten, GetWriteBlockedTotal(), (SOCKET)*Socket);
+ } else if (-r != EAGAIN && -r != EWOULDBLOCK) {
+ // hard write error or peer closed connection (r == 0)
+ const TString message = r == 0 ? "connection closed by peer"
+ : err ? err
+ : Sprintf("socket: %s", strerror(-r));
+ LOG_NOTICE_NET(Proxy->PeerNodeId, "%s", message.data());
+ if (written) {
+ Proxy->Metrics->AddTotalBytesWritten(written);
+ }
+ return ReestablishConnectionWithHandshake(r == 0 ? TDisconnectReason::EndOfStream() : TDisconnectReason::FromErrno(-r));
+ } else {
+ // we have to do some hack for secure socket -- mark the packet as 'tried writing'
+ if (Params.Encryption) {
+ Y_VERIFY(SendQueuePos != SendQueue.end());
+ SendQueuePos->MarkTriedWriting(); // do not try to replace buffer under SSL
+ }
+
+ // we have received EAGAIN error code, this means that we can't issue more data until we have received
+ // TEvPollerReadyWrite event from poller; set up flag meaning this and wait for that event
+ Y_VERIFY(!ReceiveContext->WriteBlockedByFullSendBuffer);
+ ReceiveContext->WriteBlockedByFullSendBuffer = true;
+ WriteBlockedCycles = GetCycleCountFast();
+ LWPROBE(BlockedWrite, Proxy->PeerNodeId, SendQueue.size(), written);
+ LOG_DEBUG_IC_SESSION("ICS18", "hit send buffer limit");
+
+ if (PollerToken) {
+ if (Params.Encryption) {
+ auto *secure = static_cast<NInterconnect::TSecureSocket*>(Socket.Get());
+ PollerToken->Request(secure->WantRead(), secure->WantWrite());
+ } else {
+ PollerToken->Request(false, true);
+ }
+ }
+ }
+ }
+ }
+ if (written) {
+ Proxy->Metrics->AddTotalBytesWritten(written);
+ }
+ }
+
+ // Moves the forced-packet deadline earlier (never later) by 'period' from
+ // now and schedules a flush for it; TDuration::Max() means "no deadline".
+ void TInterconnectSessionTCP::SetForcePacketTimestamp(TDuration period) {
+ if (period != TDuration::Max()) {
+ const TInstant when = TActivationContext::Now() + period;
+ if (when < ForcePacketTimestamp) {
+ ForcePacketTimestamp = when;
+ ScheduleFlush();
+ }
+ }
+ }
+
+ // Schedules a TEvFlush for ForcePacketTimestamp unless an earlier or equal
+ // flush is already scheduled (FlushSchedule tracks outstanding deadlines).
+ void TInterconnectSessionTCP::ScheduleFlush() {
+ if (FlushSchedule.empty() || ForcePacketTimestamp < FlushSchedule.top()) {
+ Schedule(ForcePacketTimestamp - TActivationContext::Now(), new TEvFlush);
+ FlushSchedule.push(ForcePacketTimestamp);
+ MaxFlushSchedule = Max(MaxFlushSchedule, FlushSchedule.size());
+ ++FlushEventsScheduled;
+ }
+ }
+
+ // TEvFlush arrived: drop expired entries from the schedule; if the forced
+ // deadline has passed, emit a confirmation-only packet, otherwise reschedule
+ // for the still-pending deadline.
+ void TInterconnectSessionTCP::HandleFlush() {
+ const TInstant now = TActivationContext::Now();
+ while (FlushSchedule && now >= FlushSchedule.top()) {
+ FlushSchedule.pop();
+ }
+ IssuePingRequest();
+ if (Socket) {
+ if (now >= ForcePacketTimestamp) {
+ ++ConfirmPacketsForcedByTimeout;
+ ++FlushEventsProcessed;
+ MakePacket(false); // just generate confirmation packet if we have preconditions for this
+ } else if (ForcePacketTimestamp != TInstant::Max()) {
+ ScheduleFlush();
+ }
+ }
+ }
+
+ void TInterconnectSessionTCP::ResetFlushLogic() {
+ ForcePacketTimestamp = TInstant::Max();
+ UnconfirmedBytes = 0;
+ const TDuration ping = Proxy->Common->Settings.PingPeriod;
+ if (ping != TDuration::Zero() && !NumEventsInReadyChannels) {
+ SetForcePacketTimestamp(ping);
+ }
+ }
+
+ void TInterconnectSessionTCP::TrimSendQueueCache() {
+ static constexpr size_t maxItems = 32;
+ static constexpr size_t trimThreshold = maxItems * 2;
+ if (SendQueueCache.size() >= trimThreshold) {
+ auto it = SendQueueCache.end();
+ for (size_t n = SendQueueCache.size() - maxItems; n; --n) {
+ --it;
+ }
+
+ auto ev = std::make_unique<TEvFreeItems>();
+ ev->Items.splice(ev->Items.end(), SendQueueCache, it, SendQueueCache.end());
+ ev->NumBytes = ev->Items.size() * sizeof(TTcpPacketOutTask);
+ if (ev->GetInLineForDestruction(Proxy->Common)) {
+ Send(Proxy->Common->DestructorId, ev.release());
+ }
+ }
+ }
+
+ ui64 TInterconnectSessionTCP::MakePacket(bool data, TMaybe<ui64> pingMask) {
+ Y_VERIFY(Socket);
+
+ TSendQueue::iterator packet;
+ if (SendQueueCache) {
+ // we have entries in cache, take one and move it to the end of SendQueue
+ packet = SendQueueCache.begin();
+ SendQueue.splice(SendQueue.end(), SendQueueCache, packet);
+ packet->Reuse(); // reset packet to initial state
+ } else {
+ // we have to allocate new packet, so just do it
+ LWPROBE_IF_TOO_LONG(SlowICAllocPacketBuffer, Proxy->PeerNodeId, ms) {
+ packet = SendQueue.emplace(SendQueue.end(), Params);
+ }
+ }
+
+ // update send queue position
+ if (SendQueuePos == SendQueue.end()) {
+ SendQueuePos = packet; // start sending this packet if we are not sending anything for now
+ }
+
+ ui64 serial = 0;
+
+ if (data) {
+ // generate serial for this data packet
+ serial = ++OutputCounter;
+
+ // fill the data packet
+ Y_VERIFY(NumEventsInReadyChannels);
+ LWPROBE_IF_TOO_LONG(SlowICFillSendingBuffer, Proxy->PeerNodeId, ms) {
+ FillSendingBuffer(*packet, serial);
+ }
+ Y_VERIFY(!packet->IsEmpty());
+
+ InflightDataAmount += packet->GetDataSize();
+ Proxy->Metrics->AddInflightDataAmount(packet->GetDataSize());
+ if (InflightDataAmount > GetTotalInflightAmountOfData()) {
+ Proxy->Metrics->IncInflyLimitReach();
+ }
+
+ if (AtomicGet(ReceiveContext->ControlPacketId) == 0) {
+ AtomicSet(ReceiveContext->ControlPacketSendTimer, GetCycleCountFast());
+ AtomicSet(ReceiveContext->ControlPacketId, OutputCounter);
+ }
+
+ // update payload activity timer
+ LastPayloadActivityTimestamp = TActivationContext::Now();
+ } else if (pingMask) {
+ serial = *pingMask;
+
+ // make this packet a priority one
+ if (SendQueuePos != packet) {
+ Y_VERIFY(SendQueuePos != SendQueue.end());
+ if (SendQueuePos->IsAtBegin()) {
+ // insert this packet just before the next being sent and step back
+ SendQueue.splice(SendQueuePos, SendQueue, packet);
+ --SendQueuePos;
+ Y_VERIFY(SendQueuePos == packet);
+ } else {
+ // current packet is already being sent, so move new packet just after it
+ SendQueue.splice(std::next(SendQueuePos), SendQueue, packet);
+ }
+ }
+ }
+
+ const ui64 lastInputSerial = ReceiveContext->GetLastProcessedPacketSerial();
+ packet->SetMetadata(serial, lastInputSerial);
+ packet->Sign();
+
+ // count number of bytes pending for write
+ ui64 packetSize = (Params.UseModernFrame ? sizeof(TTcpPacketHeader_v2) : sizeof(TTcpPacketHeader_v1)) + packet->GetDataSize();
+ BytesUnwritten += packetSize;
+
+ LOG_DEBUG_IC_SESSION("ICS22", "outgoing packet Serial# %" PRIu64 " Confirm# %" PRIu64 " DataSize# %zu"
+ " InflightDataAmount# %" PRIu64 " BytesUnwritten# %" PRIu64, serial, lastInputSerial, packet->GetDataSize(),
+ InflightDataAmount, BytesUnwritten);
+
+ // reset forced packet sending timestamp as we have confirmed all received data
+ ResetFlushLogic();
+
+ ++PacketsGenerated;
+ LWTRACK(PacketGenerated, packet->Orbit, Proxy->PeerNodeId, BytesUnwritten, InflightDataAmount, PacketsGenerated, packetSize);
+
+ if (!data) {
+ WriteData();
+ }
+
+ return packetSize;
+ }
+
+ bool TInterconnectSessionTCP::DropConfirmed(ui64 confirm) {
+ LOG_DEBUG_IC_SESSION("ICS23", "confirm count: %" PRIu64, confirm);
+
+ Y_VERIFY(LastConfirmed <= confirm && confirm <= LastSentSerial && LastSentSerial <= OutputCounter,
+ "%s confirm# %" PRIu64 " LastConfirmed# %" PRIu64 " OutputCounter# %" PRIu64 " LastSentSerial# %" PRIu64,
+ LogPrefix.data(), confirm, LastConfirmed, OutputCounter, LastSentSerial);
+ LastConfirmed = confirm;
+
+ ui64 droppedDataAmount = 0;
+ ui32 numDropped = 0;
+
+ // drop confirmed packets; this also includes any auxiliary packets as their serial is set to zero, effectively
+ // making Serial <= confirm true
+ TSendQueue::iterator it;
+ ui64 lastDroppedSerial = 0;
+ for (it = SendQueue.begin(); it != SendQueuePos && it->Confirmed(confirm); ++it) {
+ if (!it->IsEmpty()) {
+ lastDroppedSerial = it->GetSerial();
+ }
+ droppedDataAmount += it->GetDataSize();
+ ++numDropped;
+ }
+ SendQueueCache.splice(SendQueueCache.begin(), SendQueue, SendQueue.begin(), it);
+ TrimSendQueueCache();
+ ChannelScheduler->ForEach([&](TEventOutputChannel& channel) {
+ channel.DropConfirmed(lastDroppedSerial);
+ });
+
+ const ui64 current = InflightDataAmount;
+ const ui64 limit = GetTotalInflightAmountOfData();
+ const bool unblockedSomething = current >= limit && current < limit + droppedDataAmount;
+
+ PacketsConfirmed += numDropped;
+ InflightDataAmount -= droppedDataAmount;
+ Proxy->Metrics->SubInflightDataAmount(droppedDataAmount);
+ LWPROBE(DropConfirmed, Proxy->PeerNodeId, droppedDataAmount, InflightDataAmount);
+
+ LOG_DEBUG_IC_SESSION("ICS24", "exit InflightDataAmount: %" PRIu64 " bytes droppedDataAmount: %" PRIu64 " bytes"
+ " dropped %" PRIu32 " packets", InflightDataAmount, droppedDataAmount, numDropped);
+
+ Pool->Trim(); // send any unsent free requests
+
+ return unblockedSomething;
+ }
+
+ void TInterconnectSessionTCP::FillSendingBuffer(TTcpPacketOutTask& task, ui64 serial) {
+ ui32 bytesGenerated = 0;
+
+ Y_VERIFY(NumEventsInReadyChannels);
+ while (NumEventsInReadyChannels) {
+ TEventOutputChannel *channel = ChannelScheduler->PickChannelWithLeastConsumedWeight();
+ Y_VERIFY_DEBUG(!channel->IsEmpty());
+
+ // generate some data within this channel
+ const ui64 netBefore = channel->GetBufferedAmountOfData();
+ ui64 gross = 0;
+ const bool eventDone = channel->FeedBuf(task, serial, &gross);
+ channel->UnaccountedTraffic += gross;
+ const ui64 netAfter = channel->GetBufferedAmountOfData();
+ Y_VERIFY_DEBUG(netAfter <= netBefore); // net amount should shrink
+ const ui64 net = netBefore - netAfter; // number of net bytes serialized
+
+ // adjust metrics for local and global queue size
+ TotalOutputQueueSize -= net;
+ Proxy->Metrics->SubOutputBuffersTotalSize(net);
+ bytesGenerated += gross;
+ Y_VERIFY_DEBUG(!!net == !!gross && gross >= net, "net# %" PRIu64 " gross# %" PRIu64, net, gross);
+
+ // return it back to queue or delete, depending on whether this channel is still working or not
+ ChannelScheduler->FinishPick(gross, EqualizeCounter);
+
+ // update some stats if the packet was fully serialized
+ if (eventDone) {
+ ++MessagesWrittenToBuffer;
+
+ Y_VERIFY(NumEventsInReadyChannels);
+ --NumEventsInReadyChannels;
+
+ if (!NumEventsInReadyChannels) {
+ SetOutputStuckFlag(false);
+ }
+ }
+
+ if (!gross) { // no progress -- almost full packet buffer
+ break;
+ }
+ }
+
+ LWTRACK(FillSendingBuffer, task.Orbit, Proxy->PeerNodeId, bytesGenerated, NumEventsInReadyChannels, WriteBlockedTotal);
+ Y_VERIFY(bytesGenerated); // ensure we are not stalled in serialization
+ }
+
+ ui32 TInterconnectSessionTCP::CalculateQueueUtilization() {
+ SwitchStuckPeriod();
+ ui64 sumBusy = 0, sumPeriod = 0;
+ for (auto iter = OutputQueueUtilization.begin(); iter != OutputQueueUtilization.end() - 1; ++iter) {
+ sumBusy += iter->first;
+ sumPeriod += iter->second;
+ }
+ return sumBusy * 1000000 / sumPeriod;
+ }
+
+ void TInterconnectSessionTCP::SendUpdateToWhiteboard(bool connected) {
+ const ui32 utilization = Socket ? CalculateQueueUtilization() : 0;
+
+ if (const auto& callback = Proxy->Common->UpdateWhiteboard) {
+ enum class EFlag {
+ GREEN,
+ YELLOW,
+ ORANGE,
+ RED,
+ };
+ EFlag flagState = EFlag::RED;
+
+ if (Socket) {
+ flagState = EFlag::GREEN;
+
+ do {
+ auto lastInputDelay = TActivationContext::Now() - LastInputActivityTimestamp;
+ if (lastInputDelay * 4 >= GetDeadPeerTimeout() * 3) {
+ flagState = EFlag::ORANGE;
+ break;
+ } else if (lastInputDelay * 2 >= GetDeadPeerTimeout()) {
+ flagState = EFlag::YELLOW;
+ }
+
+ // check utilization
+ if (utilization > 875000) { // 7/8
+ flagState = EFlag::ORANGE;
+ break;
+ } else if (utilization > 500000) { // 1/2
+ flagState = EFlag::YELLOW;
+ }
+ } while (false);
+ }
+
+ callback(Proxy->Metrics->GetHumanFriendlyPeerHostName(),
+ connected,
+ flagState == EFlag::GREEN,
+ flagState == EFlag::YELLOW,
+ flagState == EFlag::ORANGE,
+ flagState == EFlag::RED,
+ TlsActivationContext->ExecutorThread.ActorSystem);
+ }
+
+ if (connected) {
+ Schedule(TDuration::Seconds(1), new TEvents::TEvWakeup);
+ }
+ }
+
+ void TInterconnectSessionTCP::SetOutputStuckFlag(bool state) {
+ if (OutputStuckFlag == state)
+ return;
+
+ if (OutputQueueUtilization.Size() == 0)
+ return;
+
+ auto& lastpair = OutputQueueUtilization.Last();
+ if (state)
+ lastpair.first -= GetCycleCountFast();
+ else
+ lastpair.first += GetCycleCountFast();
+
+ OutputStuckFlag = state;
+ }
+
+ void TInterconnectSessionTCP::SwitchStuckPeriod() {
+ auto now = GetCycleCountFast();
+ if (OutputQueueUtilization.Size() != 0) {
+ auto& lastpair = OutputQueueUtilization.Last();
+ lastpair.second = now - lastpair.second;
+ if (OutputStuckFlag)
+ lastpair.first += now;
+ }
+
+ OutputQueueUtilization.Push(std::pair<ui64, ui64>(0, now));
+ if (OutputStuckFlag)
+ OutputQueueUtilization.Last().first -= now;
+ }
+
+ TDuration TInterconnectSessionTCP::GetDeadPeerTimeout() const {
+ return Coalesce(Proxy->Common->Settings.DeadPeer, DEFAULT_DEADPEER_TIMEOUT);
+ }
+
+ TDuration TInterconnectSessionTCP::GetCloseOnIdleTimeout() const {
+ return Proxy->Common->Settings.CloseOnIdle;
+ }
+
+ TDuration TInterconnectSessionTCP::GetLostConnectionTimeout() const {
+ return Coalesce(Proxy->Common->Settings.LostConnection, DEFAULT_LOST_CONNECTION_TIMEOUT);
+ }
+
+ ui32 TInterconnectSessionTCP::GetTotalInflightAmountOfData() const {
+ return Coalesce(Proxy->Common->Settings.TotalInflightAmountOfData, DEFAULT_TOTAL_INFLIGHT_DATA);
+ }
+
+ ui64 TInterconnectSessionTCP::GetMaxCyclesPerEvent() const {
+ return DurationToCycles(TDuration::MicroSeconds(50));
+ }
+
+ void TInterconnectSessionTCP::IssuePingRequest() {
+ const TInstant now = TActivationContext::Now();
+ if (now >= LastPingTimestamp + PingPeriodicity) {
+ LOG_DEBUG_IC_SESSION("ICS22", "Issuing ping request");
+ if (Socket) {
+ MakePacket(false, GetCycleCountFast() | TTcpPacketBuf::PingRequestMask);
+ }
+ if (Socket) {
+ MakePacket(false, TInstant::Now().MicroSeconds() | TTcpPacketBuf::ClockMask);
+ }
+ LastPingTimestamp = now;
+ }
+ }
+
+ void TInterconnectSessionTCP::Handle(TEvProcessPingRequest::TPtr ev) {
+ if (Socket) {
+ MakePacket(false, ev->Get()->Payload | TTcpPacketBuf::PingResponseMask);
+ }
+ }
+
+ void TInterconnectSessionTCP::GenerateHttpInfo(TStringStream& str) {
+ HTML(str) {
+ DIV_CLASS("panel panel-info") {
+ DIV_CLASS("panel-heading") {
+ str << "Session";
+ }
+ DIV_CLASS("panel-body") {
+ TABLE_CLASS("table") {
+ TABLEHEAD() {
+ TABLER() {
+ TABLEH() {
+ str << "Sensor";
+ }
+ TABLEH() {
+ str << "Value";
+ }
+ }
+ }
+ TABLEBODY() {
+ TABLER() {
+ TABLED() {
+ str << "Encryption";
+ }
+ TABLED() {
+ str << (Params.Encryption ? "<font color=green>Enabled</font>" : "<font color=red>Disabled</font>");
+ }
+ }
+ if (auto *x = dynamic_cast<NInterconnect::TSecureSocket*>(Socket.Get())) {
+ TABLER() {
+ TABLED() {
+ str << "Cipher name";
+ }
+ TABLED() {
+ str << x->GetCipherName();
+ }
+ }
+ TABLER() {
+ TABLED() {
+ str << "Cipher bits";
+ }
+ TABLED() {
+ str << x->GetCipherBits();
+ }
+ }
+ TABLER() {
+ TABLED() {
+ str << "Protocol";
+ }
+ TABLED() {
+ str << x->GetProtocolName();
+ }
+ }
+ TABLER() {
+ TABLED() {
+ str << "Peer CN";
+ }
+ TABLED() {
+ str << x->GetPeerCommonName();
+ }
+ }
+ }
+ TABLER() {
+ TABLED() { str << "AuthOnly CN"; }
+ TABLED() { str << Params.AuthCN; }
+ }
+ TABLER() {
+ TABLED() {
+ str << "Local scope id";
+ }
+ TABLED() {
+ str << ScopeIdToString(Proxy->Common->LocalScopeId);
+ }
+ }
+ TABLER() {
+ TABLED() {
+ str << "Peer scope id";
+ }
+ TABLED() {
+ str << ScopeIdToString(Params.PeerScopeId);
+ }
+ }
+ TABLER() {
+ TABLED() {
+ str << "This page generated at";
+ }
+ TABLED() {
+ str << TActivationContext::Now() << " / " << Now();
+ }
+ }
+ TABLER() {
+ TABLED() {
+ str << "SelfID";
+ }
+ TABLED() {
+ str << SelfId().ToString();
+ }
+ }
+ TABLER() {
+ TABLED() { str << "Frame version/Checksum"; }
+ TABLED() { str << (!Params.UseModernFrame ? "v1/crc32c" : Params.Encryption ? "v2/none" : "v2/crc32c"); }
+ }
+#define MON_VAR(NAME) \
+ TABLER() { \
+ TABLED() { \
+ str << #NAME; \
+ } \
+ TABLED() { \
+ str << NAME; \
+ } \
+ }
+
+ MON_VAR(Created)
+ MON_VAR(NewConnectionSet)
+ MON_VAR(ReceiverId)
+ MON_VAR(MessagesGot)
+ MON_VAR(MessagesWrittenToBuffer)
+ MON_VAR(PacketsGenerated)
+ MON_VAR(PacketsWrittenToSocket)
+ MON_VAR(PacketsConfirmed)
+ MON_VAR(AtomicGet(ReceiveContext->PacketsReadFromSocket))
+ MON_VAR(ConfirmPacketsForcedBySize)
+ MON_VAR(ConfirmPacketsForcedByTimeout)
+
+ TABLER() {
+ TABLED() {
+ str << "Virtual self ID";
+ }
+ TABLED() {
+ str << Proxy->SessionVirtualId.ToString();
+ }
+ }
+ TABLER() {
+ TABLED() {
+ str << "Virtual peer ID";
+ }
+ TABLED() {
+ str << Proxy->RemoteSessionVirtualId.ToString();
+ }
+ }
+ TABLER() {
+ TABLED() {
+ str << "Socket";
+ }
+ TABLED() {
+ str << (Socket ? i64(*Socket) : -1);
+ }
+ }
+
+ ui32 unsentQueueSize = Socket ? Socket->GetUnsentQueueSize() : 0;
+
+ MON_VAR(OutputStuckFlag)
+ MON_VAR(SendQueue.size())
+ MON_VAR(SendQueueCache.size())
+ MON_VAR(NumEventsInReadyChannels)
+ MON_VAR(TotalOutputQueueSize)
+ MON_VAR(BytesUnwritten)
+ MON_VAR(InflightDataAmount)
+ MON_VAR(unsentQueueSize)
+ MON_VAR(SendBufferSize)
+ MON_VAR(LastInputActivityTimestamp)
+ MON_VAR(LastPayloadActivityTimestamp)
+ MON_VAR(LastHandshakeDone)
+ MON_VAR(OutputCounter)
+ MON_VAR(LastSentSerial)
+ MON_VAR(ReceiveContext->GetLastProcessedPacketSerial())
+ MON_VAR(LastConfirmed)
+ MON_VAR(FlushSchedule.size())
+ MON_VAR(MaxFlushSchedule)
+ MON_VAR(FlushEventsScheduled)
+ MON_VAR(FlushEventsProcessed)
+
+ TString clockSkew;
+ i64 x = GetClockSkew();
+ if (x < 0) {
+ clockSkew = Sprintf("-%s", TDuration::MicroSeconds(-x).ToString().data());
+ } else {
+ clockSkew = Sprintf("+%s", TDuration::MicroSeconds(x).ToString().data());
+ }
+
+ MON_VAR(LastPingTimestamp)
+ MON_VAR(GetPingRTT())
+ MON_VAR(clockSkew)
+
+ MON_VAR(GetDeadPeerTimeout())
+ MON_VAR(GetTotalInflightAmountOfData())
+ MON_VAR(GetCloseOnIdleTimeout())
+ MON_VAR(Subscribers.size())
+ }
+ }
+ }
+ }
+ }
+ }
+
+ void CreateSessionKillingActor(TInterconnectProxyCommon::TPtr common) {
+ TlsActivationContext->ExecutorThread.ActorSystem->Register(new TInterconnectSessionKiller(common));
+ }
+}
diff --git a/library/cpp/actors/interconnect/interconnect_tcp_session.h b/library/cpp/actors/interconnect/interconnect_tcp_session.h
new file mode 100644
index 0000000000..7fc00dbcc5
--- /dev/null
+++ b/library/cpp/actors/interconnect/interconnect_tcp_session.h
@@ -0,0 +1,565 @@
+#pragma once
+
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/core/event_pb.h>
+#include <library/cpp/actors/core/events.h>
+#include <library/cpp/actors/core/log.h>
+#include <library/cpp/actors/helpers/mon_histogram_helper.h>
+#include <library/cpp/actors/protos/services_common.pb.h>
+#include <library/cpp/actors/util/datetime.h>
+#include <library/cpp/actors/util/rope.h>
+#include <library/cpp/actors/util/funnel_queue.h>
+#include <library/cpp/actors/util/recentwnd.h>
+#include <library/cpp/monlib/dynamic_counters/counters.h>
+#include <library/cpp/actors/core/actor_bootstrapped.h>
+
+#include <util/generic/queue.h>
+#include <util/generic/deque.h>
+#include <util/datetime/cputimer.h>
+
+#include "interconnect_impl.h"
+#include "poller_tcp.h"
+#include "poller_actor.h"
+#include "interconnect_channel.h"
+#include "logging.h"
+#include "watchdog_timer.h"
+#include "event_holder_pool.h"
+#include "channel_scheduler.h"
+
+#include <unordered_set>
+#include <unordered_map>
+
+namespace NActors {
+ class TSlowPathChecker {
+ using TTraceCallback = std::function<void(double)>;
+ TTraceCallback Callback;
+ const NHPTimer::STime Start;
+
+ public:
+ TSlowPathChecker(TTraceCallback&& callback)
+ : Callback(std::move(callback))
+ , Start(GetCycleCountFast())
+ {
+ }
+
+ ~TSlowPathChecker() {
+ const NHPTimer::STime end = GetCycleCountFast();
+ const NHPTimer::STime elapsed = end - Start;
+ if (elapsed > 1000000) {
+ Callback(NHPTimer::GetSeconds(elapsed) * 1000);
+ }
+ }
+
+ operator bool() const {
+ return false;
+ }
+ };
+
+#define LWPROBE_IF_TOO_LONG(...) \
+ if (auto __x = TSlowPathChecker{[&](double ms) { LWPROBE(__VA_ARGS__); }}) \
+ ; \
+ else
+
+ class TTimeLimit {
+ public:
+ TTimeLimit(ui64 limitInCycles)
+ : UpperLimit(limitInCycles == 0 ? 0 : GetCycleCountFast() + limitInCycles)
+ {
+ }
+
+ TTimeLimit(ui64 startTS, ui64 limitInCycles)
+ : UpperLimit(limitInCycles == 0 ? 0 : startTS + limitInCycles)
+ {
+ }
+
+ bool CheckExceeded() {
+ return UpperLimit != 0 && GetCycleCountFast() > UpperLimit;
+ }
+
+ const ui64 UpperLimit;
+ };
+
+ static constexpr TDuration DEFAULT_DEADPEER_TIMEOUT = TDuration::Seconds(10);
+ static constexpr TDuration DEFAULT_LOST_CONNECTION_TIMEOUT = TDuration::Seconds(10);
+ static constexpr ui32 DEFAULT_MAX_INFLIGHT_DATA = 10240 * 1024;
+ static constexpr ui32 DEFAULT_TOTAL_INFLIGHT_DATA = 4 * 10240 * 1024;
+
+ class TInterconnectProxyTCP;
+
+ enum class EUpdateState : ui8 {
+ NONE, // no updates generated by input session yet
+ INFLIGHT, // one update is inflight, and no more pending
+ INFLIGHT_AND_PENDING, // one update is inflight, and one is pending
+ CONFIRMING, // confirmation inflight
+ };
+
+ struct TReceiveContext: public TAtomicRefCount<TReceiveContext> {
+ /* All accesses to these fields must be thread-safe */
+
+ ui64 ControlPacketSendTimer = 0;
+ ui64 ControlPacketId = 0;
+
+ // number of packets received by input session
+ TAtomic PacketsReadFromSocket = 0;
+ TAtomic DataPacketsReadFromSocket = 0;
+
+ // last processed packet by input session
+ std::atomic_uint64_t LastProcessedPacketSerial = 0;
+ static constexpr uint64_t LastProcessedPacketSerialLockBit = uint64_t(1) << 63;
+
+ // for hardened checks
+ TAtomic NumInputSessions = 0;
+
+ NHPTimer::STime StartTime;
+
+ std::atomic<ui64> PingRTT_us = 0;
+ std::atomic<i64> ClockSkew_us = 0;
+
+ std::atomic<EUpdateState> UpdateState;
+ static_assert(std::atomic<EUpdateState>::is_always_lock_free);
+
+ bool WriteBlockedByFullSendBuffer = false;
+ bool ReadPending = false;
+
+ std::array<TRope, 16> ChannelArray;
+ std::unordered_map<ui16, TRope> ChannelMap;
+
+ TReceiveContext() {
+ GetTimeFast(&StartTime);
+ }
+
+ // returns false if the session needs to be terminated and the packet should not be processed
+ bool AdvanceLastProcessedPacketSerial() {
+ for (;;) {
+ uint64_t value = LastProcessedPacketSerial.load();
+ if (value & LastProcessedPacketSerialLockBit) {
+ return false;
+ }
+ if (LastProcessedPacketSerial.compare_exchange_weak(value, value + 1)) {
+ return true;
+ }
+ }
+ }
+
+ ui64 LockLastProcessedPacketSerial() {
+ for (;;) {
+ uint64_t value = LastProcessedPacketSerial.load();
+ if (value & LastProcessedPacketSerialLockBit) {
+ return value & ~LastProcessedPacketSerialLockBit;
+ }
+ if (LastProcessedPacketSerial.compare_exchange_strong(value, value | LastProcessedPacketSerialLockBit)) {
+ return value;
+ }
+ }
+ }
+
+ void UnlockLastProcessedPacketSerial() {
+ LastProcessedPacketSerial = LastProcessedPacketSerial.load() & ~LastProcessedPacketSerialLockBit;
+ }
+
+ ui64 GetLastProcessedPacketSerial() {
+ return LastProcessedPacketSerial.load() & ~LastProcessedPacketSerialLockBit;
+ }
+ };
+
+ class TInputSessionTCP
+ : public TActorBootstrapped<TInputSessionTCP>
+ , public TInterconnectLoggingBase
+ {
+ enum {
+ EvCheckDeadPeer = EventSpaceBegin(TEvents::ES_PRIVATE),
+ EvResumeReceiveData,
+ };
+
+ struct TEvCheckDeadPeer : TEventLocal<TEvCheckDeadPeer, EvCheckDeadPeer> {};
+ struct TEvResumeReceiveData : TEventLocal<TEvResumeReceiveData, EvResumeReceiveData> {};
+
+ public:
+ static constexpr EActivityType ActorActivityType() {
+ return INTERCONNECT_SESSION_TCP;
+ }
+
+ TInputSessionTCP(const TActorId& sessionId,
+ TIntrusivePtr<NInterconnect::TStreamSocket> socket,
+ TIntrusivePtr<TReceiveContext> context,
+ TInterconnectProxyCommon::TPtr common,
+ std::shared_ptr<IInterconnectMetrics> metrics,
+ ui32 nodeId,
+ ui64 lastConfirmed,
+ TDuration deadPeerTimeout,
+ TSessionParams params);
+
+ private:
+ friend class TActorBootstrapped<TInputSessionTCP>;
+
+ void Bootstrap();
+
+ STRICT_STFUNC(WorkingState,
+ cFunc(TEvents::TSystem::PoisonPill, PassAway)
+ hFunc(TEvPollerReady, Handle)
+ hFunc(TEvPollerRegisterResult, Handle)
+ cFunc(EvResumeReceiveData, HandleResumeReceiveData)
+ cFunc(TEvInterconnect::TEvCloseInputSession::EventType, CloseInputSession)
+ cFunc(EvCheckDeadPeer, HandleCheckDeadPeer)
+ cFunc(TEvConfirmUpdate::EventType, HandleConfirmUpdate)
+ )
+
+ private:
+ TRope IncomingData;
+
+ const TActorId SessionId;
+ TIntrusivePtr<NInterconnect::TStreamSocket> Socket;
+ TPollerToken::TPtr PollerToken;
+ TIntrusivePtr<TReceiveContext> Context;
+ TInterconnectProxyCommon::TPtr Common;
+ const ui32 NodeId;
+ const TSessionParams Params;
+
+ // header we are currently processing (parsed from the stream)
+ union {
+ TTcpPacketHeader_v1 v1;
+ TTcpPacketHeader_v2 v2;
+ char Data[1];
+ } Header;
+ ui64 HeaderConfirm, HeaderSerial;
+
+ size_t PayloadSize;
+ ui32 ChecksumExpected, Checksum;
+ bool IgnorePayload;
+ TRope Payload;
+ enum class EState {
+ HEADER,
+ PAYLOAD,
+ };
+ EState State = EState::HEADER;
+
+ THolder<TEvUpdateFromInputSession> UpdateFromInputSession;
+
+ ui64 ConfirmedByInput;
+
+ std::shared_ptr<IInterconnectMetrics> Metrics;
+
+ bool CloseInputSessionRequested = false;
+
+ void CloseInputSession();
+
+ void Handle(TEvPollerReady::TPtr ev);
+ void Handle(TEvPollerRegisterResult::TPtr ev);
+ void HandleResumeReceiveData();
+ void HandleConfirmUpdate();
+ void ReceiveData();
+ void ProcessHeader(size_t headerLen);
+ void ProcessPayload(ui64& numDataBytes);
+ void ProcessEvent(TRope& data, TEventDescr& descr);
+ bool ReadMore();
+
+ void ReestablishConnection(TDisconnectReason reason);
+ void DestroySession(TDisconnectReason reason);
+
+ TDeque<TIntrusivePtr<TRopeAlignedBuffer>> Buffers;
+
+ static constexpr size_t NumPreallocatedBuffers = 16;
+ void PreallocateBuffers();
+
+ inline ui64 GetMaxCyclesPerEvent() const {
+ return DurationToCycles(TDuration::MicroSeconds(500));
+ }
+
+ const TDuration DeadPeerTimeout;
+ TInstant LastReceiveTimestamp;
+ void HandleCheckDeadPeer();
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // pinger logic
+
+ bool NewPingProtocol = false;
+ TDeque<TDuration> PingQ; // last N ping samples
+ TDeque<i64> SkewQ; // last N calculated clock skew samples
+
+ void HandlePingResponse(TDuration passed);
+ void HandleClock(TInstant clock);
+ };
+
+ class TInterconnectSessionTCP
+ : public TActor<TInterconnectSessionTCP>
+ , public TInterconnectLoggingBase
+ {
+ enum {
+ EvCheckCloseOnIdle = EventSpaceBegin(TEvents::ES_PRIVATE),
+ EvCheckLostConnection,
+ EvRam,
+ EvTerminate,
+ EvFreeItems,
+ };
+
+ struct TEvCheckCloseOnIdle : TEventLocal<TEvCheckCloseOnIdle, EvCheckCloseOnIdle> {};
+ struct TEvCheckLostConnection : TEventLocal<TEvCheckLostConnection, EvCheckLostConnection> {};
+
+ struct TEvRam : TEventLocal<TEvRam, EvRam> {
+ const bool Batching;
+ TEvRam(bool batching) : Batching(batching) {}
+ };
+
+ struct TEvTerminate : TEventLocal<TEvTerminate, EvTerminate> {
+ TDisconnectReason Reason;
+
+ TEvTerminate(TDisconnectReason reason)
+ : Reason(std::move(reason))
+ {}
+ };
+
+ const TInstant Created;
+ TInstant NewConnectionSet;
+ ui64 MessagesGot = 0;
+ ui64 MessagesWrittenToBuffer = 0;
+ ui64 PacketsGenerated = 0;
+ ui64 PacketsWrittenToSocket = 0;
+ ui64 PacketsConfirmed = 0;
+
+ public:
+ static constexpr EActivityType ActorActivityType() {
+ return INTERCONNECT_SESSION_TCP;
+ }
+
+ TInterconnectSessionTCP(TInterconnectProxyTCP* const proxy, TSessionParams params);
+ ~TInterconnectSessionTCP();
+
+ void Init();
+ void CloseInputSession();
+
+ static TEvTerminate* NewEvTerminate(TDisconnectReason reason) {
+ return new TEvTerminate(std::move(reason));
+ }
+
+ TDuration GetPingRTT() const {
+ return TDuration::MicroSeconds(ReceiveContext->PingRTT_us);
+ }
+
+ i64 GetClockSkew() const {
+ return ReceiveContext->ClockSkew_us;
+ }
+
+ private:
+ friend class TInterconnectProxyTCP;
+
+ void Handle(TEvTerminate::TPtr& ev);
+ void HandlePoison();
+ void Terminate(TDisconnectReason reason);
+ void PassAway() override;
+
+ void Forward(STATEFN_SIG);
+ void Subscribe(STATEFN_SIG);
+ void Unsubscribe(STATEFN_SIG);
+
+ STRICT_STFUNC(StateFunc,
+ fFunc(TEvInterconnect::EvForward, Forward)
+ cFunc(TEvents::TEvPoisonPill::EventType, HandlePoison)
+ fFunc(TEvInterconnect::TEvConnectNode::EventType, Subscribe)
+ fFunc(TEvents::TEvSubscribe::EventType, Subscribe)
+ fFunc(TEvents::TEvUnsubscribe::EventType, Unsubscribe)
+ cFunc(TEvFlush::EventType, HandleFlush)
+ hFunc(TEvPollerReady, Handle)
+ hFunc(TEvPollerRegisterResult, Handle)
+ hFunc(TEvUpdateFromInputSession, Handle)
+ hFunc(TEvRam, HandleRam)
+ hFunc(TEvCheckCloseOnIdle, CloseOnIdleWatchdog)
+ hFunc(TEvCheckLostConnection, LostConnectionWatchdog)
+ cFunc(TEvents::TSystem::Wakeup, SendUpdateToWhiteboard)
+ hFunc(TEvSocketDisconnect, OnDisconnect)
+ hFunc(TEvTerminate, Handle)
+ hFunc(TEvProcessPingRequest, Handle)
+ )
+
+ void Handle(TEvUpdateFromInputSession::TPtr& ev);
+
+ void OnDisconnect(TEvSocketDisconnect::TPtr& ev);
+
+ THolder<TEvHandshakeAck> ProcessHandshakeRequest(TEvHandshakeAsk::TPtr& ev);
+ void SetNewConnection(TEvHandshakeDone::TPtr& ev);
+
+ TEvRam* RamInQueue = nullptr;
+ ui64 RamStartedCycles = 0;
+ void HandleRam(TEvRam::TPtr& ev);
+ void GenerateTraffic();
+
+ void SendUpdateToWhiteboard(bool connected = true);
+ ui32 CalculateQueueUtilization();
+
+ void Handle(TEvPollerReady::TPtr& ev);
+ void Handle(TEvPollerRegisterResult::TPtr ev);
+ void WriteData();
+
+ ui64 MakePacket(bool data, TMaybe<ui64> pingMask = {});
+ void FillSendingBuffer(TTcpPacketOutTask& packet, ui64 serial);
+ bool DropConfirmed(ui64 confirm);
+ void ShutdownSocket(TDisconnectReason reason);
+
+ void StartHandshake();
+ void ReestablishConnection(TEvHandshakeDone::TPtr&& ev, bool startHandshakeOnSessionClose,
+ TDisconnectReason reason);
+ void ReestablishConnectionWithHandshake(TDisconnectReason reason);
+ void ReestablishConnectionExecute();
+
+ TInterconnectProxyTCP* const Proxy;
+
+ // various connection settings access
+ TDuration GetDeadPeerTimeout() const;
+ TDuration GetCloseOnIdleTimeout() const;
+ TDuration GetLostConnectionTimeout() const;
+ ui32 GetTotalInflightAmountOfData() const;
+ ui64 GetMaxCyclesPerEvent() const;
+
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // pinger
+
+ TInstant LastPingTimestamp;
+ static constexpr TDuration PingPeriodicity = TDuration::Seconds(1);
+ void IssuePingRequest();
+ void Handle(TEvProcessPingRequest::TPtr ev);
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ TInstant LastInputActivityTimestamp;
+ TInstant LastPayloadActivityTimestamp;
+ TWatchdogTimer<TEvCheckCloseOnIdle> CloseOnIdleWatchdog;
+ TWatchdogTimer<TEvCheckLostConnection> LostConnectionWatchdog;
+
+ void OnCloseOnIdleTimerHit() {
+ LOG_INFO_IC("ICS27", "CloseOnIdle timer hit, session terminated");
+ Terminate(TDisconnectReason::CloseOnIdle());
+ }
+
+ void OnLostConnectionTimerHit() {
+ LOG_ERROR_IC("ICS28", "LostConnection timer hit, session terminated");
+ Terminate(TDisconnectReason::LostConnection());
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ const TSessionParams Params;
+ TMaybe<TEventHolderPool> Pool;
+ TMaybe<TChannelScheduler> ChannelScheduler;
+ ui64 TotalOutputQueueSize;
+ bool OutputStuckFlag;
+ TRecentWnd<std::pair<ui64, ui64>> OutputQueueUtilization;
+ size_t NumEventsInReadyChannels = 0;
+
+ void SetOutputStuckFlag(bool state);
+ void SwitchStuckPeriod();
+
+ using TSendQueue = TList<TTcpPacketOutTask>;
+ TSendQueue SendQueue;
+ TSendQueue SendQueueCache;
+ TSendQueue::iterator SendQueuePos;
+ ui64 WriteBlockedCycles = 0; // start of current block period
+ TDuration WriteBlockedTotal; // total incremental duration that session has been blocked
+ ui64 BytesUnwritten = 0;
+
+ void TrimSendQueueCache();
+
+ TDuration GetWriteBlockedTotal() const {
+ if (ReceiveContext->WriteBlockedByFullSendBuffer) {
+ double blockedUs = NHPTimer::GetSeconds(GetCycleCountFast() - WriteBlockedCycles) * 1000000.0;
+ return WriteBlockedTotal + TDuration::MicroSeconds(blockedUs); // append current blocking period if any
+ } else {
+ return WriteBlockedTotal;
+ }
+ }
+
+ ui64 OutputCounter;
+ ui64 LastSentSerial = 0;
+
+ TInstant LastHandshakeDone;
+
+ TIntrusivePtr<NInterconnect::TStreamSocket> Socket;
+ TPollerToken::TPtr PollerToken;
+ ui32 SendBufferSize;
+ ui64 InflightDataAmount = 0;
+
+ std::unordered_map<TActorId, ui64, TActorId::THash> Subscribers;
+
+ // time at which we want to send confirmation packet even if there was no outgoing data
+ ui64 UnconfirmedBytes = 0;
+ TInstant ForcePacketTimestamp = TInstant::Max();
+ TPriorityQueue<TInstant, TVector<TInstant>, std::greater<TInstant>> FlushSchedule;
+ size_t MaxFlushSchedule = 0;
+ ui64 FlushEventsScheduled = 0;
+ ui64 FlushEventsProcessed = 0;
+
+ void SetForcePacketTimestamp(TDuration period);
+ void ScheduleFlush();
+ void HandleFlush();
+ void ResetFlushLogic();
+
+ void GenerateHttpInfo(TStringStream& str);
+
+ TIntrusivePtr<TReceiveContext> ReceiveContext;
+ TActorId ReceiverId;
+ TDuration Ping;
+
+ ui64 ConfirmPacketsForcedBySize = 0;
+ ui64 ConfirmPacketsForcedByTimeout = 0;
+
+ ui64 LastConfirmed = 0;
+
+ TEvHandshakeDone::TPtr PendingHandshakeDoneEvent;
+ bool StartHandshakeOnSessionClose = false;
+
+ ui64 EqualizeCounter = 0;
+ };
+
+ class TInterconnectSessionKiller
+ : public TActorBootstrapped<TInterconnectSessionKiller> {
+ ui32 RepliesReceived = 0;
+ ui32 RepliesNumber = 0;
+ TActorId LargestSession = TActorId();
+ ui64 MaxBufferSize = 0;
+ TInterconnectProxyCommon::TPtr Common;
+
+ public:
+ static constexpr EActivityType ActorActivityType() {
+ return INTERCONNECT_SESSION_KILLER;
+ }
+
+ TInterconnectSessionKiller(TInterconnectProxyCommon::TPtr common)
+ : Common(common)
+ {
+ }
+
+ void Bootstrap() {
+ auto sender = SelfId();
+ const auto eventFabric = [&sender](const TActorId& recp) -> IEventHandle* {
+ auto ev = new TEvSessionBufferSizeRequest();
+ return new IEventHandle(recp, sender, ev, IEventHandle::FlagTrackDelivery);
+ };
+ RepliesNumber = TlsActivationContext->ExecutorThread.ActorSystem->BroadcastToProxies(eventFabric);
+ Become(&TInterconnectSessionKiller::StateFunc);
+ }
+
+ STRICT_STFUNC(StateFunc,
+ hFunc(TEvSessionBufferSizeResponse, ProcessResponse)
+ cFunc(TEvents::TEvUndelivered::EventType, ProcessUndelivered)
+ )
+
+ void ProcessResponse(TEvSessionBufferSizeResponse::TPtr& ev) {
+ RepliesReceived++;
+ if (MaxBufferSize < ev->Get()->BufferSize) {
+ MaxBufferSize = ev->Get()->BufferSize;
+ LargestSession = ev->Get()->SessionID;
+ }
+ if (RepliesReceived == RepliesNumber) {
+ Send(LargestSession, new TEvents::TEvPoisonPill);
+ AtomicUnlock(&Common->StartedSessionKiller);
+ PassAway();
+ }
+ }
+
+ void ProcessUndelivered() {
+ RepliesReceived++;
+ }
+ };
+
+ void CreateSessionKillingActor(TInterconnectProxyCommon::TPtr common);
+
+}
diff --git a/library/cpp/actors/interconnect/load.cpp b/library/cpp/actors/interconnect/load.cpp
new file mode 100644
index 0000000000..2a8443da71
--- /dev/null
+++ b/library/cpp/actors/interconnect/load.cpp
@@ -0,0 +1,405 @@
+#include "load.h"
+#include "interconnect_common.h"
+#include "events_local.h"
+#include <library/cpp/actors/protos/services_common.pb.h>
+#include <library/cpp/actors/core/log.h>
+#include <library/cpp/actors/core/actor_bootstrapped.h>
+#include <library/cpp/actors/core/events.h>
+#include <library/cpp/actors/core/hfunc.h>
+#include <util/generic/queue.h>
+
+namespace NInterconnect {
+ using namespace NActors;
+
+ // Private event space shared by the load-testing actors in this file.
+ enum {
+ EvGenerateMessages = EventSpaceBegin(TEvents::ES_PRIVATE),
+ EvPublishResults,
+ EvQueryTrafficCounter,
+ EvTrafficCounter,
+ };
+
+ // Request from TLoadActor to the responder master for its traffic counter.
+ struct TEvQueryTrafficCounter : TEventLocal<TEvQueryTrafficCounter, EvQueryTrafficCounter> {};
+
+ // Reply carrying the shared byte counter that responders increment; the
+ // load actor uses it to measure overall interconnect traffic.
+ struct TEvTrafficCounter : TEventLocal<TEvTrafficCounter, EvTrafficCounter> {
+ std::shared_ptr<std::atomic_uint64_t> Traffic;
+
+ TEvTrafficCounter(std::shared_ptr<std::atomic_uint64_t> traffic)
+ : Traffic(std::move(traffic))
+ {}
+ };
+
+ // Worker that relays TEvLoadMessage along its hop chain. For each message
+ // it strips hops that have no next-hop actor set, forwards the remainder to
+ // the next hop (preserving flags and cookie), and adds the serialized sizes
+ // of both the received and the forwarded message to the shared Traffic counter.
+ class TLoadResponderActor : public TActor<TLoadResponderActor> {
+ STRICT_STFUNC(StateFunc,
+ HFunc(TEvLoadMessage, Handle);
+ CFunc(TEvents::TSystem::PoisonPill, Die);
+ )
+
+ void Handle(TEvLoadMessage::TPtr& ev, const TActorContext& ctx) {
+ ui64 bytes = ev->Get()->CalculateSerializedSizeCached();
+ auto& record = ev->Get()->Record;
+ auto *hops = record.MutableHops();
+ // drop leading hops without a next-hop address; the payload is no
+ // longer needed once the message starts its way back
+ while (!hops->empty() && !hops->begin()->HasNextHop()) {
+ record.ClearPayload();
+ ev->Get()->StripPayload();
+ hops->erase(hops->begin());
+ }
+ if (!hops->empty()) {
+ // extract actor id of the next hop
+ const TActorId nextHopActorId = ActorIdFromProto(hops->begin()->GetNextHop());
+ hops->erase(hops->begin());
+
+ // forward message to next hop; preserve flags and cookie
+ auto msg = MakeHolder<TEvLoadMessage>();
+ record.Swap(&msg->Record);
+ bytes += msg->CalculateSerializedSizeCached();
+ ctx.Send(nextHopActorId, msg.Release(), ev->Flags, ev->Cookie);
+ }
+ *Traffic += bytes;
+ }
+
+ public:
+ TLoadResponderActor(std::shared_ptr<std::atomic_uint64_t> traffic)
+ : TActor(&TLoadResponderActor::StateFunc)
+ , Traffic(std::move(traffic))
+ {}
+
+ static constexpr IActor::EActivityType ActorActivityType() {
+ return IActor::INTERCONNECT_LOAD_RESPONDER;
+ }
+
+ private:
+ // shared with the responder master and, via TEvTrafficCounter, with load actors
+ std::shared_ptr<std::atomic_uint64_t> Traffic;
+ };
+
+ // Per-node service actor behind MakeLoadResponderActorId. It spawns a fixed
+ // pool of TLoadResponderActor slaves, distributes incoming load messages to
+ // them round-robin, and hands out the shared traffic counter on request.
+ class TLoadResponderMasterActor : public TActorBootstrapped<TLoadResponderMasterActor> {
+ TVector<TActorId> Slaves;
+ ui32 SlaveIndex = 0; // next slave to receive a message (round-robin)
+
+ STRICT_STFUNC(StateFunc,
+ HFunc(TEvLoadMessage, Handle);
+ HFunc(TEvQueryTrafficCounter, Handle);
+ CFunc(TEvents::TSystem::PoisonPill, Die);
+ )
+
+ // Forward the message to the next slave in round-robin order.
+ void Handle(TEvLoadMessage::TPtr& ev, const TActorContext& ctx) {
+ ctx.ExecutorThread.ActorSystem->Send(ev->Forward(Slaves[SlaveIndex]));
+ if (++SlaveIndex == Slaves.size()) {
+ SlaveIndex = 0;
+ }
+ }
+
+ // Reply with the shared byte counter incremented by all slaves.
+ void Handle(TEvQueryTrafficCounter::TPtr ev, const TActorContext& ctx) {
+ ctx.Send(ev->Sender, new TEvTrafficCounter(Traffic));
+ }
+
+ // Poison all slaves before dying so the pool shuts down with the master.
+ void Die(const TActorContext& ctx) override {
+ for (const TActorId& actorId : Slaves) {
+ ctx.Send(actorId, new TEvents::TEvPoisonPill);
+ }
+ TActorBootstrapped::Die(ctx);
+ }
+
+ public:
+ static constexpr IActor::EActivityType ActorActivityType() {
+ return IActor::INTERCONNECT_LOAD_RESPONDER;
+ }
+
+ TLoadResponderMasterActor()
+ {}
+
+ void Bootstrap(const TActorContext& ctx) {
+ Become(&TLoadResponderMasterActor::StateFunc);
+ // fixed pool size of 10 worker actors
+ while (Slaves.size() < 10) {
+ Slaves.push_back(ctx.Register(new TLoadResponderActor(Traffic)));
+ }
+ }
+
+ private:
+ std::shared_ptr<std::atomic_uint64_t> Traffic = std::make_shared<std::atomic_uint64_t>();
+ };
+
+ // Factory for the per-node load responder service actor (see load.h).
+ IActor* CreateLoadResponderActor() {
+ return new TLoadResponderMasterActor();
+ }
+
+ // Well-known service actor id of the load responder on the given node;
+ // built from the fixed 12-byte name "ICLoadRespAc".
+ TActorId MakeLoadResponderActorId(ui32 nodeId) {
+ char x[12] = {'I', 'C', 'L', 'o', 'a', 'd', 'R', 'e', 's', 'p', 'A', 'c'};
+ return TActorId(nodeId, TStringBuf(x, 12));
+ }
+
+ // Load generator. After obtaining the shared traffic counter from the local
+ // responder master, it sends TEvLoadMessage events along the hop chain
+ // described by Params.NodeHops, keeping up to Params.InFlyMax messages in
+ // flight, and periodically publishes RTT/throughput statistics to the log.
+ class TLoadActor: public TActorBootstrapped<TLoadActor> {
+ struct TEvGenerateMessages : TEventLocal<TEvGenerateMessages, EvGenerateMessages> {};
+ struct TEvPublishResults : TEventLocal<TEvPublishResults, EvPublishResults> {};
+
+ // Bookkeeping for a single in-flight message, keyed by its string id.
+ struct TMessageInfo {
+ TInstant SendTimestamp;
+
+ TMessageInfo(const TInstant& sendTimestamp)
+ : SendTimestamp(sendTimestamp)
+ {
+ }
+ };
+
+ const TLoadParams Params;
+ TInstant NextMessageTimestamp; // earliest time the next message may be sent
+ THashMap<TString, TMessageInfo> InFly; // id -> send time for outstanding messages
+ ui64 NextId = 1; // monotonically increasing message id/cookie
+ TVector<TActorId> Hops; // remaining hops after the first; ends with self
+ TActorId FirstHop; // first responder in the chain (receives sends)
+ ui64 NumDropped = 0; // replies that arrived after their timeout
+ std::shared_ptr<std::atomic_uint64_t> Traffic; // shared counter from responders
+
+ public:
+ static constexpr IActor::EActivityType ActorActivityType() {
+ return IActor::INTERCONNECT_LOAD_ACTOR;
+ }
+
+ TLoadActor(const TLoadParams& params)
+ : Params(params)
+ {}
+
+ // Stage 1: request the traffic counter; the real work starts in
+ // Handle(TEvTrafficCounter).
+ void Bootstrap(const TActorContext& ctx) {
+ Become(&TLoadActor::QueryTrafficCounter);
+ ctx.Send(MakeLoadResponderActorId(SelfId().NodeId()), new TEvQueryTrafficCounter);
+ }
+
+ // Stage 2: build the hop chain, start generating load, and schedule both
+ // the test end (poison) and the first results publication.
+ void Handle(TEvTrafficCounter::TPtr ev, const TActorContext& ctx) {
+ Traffic = std::move(ev->Get()->Traffic);
+
+ // node id 0 maps to an empty TActorId (a hop with no next-hop actor)
+ for (const ui32 nodeId : Params.NodeHops) {
+ const TActorId& actorId = nodeId ? MakeLoadResponderActorId(nodeId) : TActorId();
+ if (!FirstHop) {
+ FirstHop = actorId;
+ } else {
+ Hops.push_back(actorId);
+ }
+ }
+
+ // the chain ends back at this actor so RTT can be measured
+ Hops.push_back(ctx.SelfID);
+
+ Become(&TLoadActor::StateFunc);
+ NextMessageTimestamp = ctx.Now();
+ ResetThroughput(NextMessageTimestamp, *Traffic);
+ GenerateMessages(ctx);
+ ctx.Schedule(Params.Duration, new TEvents::TEvPoisonPill);
+ SchedulePublishResults(ctx);
+ }
+
+ // Emit messages until the in-fly limit is reached or the pacing timestamp
+ // moves into the future; then schedule the next generation round.
+ void GenerateMessages(const TActorContext& ctx) {
+ while (InFly.size() < Params.InFlyMax && ctx.Now() >= NextMessageTimestamp) {
+ // generate payload
+ const ui32 size = Params.SizeMin + RandomNumber(Params.SizeMax - Params.SizeMin + 1);
+
+ // generate message id
+ const ui64 cookie = NextId++;
+ TString id = Sprintf("%" PRIu64, cookie);
+
+ // create message and send it to the first hop
+ THolder<TEvLoadMessage> ev;
+ if (Params.UseProtobufWithPayload && size) {
+ // payload carried as a separate rope buffer filled with '*'
+ auto buffer = TRopeAlignedBuffer::Allocate(size);
+ memset(buffer->GetBuffer(), '*', size);
+ ev.Reset(new TEvLoadMessage(Hops, id, TRope(buffer)));
+ } else {
+ // payload embedded in the protobuf record itself
+ TString payload;
+ if (size) {
+ payload = TString::Uninitialized(size);
+ memset(payload.Detach(), '*', size);
+ }
+ ev.Reset(new TEvLoadMessage(Hops, id, payload ? &payload : nullptr));
+ }
+ UpdateThroughput(ev->CalculateSerializedSizeCached());
+ ctx.Send(FirstHop, ev.Release(), IEventHandle::MakeFlags(Params.Channel, 0), cookie);
+
+ // register in the map
+ InFly.emplace(id, TMessageInfo(ctx.Now()));
+
+ // put item into timeout queue
+ PutTimeoutQueueItem(ctx, id);
+
+ const TDuration duration = TDuration::MicroSeconds(Params.IntervalMin.GetValue() +
+ RandomNumber(Params.IntervalMax.GetValue() - Params.IntervalMin.GetValue() + 1));
+ if (Params.SoftLoad) {
+ // soft load: pace from the previous scheduled slot (may catch up)
+ NextMessageTimestamp += duration;
+ } else {
+ // hard load: pace from "now", ignoring accumulated lag
+ NextMessageTimestamp = ctx.Now() + duration;
+ }
+ }
+
+ // schedule next generate messages call
+ if (NextMessageTimestamp > ctx.Now() && InFly.size() < Params.InFlyMax) {
+ ctx.Schedule(NextMessageTimestamp - ctx.Now(), new TEvGenerateMessages);
+ }
+ }
+
+ // A message completed the full hop circle: record its RTT and throughput,
+ // or count it as dropped if it already timed out of the InFly map.
+ void Handle(TEvLoadMessage::TPtr& ev, const TActorContext& ctx) {
+ const auto& record = ev->Get()->Record;
+ auto it = InFly.find(record.GetId());
+ if (it != InFly.end()) {
+ // record message rtt
+ const TDuration rtt = ctx.Now() - it->second.SendTimestamp;
+ UpdateHistogram(ctx.Now(), rtt);
+
+ // update throughput
+ UpdateThroughput(ev->Get()->CalculateSerializedSizeCached());
+
+ // remove message from the in fly map
+ InFly.erase(it);
+ } else {
+ ++NumDropped;
+ }
+ GenerateMessages(ctx);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // RTT HISTOGRAM
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ // sliding window of (arrival time, rtt) samples, trimmed to AggregationPeriod
+ const TDuration AggregationPeriod = TDuration::Seconds(20);
+ TDeque<std::pair<TInstant, TDuration>> Histogram;
+
+ void UpdateHistogram(TInstant when, TDuration rtt) {
+ Histogram.emplace_back(when, rtt);
+
+ // evict samples older than the aggregation window
+ const TInstant barrier = when - AggregationPeriod;
+ while (Histogram && Histogram.front().first < barrier) {
+ Histogram.pop_front();
+ }
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // THROUGHPUT
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ TInstant ThroughputFirstSample = TInstant::Zero(); // window start
+ ui64 ThroughputSamples = 0; // messages counted in the current window
+ ui64 ThroughputBytes = 0; // serialized bytes counted in the current window
+ ui64 TrafficAtBegin = 0; // shared traffic counter value at window start
+
+ void UpdateThroughput(ui64 bytes) {
+ ThroughputBytes += bytes;
+ ++ThroughputSamples;
+ }
+
+ // Start a new measurement window at `when` with the current global traffic.
+ void ResetThroughput(TInstant when, ui64 traffic) {
+ ThroughputFirstSample = when;
+ ThroughputSamples = 0;
+ ThroughputBytes = 0;
+ TrafficAtBegin = traffic;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // TIMEOUT QUEUE OPERATIONS
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ // (deadline, message id) pairs in send order; deadlines are monotonic, so a
+ // plain FIFO queue suffices
+ TQueue<std::pair<TInstant, TString>> TimeoutQueue;
+
+ void PutTimeoutQueueItem(const TActorContext& ctx, TString id) {
+ TimeoutQueue.emplace(ctx.Now() + TDuration::Minutes(1), std::move(id));
+ // arm the wakeup timer only when the queue transitions from empty
+ if (TimeoutQueue.size() == 1) {
+ ScheduleWakeup(ctx);
+ }
+ }
+
+ void ScheduleWakeup(const TActorContext& ctx) {
+ ctx.Schedule(TimeoutQueue.front().first - ctx.Now(), new TEvents::TEvWakeup);
+ }
+
+ // Expire timed-out messages from InFly and re-arm the timer if needed.
+ void HandleWakeup(const TActorContext& ctx) {
+ ui32 numDropped = 0;
+
+ while (TimeoutQueue && TimeoutQueue.front().first <= ctx.Now()) {
+ numDropped += InFly.erase(TimeoutQueue.front().second);
+ TimeoutQueue.pop();
+ }
+ if (TimeoutQueue) {
+ // we still have some elements in timeout queue, so schedule next wake up to tidy up
+ ScheduleWakeup(ctx);
+ }
+
+ GenerateMessages(ctx);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // RESULT PUBLISHING
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ const TDuration ResultPublishPeriod = TDuration::Seconds(15);
+
+ void SchedulePublishResults(const TActorContext& ctx) {
+ ctx.Schedule(ResultPublishPeriod, new TEvPublishResults);
+ }
+
+ // Log throughput and RTT percentiles for the window since the last call,
+ // then reset the throughput window; reschedules itself unless final.
+ void PublishResults(const TActorContext& ctx, bool schedule = true) {
+ const TInstant now = ctx.Now();
+
+ TStringStream msg;
+
+ msg << "Load# '" << Params.Name << "'";
+
+ msg << " Throughput# ";
+ const TDuration duration = now - ThroughputFirstSample;
+ const ui64 traffic = *Traffic;
+ // NOTE(review): if this runs in the same instant as the window start,
+ // duration.MicroSeconds() is zero and these divisions fault — confirm
+ // the publish period guarantees a non-empty window.
+ msg << "{window# " << duration
+ << " bytes# " << ThroughputBytes
+ << " samples# " << ThroughputSamples
+ << " b/s# " << ui64(ThroughputBytes * 1000000 / duration.MicroSeconds())
+ << " common# " << ui64((traffic - TrafficAtBegin) * 1000000 / duration.MicroSeconds())
+ << "}";
+ ResetThroughput(now, traffic);
+
+ msg << " RTT# ";
+ if (Histogram) {
+ const TDuration duration = Histogram.back().first - Histogram.front().first;
+ msg << "{window# " << duration << " samples# " << Histogram.size();
+ TVector<TDuration> v;
+ v.reserve(Histogram.size());
+ for (const auto& item : Histogram) {
+ v.push_back(item.second);
+ }
+ std::sort(v.begin(), v.end());
+ // report fixed percentiles from the sorted RTT samples
+ for (double q : {0.5, 0.9, 0.99, 0.999, 0.9999, 1.0}) {
+ const size_t pos = q * (v.size() - 1);
+ msg << Sprintf(" %.4f# %s", q, v[pos].ToString().data());
+ }
+ msg << "}";
+ } else {
+ msg << "<empty>";
+ }
+
+ msg << " NumDropped# " << NumDropped;
+
+ if (!schedule) {
+ msg << " final";
+ }
+
+ LOG_NOTICE(ctx, NActorsServices::INTERCONNECT_SPEED_TEST, "%s", msg.Str().data());
+
+ if (schedule) {
+ SchedulePublishResults(ctx);
+ }
+ }
+
+ STRICT_STFUNC(QueryTrafficCounter,
+ HFunc(TEvTrafficCounter, Handle);
+ )
+
+ STRICT_STFUNC(StateFunc,
+ CFunc(TEvents::TSystem::PoisonPill, Die);
+ CFunc(TEvents::TSystem::Wakeup, HandleWakeup);
+ CFunc(EvPublishResults, PublishResults);
+ CFunc(EvGenerateMessages, GenerateMessages);
+ HFunc(TEvLoadMessage, Handle);
+ )
+
+ // Publish the final ("final"-tagged) results before terminating.
+ void Die(const TActorContext& ctx) override {
+ PublishResults(ctx, false);
+ TActorBootstrapped::Die(ctx);
+ }
+ };
+
+ // Factory for a load-generating actor configured by `params` (see load.h).
+ IActor* CreateLoadActor(const TLoadParams& params) {
+ return new TLoadActor(params);
+ }
+
+}
diff --git a/library/cpp/actors/interconnect/load.h b/library/cpp/actors/interconnect/load.h
new file mode 100644
index 0000000000..0a01a0dc04
--- /dev/null
+++ b/library/cpp/actors/interconnect/load.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <library/cpp/actors/core/actor.h>
+
+namespace NInterconnect {
+ // load responder -- lives on every node as a service actor
+ NActors::IActor* CreateLoadResponderActor();
+ // well-known service actor id of the responder on `node`
+ NActors::TActorId MakeLoadResponderActorId(ui32 node);
+
+ // load actor -- generates load with specific parameters
+ struct TLoadParams {
+ TString Name; // label used in published results
+ ui32 Channel; // interconnect channel for generated messages
+ TVector<ui32> NodeHops; // node ids for the message route (0 = hop with no next-hop actor)
+ ui32 SizeMin, SizeMax; // min and max size for payloads (inclusive range)
+ ui32 InFlyMax; // maximum number of in fly messages
+ TDuration IntervalMin, IntervalMax; // min and max intervals between sending messages
+ bool SoftLoad; // is the load soft? (pace from schedule rather than from "now")
+ TDuration Duration; // test duration
+ bool UseProtobufWithPayload; // store payload separately
+ };
+ NActors::IActor* CreateLoadActor(const TLoadParams& params);
+
+}
diff --git a/library/cpp/actors/interconnect/logging.h b/library/cpp/actors/interconnect/logging.h
new file mode 100644
index 0000000000..c429d1cade
--- /dev/null
+++ b/library/cpp/actors/interconnect/logging.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <library/cpp/actors/core/log.h>
+#include <library/cpp/actors/protos/services_common.pb.h>
+
+// Core logging macros. The *_X variants fetch the actor context via
+// this->GetActorContext() (usable from IActor methods); the plain variants use
+// the thread-local activation context. All IC macros expect a LogPrefix member
+// in scope and take a marker string plus printf-style arguments.
+#define LOG_LOG_IC_X(component, marker, priority, ...) \
+ do { \
+ LOG_LOG(this->GetActorContext(), (priority), (component), "%s " marker " %s", LogPrefix.data(), Sprintf(__VA_ARGS__).data()); \
+ } while (false)
+
+#define LOG_LOG_NET_X(priority, NODE_ID, FMT, ...) \
+ do { \
+ const TActorContext& ctx = this->GetActorContext(); \
+ LOG_LOG(ctx, (priority), ::NActorsServices::INTERCONNECT_NETWORK, "[%" PRIu32 " <-> %" PRIu32 "] %s", \
+ ctx.SelfID.NodeId(), (NODE_ID), Sprintf(FMT, __VA_ARGS__).data()); \
+ } while (false)
+
+#define LOG_LOG_IC(component, marker, priority, ...) \
+ do { \
+ LOG_LOG(::NActors::TActivationContext::AsActorContext(), (priority), (component), "%s " marker " %s", LogPrefix.data(), Sprintf(__VA_ARGS__).data()); \
+ } while (false)
+
+// Network-level log line tagged with "[localNode <-> peerNode]".
+#define LOG_LOG_NET(priority, NODE_ID, FMT, ...) \
+ do { \
+ const TActorContext& ctx = ::NActors::TActivationContext::AsActorContext(); \
+ LOG_LOG(ctx, (priority), ::NActorsServices::INTERCONNECT_NETWORK, "[%" PRIu32 " <-> %" PRIu32 "] %s", \
+ ctx.SelfID.NodeId(), (NODE_ID), Sprintf(FMT, __VA_ARGS__).data()); \
+ } while (false)
+
+// Per-priority shorthands for the INTERCONNECT component.
+#define LOG_EMER_IC(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT, marker, ::NActors::NLog::PRI_EMER, __VA_ARGS__)
+#define LOG_ALERT_IC(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT, marker, ::NActors::NLog::PRI_ALERT, __VA_ARGS__)
+#define LOG_CRIT_IC(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT, marker, ::NActors::NLog::PRI_CRIT, __VA_ARGS__)
+#define LOG_ERROR_IC(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT, marker, ::NActors::NLog::PRI_ERROR, __VA_ARGS__)
+#define LOG_WARN_IC(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT, marker, ::NActors::NLog::PRI_WARN, __VA_ARGS__)
+#define LOG_NOTICE_IC(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT, marker, ::NActors::NLog::PRI_NOTICE, __VA_ARGS__)
+#define LOG_INFO_IC(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT, marker, ::NActors::NLog::PRI_INFO, __VA_ARGS__)
+#define LOG_DEBUG_IC(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT, marker, ::NActors::NLog::PRI_DEBUG, __VA_ARGS__)
+
+// Per-priority shorthands for the INTERCONNECT_SESSION component.
+#define LOG_EMER_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_EMER, __VA_ARGS__)
+#define LOG_ALERT_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_ALERT, __VA_ARGS__)
+#define LOG_CRIT_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_CRIT, __VA_ARGS__)
+#define LOG_ERROR_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_ERROR, __VA_ARGS__)
+#define LOG_WARN_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_WARN, __VA_ARGS__)
+#define LOG_NOTICE_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_NOTICE, __VA_ARGS__)
+#define LOG_INFO_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_INFO, __VA_ARGS__)
+#define LOG_DEBUG_IC_SESSION(marker, ...) LOG_LOG_IC(::NActorsServices::INTERCONNECT_SESSION, marker, ::NActors::NLog::PRI_DEBUG, __VA_ARGS__)
+
+#define LOG_NOTICE_NET(NODE_ID, FMT, ...) LOG_LOG_NET(::NActors::NLog::PRI_NOTICE, NODE_ID, FMT, __VA_ARGS__)
+#define LOG_DEBUG_NET(NODE_ID, FMT, ...) LOG_LOG_NET(::NActors::NLog::PRI_DEBUG, NODE_ID, FMT, __VA_ARGS__)
+
+namespace NActors {
+ // Mixin that supplies the LogPrefix member expected by the LOG_*_IC macros.
+ class TInterconnectLoggingBase {
+ protected:
+ const TString LogPrefix;
+
+ public:
+ TInterconnectLoggingBase() = default;
+
+ TInterconnectLoggingBase(const TString& prefix)
+ : LogPrefix(prefix)
+ {
+ }
+
+ // Replaces LogPrefix despite its const qualifier (and despite this being
+ // a const method) via const_cast-equivalent swap.
+ // NOTE(review): modifying a const-declared member this way is undefined
+ // behavior if the object itself is const — confirm callers never hold a
+ // truly const instance.
+ void SetPrefix(TString logPrefix) const {
+ logPrefix.swap(const_cast<TString&>(LogPrefix));
+ }
+ };
+}
diff --git a/library/cpp/actors/interconnect/mock/ic_mock.cpp b/library/cpp/actors/interconnect/mock/ic_mock.cpp
new file mode 100644
index 0000000000..884503e602
--- /dev/null
+++ b/library/cpp/actors/interconnect/mock/ic_mock.cpp
@@ -0,0 +1,298 @@
+#include "ic_mock.h"
+#include <library/cpp/actors/core/interconnect.h>
+#include <util/system/yield.h>
+#include <thread>
+
+namespace NActors {
+
+ // In-process interconnect emulation for tests: for every pair of nodes a
+ // shared TConnectionState links two TProxyMockActor instances (one per actor
+ // system); events are "injected" across by posting TEvInject to the peer
+ // proxy, which re-wraps them for local delivery. No sockets are involved.
+ class TInterconnectMock::TImpl {
+ enum {
+ EvInject = EventSpaceBegin(TEvents::ES_PRIVATE),
+ EvCheckSession, // ask a proxy to re-validate its session id
+ EvRam, // self-notification to flush a session's queued events
+ };
+
+ // Batch of events crossing the mock "wire", tagged with the sender's
+ // scope id and session id (stale sessions are dropped on receive).
+ struct TEvInject : TEventLocal<TEvInject, EvInject> {
+ std::deque<std::unique_ptr<IEventHandle>> Messages;
+ const TScopeId OriginScopeId;
+ const ui64 SenderSessionId;
+
+ TEvInject(std::deque<std::unique_ptr<IEventHandle>>&& messages, const TScopeId& originScopeId, ui64 senderSessionId)
+ : Messages(std::move(messages))
+ , OriginScopeId(originScopeId)
+ , SenderSessionId(senderSessionId)
+ {}
+ };
+
+ class TProxyMockActor;
+
+ // Shared state for one node pair. Key packs both node ids (low/high 32
+ // bits); SessionId is bumped to invalidate both sides' sessions.
+ class TConnectionState {
+ struct TPeerInfo {
+ TRWMutex Mutex; // guards ActorSystem against concurrent stop
+ TActorSystem *ActorSystem = nullptr;
+ TActorId ProxyId;
+ };
+
+ const ui64 Key;
+ TPeerInfo PeerInfo[2]; // one slot per side of the connection
+ std::atomic_uint64_t SessionId = 0; // current valid session generation
+
+ public:
+ TConnectionState(ui64 key)
+ : Key(key)
+ {}
+
+ // Register a proxy actor for its node's side; deregisters automatically
+ // when the actor system begins shutdown (DeferPreStop).
+ void Attach(ui32 nodeId, TActorSystem *as, const TActorId& actorId) {
+ TPeerInfo *peer = GetPeer(nodeId);
+ auto guard = TWriteGuard(peer->Mutex);
+ Y_VERIFY(!peer->ActorSystem);
+ peer->ActorSystem = as;
+ peer->ProxyId = actorId;
+ as->DeferPreStop([peer] {
+ auto guard = TWriteGuard(peer->Mutex);
+ peer->ActorSystem = nullptr;
+ });
+ }
+
+ // Deliver a batch to the peer proxy; if the peer side is gone, bounce
+ // every event back as undelivered (Disconnected).
+ void Inject(ui32 peerNodeId, std::deque<std::unique_ptr<IEventHandle>>&& messages,
+ const TScopeId& originScopeId, ui64 senderSessionId) {
+ TPeerInfo *peer = GetPeer(peerNodeId);
+ auto guard = TReadGuard(peer->Mutex);
+ if (peer->ActorSystem) {
+ peer->ActorSystem->Send(new IEventHandle(peer->ProxyId, TActorId(), new TEvInject(std::move(messages),
+ originScopeId, senderSessionId)));
+ } else {
+ for (auto&& ev : messages) {
+ TActivationContext::Send(ev->ForwardOnNondelivery(TEvents::TEvUndelivered::Disconnected));
+ }
+ }
+ }
+
+ ui64 GetValidSessionId() const {
+ return SessionId;
+ }
+
+ // Bump the session generation and nudge the peer proxy to notice.
+ void InvalidateSessionId(ui32 peerNodeId) {
+ ++SessionId;
+ TPeerInfo *peer = GetPeer(peerNodeId);
+ auto guard = TReadGuard(peer->Mutex);
+ if (peer->ActorSystem) {
+ peer->ActorSystem->Send(new IEventHandle(EvCheckSession, 0, peer->ProxyId, {}, nullptr, 0));
+ }
+ }
+
+ private:
+ // Map a node id to its slot: low 32 bits of Key -> slot 0, high -> slot 1.
+ TPeerInfo *GetPeer(ui32 nodeId) {
+ if (nodeId == ui32(Key)) {
+ return PeerInfo;
+ } else if (nodeId == ui32(Key >> 32)) {
+ return PeerInfo + 1;
+ } else {
+ Y_FAIL();
+ }
+ }
+ };
+
+ // Mock of an interconnect proxy; owns at most one mock session at a time.
+ class TProxyMockActor : public TActor<TProxyMockActor> {
+ // Mock of an interconnect session: queues forwarded events and flushes
+ // them to the peer on a self-posted EvRam, mimicking asynchronous send.
+ class TSessionMockActor : public TActor<TSessionMockActor> {
+ std::map<TActorId, ui64> Subscribers; // actors to notify on disconnect (actor -> cookie)
+ TProxyMockActor* const Proxy;
+ std::deque<std::unique_ptr<IEventHandle>> Queue; // events awaiting flush
+
+ public:
+ const ui64 SessionId;
+
+ public:
+ TSessionMockActor(TProxyMockActor *proxy, ui64 sessionId)
+ : TActor(&TThis::StateFunc)
+ , Proxy(proxy)
+ , SessionId(sessionId)
+ {}
+
+ // Tear down the session: bounce queued events as undelivered, notify
+ // subscribers of the disconnect, detach from the proxy and die.
+ void Terminate() {
+ for (auto&& ev : std::exchange(Queue, {})) {
+ TActivationContext::Send(ev->ForwardOnNondelivery(TEvents::TEvUndelivered::Disconnected));
+ }
+ for (const auto& kv : Subscribers) {
+ Send(kv.first, new TEvInterconnect::TEvNodeDisconnected(Proxy->PeerNodeId), 0, kv.second);
+ }
+ Y_VERIFY(Proxy->Session == this);
+ Proxy->Session = nullptr;
+ PassAway();
+ }
+
+ // Queue an outgoing event; the first enqueue arms an EvRam self-event
+ // that will flush the whole batch.
+ void HandleForward(TAutoPtr<IEventHandle> ev) {
+ if (ev->Flags & IEventHandle::FlagSubscribeOnSession) {
+ Subscribe(ev->Sender, ev->Cookie);
+ }
+ if (Queue.empty()) {
+ TActivationContext::Send(new IEventHandle(EvRam, 0, SelfId(), {}, {}, 0));
+ }
+ Queue.emplace_back(ev.Release());
+ }
+
+ // Flush the queue to the peer, or terminate if this session is stale.
+ void HandleRam() {
+ if (SessionId != Proxy->State.GetValidSessionId()) {
+ Terminate();
+ } else {
+ Proxy->PeerInject(std::exchange(Queue, {}));
+ }
+ }
+
+ void Handle(TEvInterconnect::TEvConnectNode::TPtr ev) {
+ Subscribe(ev->Sender, ev->Cookie);
+ }
+
+ void Handle(TEvents::TEvSubscribe::TPtr ev) {
+ Subscribe(ev->Sender, ev->Cookie);
+ }
+
+ void Handle(TEvents::TEvUnsubscribe::TPtr ev) {
+ Subscribers.erase(ev->Sender);
+ }
+
+ void HandlePoison() {
+ Proxy->Disconnect();
+ }
+
+ STRICT_STFUNC(StateFunc,
+ fFunc(TEvInterconnect::EvForward, HandleForward)
+ hFunc(TEvInterconnect::TEvConnectNode, Handle)
+ hFunc(TEvents::TEvSubscribe, Handle)
+ hFunc(TEvents::TEvUnsubscribe, Handle)
+ cFunc(TEvents::TSystem::Poison, HandlePoison)
+ cFunc(EvRam, HandleRam)
+ )
+
+ private:
+ // Record the subscriber and immediately confirm the (always-up) link.
+ void Subscribe(const TActorId& actorId, ui64 cookie) {
+ Subscribers[actorId] = cookie;
+ Send(actorId, new TEvInterconnect::TEvNodeConnected(Proxy->PeerNodeId), 0, cookie);
+ }
+ };
+
+ friend class TSessionMockActor;
+
+ const ui32 NodeId; // node this proxy lives on
+ const ui32 PeerNodeId; // node this proxy connects to
+ TConnectionState& State; // shared with the peer's proxy
+ const TInterconnectProxyCommon::TPtr Common;
+ TSessionMockActor *Session = nullptr; // current session, if any
+
+ public:
+ TProxyMockActor(ui32 nodeId, ui32 peerNodeId, TConnectionState& state, TInterconnectProxyCommon::TPtr common)
+ : TActor(&TThis::StateFunc)
+ , NodeId(nodeId)
+ , PeerNodeId(peerNodeId)
+ , State(state)
+ , Common(std::move(common))
+ {}
+
+ void Registered(TActorSystem *as, const TActorId& parent) override {
+ TActor::Registered(as, parent);
+ State.Attach(NodeId, as, SelfId());
+ }
+
+ // Receive a batch injected by the peer: drop it if it belongs to a stale
+ // session, otherwise re-wrap each event for local delivery (applying the
+ // event filter, if configured).
+ void Handle(TEvInject::TPtr ev) {
+ auto *msg = ev->Get();
+ if (Session && Session->SessionId != msg->SenderSessionId) {
+ return; // drop messages from other sessions
+ }
+ if (auto *session = GetSession()) {
+ // NOTE(review): the loop variable shadows the outer `ev` handle —
+ // intentional here, but easy to misread.
+ for (auto&& ev : ev->Get()->Messages) {
+ auto fw = std::make_unique<IEventHandle>(
+ session->SelfId(),
+ ev->Type,
+ ev->Flags & ~IEventHandle::FlagForwardOnNondelivery,
+ ev->Recipient,
+ ev->Sender,
+ ev->ReleaseChainBuffer(),
+ ev->Cookie,
+ msg->OriginScopeId,
+ std::move(ev->TraceId)
+ );
+ if (!Common->EventFilter || Common->EventFilter->CheckIncomingEvent(*fw, Common->LocalScopeId)) {
+ TActivationContext::Send(fw.release());
+ }
+ }
+ }
+ }
+
+ void PassAway() override {
+ Disconnect();
+ TActor::PassAway();
+ }
+
+ // Return the current session, creating a fresh one (with the current
+ // valid session id) if none exists or the old one was stale.
+ TSessionMockActor *GetSession() {
+ CheckSession();
+ if (!Session) {
+ Session = new TSessionMockActor(this, State.GetValidSessionId());
+ RegisterWithSameMailbox(Session);
+ }
+ return Session;
+ }
+
+ // Deliver a session-bound event synchronously to the session actor.
+ void HandleSessionEvent(TAutoPtr<IEventHandle> ev) {
+ auto *session = GetSession();
+ InvokeOtherActor(*session, &TSessionMockActor::Receive, ev,
+ TActivationContext::ActorContextFor(session->SelfId()));
+ }
+
+ // Invalidate the shared session id (notifying the peer) and terminate
+ // our local session, if any.
+ void Disconnect() {
+ State.InvalidateSessionId(PeerNodeId);
+ if (Session) {
+ Session->Terminate();
+ }
+ }
+
+ void CheckSession() {
+ if (Session && Session->SessionId != State.GetValidSessionId()) {
+ Session->Terminate();
+ }
+ }
+
+ void PeerInject(std::deque<std::unique_ptr<IEventHandle>>&& messages) {
+ Y_VERIFY(Session);
+ return State.Inject(PeerNodeId, std::move(messages), Common->LocalScopeId, Session->SessionId);
+ }
+
+ STRICT_STFUNC(StateFunc,
+ cFunc(TEvents::TSystem::Poison, PassAway)
+ fFunc(TEvInterconnect::EvForward, HandleSessionEvent)
+ fFunc(TEvInterconnect::EvConnectNode, HandleSessionEvent)
+ fFunc(TEvents::TSystem::Subscribe, HandleSessionEvent)
+ fFunc(TEvents::TSystem::Unsubscribe, HandleSessionEvent)
+ cFunc(TEvInterconnect::EvDisconnect, Disconnect)
+ IgnoreFunc(TEvInterconnect::TEvClosePeerSocket)
+ IgnoreFunc(TEvInterconnect::TEvCloseInputSession)
+ cFunc(TEvInterconnect::EvPoisonSession, Disconnect)
+ hFunc(TEvInject, Handle)
+ cFunc(EvCheckSession, CheckSession)
+ )
+ };
+
+ // Connection state per node pair, keyed by packed (min, max) node ids.
+ std::unordered_map<ui64, TConnectionState> States;
+
+ public:
+ // Create a proxy for nodeId talking to peerNodeId; both directions of a
+ // pair share one TConnectionState (the key is order-independent).
+ IActor *CreateProxyMock(ui32 nodeId, ui32 peerNodeId, TInterconnectProxyCommon::TPtr common) {
+ Y_VERIFY(nodeId != peerNodeId);
+ Y_VERIFY(nodeId);
+ Y_VERIFY(peerNodeId);
+ const ui64 key = std::min(nodeId, peerNodeId) | ui64(std::max(nodeId, peerNodeId)) << 32;
+ auto it = States.try_emplace(key, key).first;
+ return new TProxyMockActor(nodeId, peerNodeId, it->second, std::move(common));
+ }
+ };
+
+ // Out-of-line special members: TImpl is an incomplete type in the header,
+ // so construction/destruction must live here.
+ TInterconnectMock::TInterconnectMock()
+ : Impl(std::make_unique<TImpl>())
+ {}
+
+ TInterconnectMock::~TInterconnectMock()
+ {}
+
+ // Forward to the implementation; see TImpl::CreateProxyMock.
+ IActor *TInterconnectMock::CreateProxyMock(ui32 nodeId, ui32 peerNodeId, TInterconnectProxyCommon::TPtr common) {
+ return Impl->CreateProxyMock(nodeId, peerNodeId, std::move(common));
+ }
+
+} // NActors
diff --git a/library/cpp/actors/interconnect/mock/ic_mock.h b/library/cpp/actors/interconnect/mock/ic_mock.h
new file mode 100644
index 0000000000..636bdc2b7f
--- /dev/null
+++ b/library/cpp/actors/interconnect/mock/ic_mock.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <library/cpp/actors/core/actor.h>
+
+#include <library/cpp/actors/interconnect/interconnect_common.h>
+
+namespace NActors {
+
+ // Test-only in-process replacement for the interconnect: produces proxy
+ // actors that shuttle events between local actor systems without sockets.
+ // Pimpl'd so the header stays light.
+ class TInterconnectMock {
+ class TImpl;
+ std::unique_ptr<TImpl> Impl;
+
+ public:
+ TInterconnectMock();
+ ~TInterconnectMock();
+ // Create a mock proxy on nodeId connected to peerNodeId; node ids must
+ // be non-zero and distinct.
+ IActor *CreateProxyMock(ui32 nodeId, ui32 peerNodeId, TInterconnectProxyCommon::TPtr common);
+ };
+
+} // NActors
diff --git a/library/cpp/actors/interconnect/mock/tsan.supp b/library/cpp/actors/interconnect/mock/tsan.supp
new file mode 100644
index 0000000000..19fd059419
--- /dev/null
+++ b/library/cpp/actors/interconnect/mock/tsan.supp
@@ -0,0 +1 @@
+deadlock:Attach
diff --git a/library/cpp/actors/interconnect/mock/ya.make b/library/cpp/actors/interconnect/mock/ya.make
new file mode 100644
index 0000000000..19a2834162
--- /dev/null
+++ b/library/cpp/actors/interconnect/mock/ya.make
@@ -0,0 +1,16 @@
+LIBRARY()
+
+OWNER(alexvru)
+
+SRCS(
+ ic_mock.cpp
+ ic_mock.h
+)
+
+SUPPRESSIONS(tsan.supp)
+
+PEERDIR(
+ library/cpp/actors/interconnect
+)
+
+END()
diff --git a/library/cpp/actors/interconnect/packet.cpp b/library/cpp/actors/interconnect/packet.cpp
new file mode 100644
index 0000000000..e2c289ed59
--- /dev/null
+++ b/library/cpp/actors/interconnect/packet.cpp
@@ -0,0 +1,32 @@
+#include "packet.h"
+
+#include <library/cpp/actors/core/probes.h>
+
+#include <util/system/datetime.h>
+
+LWTRACE_USING(ACTORLIB_PROVIDER);
+
+// Populate this holder from an event handle about to be sent over the wire:
+// copy the descriptor fields, take ownership of either the pre-serialized
+// buffer or the event object, and return the (expected) serialized size.
+ui32 TEventHolder::Fill(IEventHandle& ev) {
+ Serial = 0;
+ Descr.Type = ev.Type;
+ Descr.Flags = ev.Flags;
+ Descr.Recipient = ev.Recipient;
+ Descr.Sender = ev.Sender;
+ Descr.Cookie = ev.Cookie;
+ ev.TraceId.Serialize(&Descr.TraceId);
+ ForwardRecipient = ev.GetForwardOnNondeliveryRecipient();
+ EventActuallySerialized = 0;
+ Descr.Checksum = 0; // checksum is computed later, during actual serialization
+
+ if (ev.HasBuffer()) {
+ // event already serialized -- take the buffer chain as-is
+ Buffer = ev.ReleaseChainBuffer();
+ EventSerializedSize = Buffer->GetSize();
+ } else if (ev.HasEvent()) {
+ // unserialized event object -- size is computed, serialization deferred
+ Event.Reset(ev.ReleaseBase());
+ EventSerializedSize = Event->CalculateSerializedSize();
+ } else {
+ // header-only event (no payload at all)
+ EventSerializedSize = 0;
+ }
+
+ return EventSerializedSize;
+}
diff --git a/library/cpp/actors/interconnect/packet.h b/library/cpp/actors/interconnect/packet.h
new file mode 100644
index 0000000000..4ba50a2b5f
--- /dev/null
+++ b/library/cpp/actors/interconnect/packet.h
@@ -0,0 +1,324 @@
+#pragma once
+
+#include <library/cpp/actors/core/event_pb.h>
+#include <library/cpp/actors/core/event_load.h>
+#include <library/cpp/actors/core/events.h>
+#include <library/cpp/actors/core/actor.h>
+#include <library/cpp/containers/stack_vector/stack_vec.h>
+#include <library/cpp/actors/util/rope.h>
+#include <library/cpp/actors/prof/tag.h>
+#include <library/cpp/digest/crc32c/crc32c.h>
+#include <library/cpp/lwtrace/shuttle.h>
+#include <util/generic/string.h>
+#include <util/generic/list.h>
+
+#ifndef FORCE_EVENT_CHECKSUM
+#define FORCE_EVENT_CHECKSUM 0
+#endif
+
+using NActors::IEventBase;
+using NActors::IEventHandle;
+using NActors::TActorId;
+using NActors::TConstIoVec;
+using NActors::TEventSerializedData;
+
+// CRC32C extension that is safe to run under MemorySanitizer: the underlying
+// Crc32cExtend implementation may read in 16-byte chunks past the exact
+// [data, data+len) range, so under MSan we unpoison the surrounding
+// 16-byte-aligned window first to avoid false use-of-uninitialized reports.
+// Produces the same checksum as Crc32cExtend.
+Y_FORCE_INLINE ui32 Crc32cExtendMSanCompatible(ui32 checksum, const void *data, size_t len) {
+    if constexpr (NSan::MSanIsOn()) {
+        const char *begin = static_cast<const char*>(data);
+        const char *end = begin + len;
+        begin -= reinterpret_cast<uintptr_t>(begin) & 15; // round begin down to 16
+        end += -reinterpret_cast<uintptr_t>(end) & 15; // round end up to 16
+        NSan::Unpoison(begin, end - begin);
+    }
+    return Crc32cExtend(checksum, data, len);
+}
+
+// Parameters negotiated during handshake and shared by the session's
+// packet reader/writer code paths.
+struct TSessionParams {
+    bool Encryption = {}; // traffic is encrypted (checksums may be skipped, see Sign())
+    bool UseModernFrame = {}; // use TTcpPacketHeader_v2 framing instead of v1
+    bool AuthOnly = {};
+    TString AuthCN;
+    NActors::TScopeId PeerScopeId;
+};
+
+// Legacy (v1) frame header: the header itself is protected by HeaderCRC32 and
+// the payload by PayloadCRC32. Both checksums cover ranges computed relative
+// to the struct layout, so field order matters.
+struct TTcpPacketHeader_v1 {
+    ui32 HeaderCRC32; // CRC32C over all following header fields
+    ui32 PayloadCRC32;
+    ui64 Confirm; // highest serial confirmed to the peer
+    ui64 Serial;
+    ui64 DataSize;
+
+    // Verifies HeaderCRC32 over the header fields following it.
+    inline bool Check() const {
+        ui32 actual = Crc32cExtendMSanCompatible(0, &PayloadCRC32, sizeof(TTcpPacketHeader_v1) - sizeof(HeaderCRC32));
+        return actual == HeaderCRC32;
+    }
+
+    // Recomputes HeaderCRC32; must be called after any header field changes.
+    inline void Sign() {
+        HeaderCRC32 = Crc32cExtendMSanCompatible(0, &PayloadCRC32, sizeof(TTcpPacketHeader_v1) - sizeof(HeaderCRC32));
+    }
+
+    TString ToString() const {
+        return Sprintf("{Confirm# %" PRIu64 " Serial# %" PRIu64 " DataSize# %" PRIu64 "}", Confirm, Serial, DataSize);
+    }
+};
+
+// Modern (v2) frame header; packed to its exact wire size. Unlike v1 there is
+// no separate header checksum — a single Checksum covers the whole frame.
+#pragma pack(push, 1)
+struct TTcpPacketHeader_v2 {
+    ui64 Confirm; // highest serial confirmed to the peer
+    ui64 Serial;
+    ui32 Checksum; // for the whole frame
+    ui16 PayloadLength;
+};
+#pragma pack(pop)
+
+// In-memory frame buffer overlaying both header versions; the active member
+// is chosen by TSessionParams::UseModernFrame. PacketDataLen is sized so that
+// the larger header plus data fits the 8 KiB-minus-overhead budget.
+union TTcpPacketBuf {
+    // bits multiplexed into the serial/confirm field for ping/clock exchange
+    static constexpr ui64 PingRequestMask = 0x8000000000000000ULL;
+    static constexpr ui64 PingResponseMask = 0x4000000000000000ULL;
+    static constexpr ui64 ClockMask = 0x2000000000000000ULL;
+
+    static constexpr size_t PacketDataLen = 4096 * 2 - 96 - Max(sizeof(TTcpPacketHeader_v1), sizeof(TTcpPacketHeader_v2));
+    struct {
+        TTcpPacketHeader_v1 Header;
+        char Data[PacketDataLen];
+    } v1;
+    struct {
+        TTcpPacketHeader_v2 Header;
+        char Data[PacketDataLen];
+    } v2;
+};
+
+// Per-event wire descriptor; packed to its exact serialized size.
+#pragma pack(push, 1)
+struct TEventDescr {
+    ui32 Type;
+    ui32 Flags;
+    TActorId Recipient;
+    TActorId Sender;
+    ui64 Cookie;
+    // wilson trace id is stored as a serialized entity to avoid using complex object with prohibited copy ctor
+    NWilson::TTraceId::TSerializedTraceId TraceId;
+    ui32 Checksum; // CRC32C of the serialized event payload (see UpdateChecksum)
+};
+#pragma pack(pop)
+
+// Holds one outgoing event while it travels through the interconnect send
+// queue: its wire descriptor plus exactly one of {unserialized Event,
+// serialized Buffer} as the payload.
+struct TEventHolder : TNonCopyable {
+    TEventDescr Descr;
+    TActorId ForwardRecipient; // where to forward on non-delivery; may be null actor id
+    THolder<IEventBase> Event; // payload, if not yet serialized
+    TIntrusivePtr<TEventSerializedData> Buffer; // payload, if already serialized
+    ui64 Serial;
+    ui32 EventSerializedSize; // expected size (estimate when Event is set)
+    ui32 EventActuallySerialized; // bytes serialized so far
+    mutable NLWTrace::TOrbit Orbit;
+
+    // Initializes the holder from an event handle; see packet.cpp.
+    ui32 Fill(IEventHandle& ev);
+
+    void InitChecksum() {
+        Descr.Checksum = 0;
+    }
+
+    // Extends the payload checksum; a no-op for modern frames unless checksums
+    // are force-enabled at compile time (modern frames checksum whole packets).
+    void UpdateChecksum(const TSessionParams& params, const void *buffer, size_t len) {
+        if (FORCE_EVENT_CHECKSUM || !params.UseModernFrame) {
+            Descr.Checksum = Crc32cExtendMSanCompatible(Descr.Checksum, buffer, len);
+        }
+    }
+
+    // Reconstructs an event handle from the stored payload and routes it back
+    // through the nondelivery path with a Disconnected undelivery reason.
+    void ForwardOnNondelivery(bool unsure) {
+        TEventDescr& d = Descr;
+        const TActorId& r = d.Recipient;
+        const TActorId& s = d.Sender;
+        const TActorId *f = ForwardRecipient ? &ForwardRecipient : nullptr;
+        auto ev = Event
+            ? std::make_unique<IEventHandle>(r, s, Event.Release(), d.Flags, d.Cookie, f, NWilson::TTraceId(d.TraceId))
+            : std::make_unique<IEventHandle>(d.Type, d.Flags, r, s, std::move(Buffer), d.Cookie, f, NWilson::TTraceId(d.TraceId));
+        NActors::TActivationContext::Send(ev->ForwardOnNondelivery(NActors::TEvents::TEvUndelivered::Disconnected, unsure));
+    }
+
+    // Releases payload and tracing state so the holder can be pooled/reused.
+    void Clear() {
+        Event.Reset();
+        Buffer.Reset();
+        Orbit.Reset();
+    }
+};
+
+namespace NActors {
+ class TEventOutputChannel;
+}
+
+// One outgoing TCP frame under construction and transmission. Maintains the
+// frame header (v1 or v2 depending on session params), an inline data area
+// inside Packet, and an iovec list (Bufs) that may also reference external
+// buffers. BufferIndex/FirstBufferOffset track partial-write progress.
+struct TTcpPacketOutTask : TNonCopyable {
+    const TSessionParams& Params;
+    TTcpPacketBuf Packet; // header + inline data area
+    size_t DataSize; // payload bytes accumulated (excludes header)
+    TStackVec<TConstIoVec, 32> Bufs; // iovecs to write; Bufs[0] starts with the header
+    size_t BufferIndex; // first not-fully-written iovec
+    size_t FirstBufferOffset; // bytes of Bufs[BufferIndex] already written
+    bool TriedWriting; // a write was attempted since last ResetBufs()
+    char *FreeArea; // next free byte of the inline data area
+    char *End; // one past the end of the inline data area
+    mutable NLWTrace::TOrbit Orbit;
+
+public:
+    TTcpPacketOutTask(const TSessionParams& params)
+        : Params(params)
+    {
+        Reuse();
+    }
+
+    // Invokes callback on whichever header version this session uses.
+    template<typename T>
+    auto ApplyToHeader(T&& callback) {
+        return Params.UseModernFrame ? callback(Packet.v2.Header) : callback(Packet.v1.Header);
+    }
+
+    template<typename T>
+    auto ApplyToHeader(T&& callback) const {
+        return Params.UseModernFrame ? callback(Packet.v2.Header) : callback(Packet.v1.Header);
+    }
+
+    // True while nothing of this packet has been written to the socket yet.
+    bool IsAtBegin() const {
+        return !BufferIndex && !FirstBufferOffset && !TriedWriting;
+    }
+
+    void MarkTriedWriting() {
+        TriedWriting = true;
+    }
+
+    // Resets the task to an empty packet: only the header iovec remains.
+    void Reuse() {
+        DataSize = 0;
+        ApplyToHeader([this](auto& header) { Bufs.assign(1, {&header, sizeof(header)}); });
+        BufferIndex = 0;
+        FirstBufferOffset = 0;
+        TriedWriting = false;
+        FreeArea = Params.UseModernFrame ? Packet.v2.Data : Packet.v1.Data;
+        End = FreeArea + TTcpPacketBuf::PacketDataLen;
+        Orbit.Reset();
+    }
+
+    bool IsEmpty() const {
+        return !DataSize;
+    }
+
+    void SetMetadata(ui64 serial, ui64 confirm) {
+        ApplyToHeader([&](auto& header) {
+            header.Serial = serial;
+            header.Confirm = confirm;
+        });
+    }
+
+    // Refreshes the Confirm field in an already-signed v1 packet that has not
+    // started transmission; requires re-signing only the header.
+    void UpdateConfirmIfPossible(ui64 confirm) {
+        // we don't want to recalculate whole packet checksum for single confirmation update on v2
+        if (!Params.UseModernFrame && IsAtBegin() && confirm != Packet.v1.Header.Confirm) {
+            Packet.v1.Header.Confirm = confirm;
+            Packet.v1.Header.Sign();
+        }
+    }
+
+    size_t GetDataSize() const { return DataSize; }
+
+    ui64 GetSerial() const {
+        return ApplyToHeader([](auto& header) { return header.Serial; });
+    }
+
+    // True when the peer has acknowledged this packet (or there is nothing to ack).
+    bool Confirmed(ui64 confirm) const {
+        return ApplyToHeader([&](auto& header) { return IsEmpty() || header.Serial <= confirm; });
+    }
+
+    void *GetFreeArea() {
+        return FreeArea;
+    }
+
+    size_t GetVirtualFreeAmount() const {
+        return TTcpPacketBuf::PacketDataLen - DataSize;
+    }
+
+    // Registers a buffer as part of the payload. Adjacent buffers are merged
+    // into one iovec; buffers inside the inline data area advance FreeArea.
+    void AppendBuf(const void *buf, size_t size) {
+        DataSize += size;
+        Y_VERIFY_DEBUG(DataSize <= TTcpPacketBuf::PacketDataLen, "DataSize# %zu AppendBuf buf# %p size# %zu"
+            " FreeArea# %p End# %p", DataSize, buf, size, FreeArea, End);
+
+        if (Bufs && static_cast<const char*>(Bufs.back().Data) + Bufs.back().Size == buf) {
+            Bufs.back().Size += size; // contiguous with previous iovec — extend it
+        } else {
+            Bufs.push_back({buf, size});
+        }
+
+        if (buf >= FreeArea && buf < End) {
+            Y_VERIFY_DEBUG(buf == FreeArea); // inline appends must be sequential
+            FreeArea = const_cast<char*>(static_cast<const char*>(buf)) + size;
+            Y_VERIFY_DEBUG(FreeArea <= End);
+        }
+    }
+
+    // Rolls back the last `size` bytes appended to the inline data area.
+    void Undo(size_t size) {
+        Y_VERIFY(Bufs);
+        auto& buf = Bufs.back();
+        Y_VERIFY(buf.Data == FreeArea - buf.Size); // only inline tail can be undone
+        buf.Size -= size;
+        if (!buf.Size) {
+            Bufs.pop_back();
+        }
+        FreeArea -= size;
+        DataSize -= size;
+    }
+
+    // Consumes `amount` written bytes from the iovec list; returns true when
+    // the whole packet has been written, false if data remains (amount is
+    // decremented by what was consumed).
+    bool DropBufs(size_t& amount) {
+        while (BufferIndex != Bufs.size()) {
+            TConstIoVec& item = Bufs[BufferIndex];
+            // calculate number of bytes to the end in current buffer
+            const size_t remain = item.Size - FirstBufferOffset;
+            if (amount >= remain) {
+                // vector item completely fits into the received amount, drop it out and switch to next buffer
+                amount -= remain;
+                ++BufferIndex;
+                FirstBufferOffset = 0;
+            } else {
+                // adjust first buffer by "amount" bytes forward and reset amount to zero
+                FirstBufferOffset += amount;
+                amount = 0;
+                // return false meaning that we have some more data to send
+                return false;
+            }
+        }
+        return true;
+    }
+
+    // Rewinds transmission progress so the whole packet can be resent.
+    void ResetBufs() {
+        BufferIndex = FirstBufferOffset = 0;
+        TriedWriting = false;
+    }
+
+    // Appends the not-yet-written portion of this packet to an external iovec
+    // vector, up to `max` total entries.
+    template <typename TVectorType>
+    void AppendToIoVector(TVectorType& vector, size_t max) {
+        for (size_t k = BufferIndex, offset = FirstBufferOffset; k != Bufs.size() && vector.size() < max; ++k, offset = 0) {
+            TConstIoVec v = Bufs[k];
+            v.Data = static_cast<const char*>(v.Data) + offset;
+            v.Size -= offset;
+            vector.push_back(v);
+        }
+    }
+
+    // Finalizes checksums/lengths before sending. v2: single whole-frame CRC
+    // (skipped when encryption already protects integrity). v1: payload CRC
+    // over everything after the header, then header self-sign.
+    void Sign() {
+        if (Params.UseModernFrame) {
+            Packet.v2.Header.Checksum = 0;
+            Packet.v2.Header.PayloadLength = DataSize;
+            if (!Params.Encryption) {
+                ui32 sum = 0;
+                for (const auto& item : Bufs) {
+                    sum = Crc32cExtendMSanCompatible(sum, item.Data, item.Size);
+                }
+                Packet.v2.Header.Checksum = sum;
+            }
+        } else {
+            Y_VERIFY(!Bufs.empty());
+            auto it = Bufs.begin();
+            static constexpr size_t headerLen = sizeof(TTcpPacketHeader_v1);
+            // first iovec starts with the header; checksum only its data part
+            Y_VERIFY(it->Data == &Packet.v1.Header && it->Size >= headerLen);
+            ui32 sum = Crc32cExtendMSanCompatible(0, Packet.v1.Data, it->Size - headerLen);
+            while (++it != Bufs.end()) {
+                sum = Crc32cExtendMSanCompatible(sum, it->Data, it->Size);
+            }
+
+            Packet.v1.Header.PayloadCRC32 = sum;
+            Packet.v1.Header.DataSize = DataSize;
+            Packet.v1.Header.Sign();
+        }
+    }
+};
diff --git a/library/cpp/actors/interconnect/poller.h b/library/cpp/actors/interconnect/poller.h
new file mode 100644
index 0000000000..ff7979369f
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <functional>
+#include <library/cpp/actors/core/events.h>
+
+namespace NActors {
+    // Refcounted wrapper over an OS descriptor so poller implementations can
+    // share ownership with the actors using the socket.
+    class TSharedDescriptor: public TThrRefBase {
+    public:
+        virtual int GetDescriptor() = 0;
+    };
+
+    // A poll callback receives the ready descriptor and may return a
+    // continuation to be executed afterwards (empty TDelegate means none).
+    using TDelegate = std::function<void()>;
+    using TFDDelegate = std::function<TDelegate(const TIntrusivePtr<TSharedDescriptor>&)>;
+
+    // Minimal poller interface: arm a one-shot read or write watch on a
+    // descriptor with a callback to invoke on readiness.
+    class IPoller: public TThrRefBase {
+    public:
+        virtual ~IPoller() = default;
+
+        virtual void StartRead(const TIntrusivePtr<TSharedDescriptor>& s, TFDDelegate&& operation) = 0;
+        virtual void StartWrite(const TIntrusivePtr<TSharedDescriptor>& s, TFDDelegate&& operation) = 0;
+    };
+
+}
diff --git a/library/cpp/actors/interconnect/poller_actor.cpp b/library/cpp/actors/interconnect/poller_actor.cpp
new file mode 100644
index 0000000000..e75cbcaef4
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_actor.cpp
@@ -0,0 +1,294 @@
+#include "poller_actor.h"
+#include "interconnect_common.h"
+
+#include <library/cpp/actors/core/actor_bootstrapped.h>
+#include <library/cpp/actors/core/actorsystem.h>
+#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/actors/core/log.h>
+#include <library/cpp/actors/core/probes.h>
+#include <library/cpp/actors/protos/services_common.pb.h>
+#include <library/cpp/actors/util/funnel_queue.h>
+
+#include <util/generic/intrlist.h>
+#include <util/system/thread.h>
+#include <util/system/event.h>
+#include <util/system/pipe.h>
+
+#include <variant>
+
+namespace NActors {
+
+ LWTRACE_USING(ACTORLIB_PROVIDER);
+
+ namespace {
+ int LastSocketError() {
+#if defined(_win_)
+ return WSAGetLastError();
+#else
+ return errno;
+#endif
+ }
+ }
+
+// Per-socket registration record shared between the poller thread and the
+// actors interested in readiness notifications. Flags holds the pending
+// interest bits and is mutated concurrently, hence atomic.
+struct TSocketRecord : TThrRefBase {
+    const TIntrusivePtr<TSharedDescriptor> Socket;
+    const TActorId ReadActorId; // notified on read readiness
+    const TActorId WriteActorId; // notified on write readiness
+    std::atomic_uint32_t Flags = 0;
+
+    TSocketRecord(TEvPollerRegister& ev)
+        : Socket(std::move(ev.Socket))
+        , ReadActorId(ev.ReadActorId)
+        , WriteActorId(ev.WriteActorId)
+    {}
+};
+
+// CRTP base for the platform-specific poller threads (epoll/kqueue/select).
+// Provides the control channel: a self-pipe plus a funnel queue of
+// synchronous operations (unregister socket, wakeup, exit), each of which is
+// executed on the poller thread while the caller blocks on an event.
+// TDerived must provide ProcessEventsInLoop() and UnregisterSocketInLoop().
+template<typename TDerived>
+class TPollerThreadBase : public ISimpleThread {
+protected:
+    struct TPollerExitThread {}; // issued when we need to terminate the poller thread
+
+    struct TPollerWakeup {}; // no-op; used just to kick the thread out of its wait
+
+    struct TPollerUnregisterSocket {
+        TIntrusivePtr<TSharedDescriptor> Socket;
+
+        TPollerUnregisterSocket(TIntrusivePtr<TSharedDescriptor> socket)
+            : Socket(std::move(socket))
+        {}
+    };
+
+    using TPollerSyncOperation = std::variant<TPollerExitThread, TPollerWakeup, TPollerUnregisterSocket>;
+
+    // Operation plus a completion event the issuing thread waits on.
+    struct TPollerSyncOperationWrapper {
+        TPollerSyncOperation Operation;
+        TManualEvent Event;
+
+        TPollerSyncOperationWrapper(TPollerSyncOperation&& operation)
+            : Operation(std::move(operation))
+        {}
+
+        void Wait() {
+            Event.WaitI();
+        }
+
+        void SignalDone() {
+            Event.Signal();
+        }
+    };
+
+    TActorSystem *ActorSystem;
+    TPipeHandle ReadEnd, WriteEnd; // pipe for sync event processor
+    TFunnelQueue<TPollerSyncOperationWrapper*> SyncOperationsQ; // operation queue
+
+public:
+    TPollerThreadBase(TActorSystem *actorSystem)
+        : ActorSystem(actorSystem)
+    {
+        // create a pipe for notifications
+        try {
+            TPipeHandle::Pipe(ReadEnd, WriteEnd, CloseOnExec);
+        } catch (const TFileError& err) {
+            Y_FAIL("failed to create pipe");
+        }
+
+        // switch the read/write ends to nonblocking mode
+        SetNonBlock(ReadEnd);
+        SetNonBlock(WriteEnd);
+    }
+
+    // Synchronously removes the socket from the poller (runs on poller thread).
+    void UnregisterSocket(const TIntrusivePtr<TSocketRecord>& record) {
+        ExecuteSyncOperation(TPollerUnregisterSocket(record->Socket));
+    }
+
+protected:
+    // Sends TEvPollerReady to the interested actor(s); if the same actor
+    // handles both directions it gets a single combined event.
+    void Notify(TSocketRecord *record, bool read, bool write) {
+        auto issue = [&](const TActorId& recipient) {
+            ActorSystem->Send(new IEventHandle(recipient, {}, new TEvPollerReady(record->Socket, read, write)));
+        };
+        if (read && record->ReadActorId) {
+            issue(record->ReadActorId);
+            if (write && record->WriteActorId && record->WriteActorId != record->ReadActorId) {
+                issue(record->WriteActorId);
+            }
+        } else if (write && record->WriteActorId) {
+            issue(record->WriteActorId);
+        }
+    }
+
+    void Stop() {
+        // signal poller thread to stop and wait for the thread
+        ExecuteSyncOperation(TPollerExitThread());
+        ISimpleThread::Join();
+    }
+
+    // Enqueues an operation for the poller thread and blocks until it has
+    // been executed there; only the first enqueuer writes to the pipe.
+    void ExecuteSyncOperation(TPollerSyncOperation&& op) {
+        TPollerSyncOperationWrapper wrapper(std::move(op));
+        if (SyncOperationsQ.Push(&wrapper)) {
+            // this was the first entry, so we push notification through the pipe
+            for (;;) {
+                char buffer = '\x00';
+                ssize_t nwritten = WriteEnd.Write(&buffer, sizeof(buffer));
+                if (nwritten < 0) {
+                    const int err = LastSocketError();
+                    if (err == EINTR) {
+                        continue;
+                    } else {
+                        Y_FAIL("WriteEnd.Write() failed with %s", strerror(err));
+                    }
+                } else {
+                    Y_VERIFY(nwritten);
+                    break;
+                }
+            }
+        }
+        // wait for operation to complete
+        wrapper.Wait();
+    }
+
+    // Drains all pending bytes from the notification pipe; the (implicitly
+    // bool-converted) return is true iff anything was read.
+    bool DrainReadEnd() {
+        size_t totalRead = 0;
+        char buffer[4096];
+        for (;;) {
+            ssize_t n = ReadEnd.Read(buffer, sizeof(buffer));
+            if (n < 0) {
+                const int error = LastSocketError();
+                if (error == EINTR) {
+                    continue;
+                } else if (error == EAGAIN || error == EWOULDBLOCK) {
+                    break;
+                } else {
+                    // NOTE(review): uses strerror(errno) rather than the saved
+                    // `error`; equivalent on POSIX here, but inconsistent with
+                    // the WriteEnd path above — confirm on Windows builds.
+                    Y_FAIL("read() failed with %s", strerror(errno));
+                }
+            } else {
+                Y_VERIFY(n);
+                totalRead += n;
+            }
+        }
+        return totalRead;
+    }
+
+    // Executes all queued sync operations, if any; returns false when the
+    // thread was asked to exit.
+    bool ProcessSyncOpQueue() {
+        if (DrainReadEnd()) {
+            Y_VERIFY(!SyncOperationsQ.IsEmpty());
+            do {
+                TPollerSyncOperationWrapper *op = SyncOperationsQ.Top();
+                if (auto *unregister = std::get_if<TPollerUnregisterSocket>(&op->Operation)) {
+                    static_cast<TDerived&>(*this).UnregisterSocketInLoop(unregister->Socket);
+                    op->SignalDone();
+                } else if (std::get_if<TPollerExitThread>(&op->Operation)) {
+                    op->SignalDone();
+                    return false; // terminate the thread
+                } else if (std::get_if<TPollerWakeup>(&op->Operation)) {
+                    op->SignalDone();
+                } else {
+                    Y_FAIL();
+                }
+            } while (SyncOperationsQ.Pop());
+        }
+        return true;
+    }
+
+    // Main loop: alternate between sync operations and the derived class's
+    // platform event processing until exit is requested.
+    void *ThreadProc() override {
+        SetCurrentThreadName("network poller");
+        while (ProcessSyncOpQueue()) {
+            static_cast<TDerived&>(*this).ProcessEventsInLoop();
+        }
+        return nullptr;
+    }
+};
+
+} // namespace NActors
+
+#if defined(_linux_)
+# include "poller_actor_linux.h"
+#elif defined(_darwin_)
+# include "poller_actor_darwin.h"
+#elif defined(_win_)
+# include "poller_actor_win.h"
+#else
+# error "Unsupported platform"
+#endif
+
+namespace NActors {
+
+// Token implementation: registers the socket record with the poller thread on
+// construction and unregisters on destruction. Holds the thread weakly so a
+// token outliving the poller degrades to a no-op instead of a dangling call.
+class TPollerToken::TImpl {
+    std::weak_ptr<TPollerThread> Thread;
+    TIntrusivePtr<TSocketRecord> Record; // valid only when Thread is held locked
+
+public:
+    TImpl(std::shared_ptr<TPollerThread> thread, TIntrusivePtr<TSocketRecord> record)
+        : Thread(thread)
+        , Record(std::move(record))
+    {
+        thread->RegisterSocket(Record);
+    }
+
+    ~TImpl() {
+        if (auto thread = Thread.lock()) {
+            thread->UnregisterSocket(Record);
+        }
+    }
+
+    // Re-arms poll interest after EAGAIN on read and/or write.
+    void Request(bool read, bool write) {
+        if (auto thread = Thread.lock()) {
+            thread->Request(Record, read, write);
+        }
+    }
+
+    const TIntrusivePtr<TSharedDescriptor>& Socket() const {
+        return Record->Socket;
+    }
+};
+
+// Actor front-end for the poller thread: owns the thread and answers
+// TEvPollerRegister requests with TEvPollerRegisterResult carrying a token.
+class TPollerActor: public TActorBootstrapped<TPollerActor> {
+    // poller thread
+    std::shared_ptr<TPollerThread> PollerThread;
+
+public:
+    static constexpr IActor::EActivityType ActorActivityType() {
+        return IActor::INTERCONNECT_POLLER;
+    }
+
+    void Bootstrap() {
+        PollerThread = std::make_shared<TPollerThread>(TlsActivationContext->ExecutorThread.ActorSystem);
+        Become(&TPollerActor::StateFunc);
+    }
+
+    STRICT_STFUNC(StateFunc,
+        hFunc(TEvPollerRegister, Handle);
+        cFunc(TEvents::TSystem::Poison, PassAway);
+    )
+
+    // Registers the socket and replies to each distinct interested actor with
+    // the shared token (one reply if read/write actor ids coincide).
+    void Handle(TEvPollerRegister::TPtr& ev) {
+        auto *msg = ev->Get();
+        auto impl = std::make_unique<TPollerToken::TImpl>(PollerThread, MakeIntrusive<TSocketRecord>(*msg));
+        auto socket = impl->Socket();
+        TPollerToken::TPtr token(new TPollerToken(std::move(impl)));
+        if (msg->ReadActorId && msg->WriteActorId && msg->WriteActorId != msg->ReadActorId) {
+            Send(msg->ReadActorId, new TEvPollerRegisterResult(socket, token));
+            Send(msg->WriteActorId, new TEvPollerRegisterResult(socket, std::move(token)));
+        } else if (msg->ReadActorId) {
+            Send(msg->ReadActorId, new TEvPollerRegisterResult(socket, std::move(token)));
+        } else if (msg->WriteActorId) {
+            Send(msg->WriteActorId, new TEvPollerRegisterResult(socket, std::move(token)));
+        }
+    }
+};
+
+// Out-of-line definitions: the token merely forwards to its pimpl; the dtor
+// is defined here where TImpl is complete.
+TPollerToken::TPollerToken(std::unique_ptr<TImpl> impl)
+    : Impl(std::move(impl))
+{}
+
+TPollerToken::~TPollerToken()
+{}
+
+void TPollerToken::Request(bool read, bool write) {
+    Impl->Request(read, write);
+}
+
+// Factory used when wiring the actor system.
+IActor* CreatePollerActor() {
+    return new TPollerActor;
+}
+
+}
diff --git a/library/cpp/actors/interconnect/poller_actor.h b/library/cpp/actors/interconnect/poller_actor.h
new file mode 100644
index 0000000000..f927b82089
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_actor.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include "events_local.h"
+#include "poller.h"
+#include <library/cpp/actors/core/actor.h>
+
+namespace NActors {
+    // Request to watch a socket; answered with TEvPollerRegisterResult.
+    struct TEvPollerRegister : TEventLocal<TEvPollerRegister, ui32(ENetwork::EvPollerRegister)> {
+        const TIntrusivePtr<TSharedDescriptor> Socket; // socket to watch for
+        const TActorId ReadActorId; // actor id to notify about read availability
+        const TActorId WriteActorId; // actor id to notify about write availability; may be the same as the ReadActorId
+
+        TEvPollerRegister(TIntrusivePtr<TSharedDescriptor> socket, const TActorId& readActorId, const TActorId& writeActorId)
+            : Socket(std::move(socket))
+            , ReadActorId(readActorId)
+            , WriteActorId(writeActorId)
+        {}
+    };
+
+    // poller token is sent in response to TEvPollerRegister; it allows requesting poll when read/write returns EAGAIN
+    class TPollerToken : public TThrRefBase {
+        class TImpl;
+        std::unique_ptr<TImpl> Impl;
+
+        friend class TPollerActor;
+        TPollerToken(std::unique_ptr<TImpl> impl);
+
+    public:
+        ~TPollerToken();
+        void Request(bool read, bool write);
+
+        using TPtr = TIntrusivePtr<TPollerToken>;
+    };
+
+    // Reply carrying the socket and its poller token.
+    struct TEvPollerRegisterResult : TEventLocal<TEvPollerRegisterResult, ui32(ENetwork::EvPollerRegisterResult)> {
+        TIntrusivePtr<TSharedDescriptor> Socket;
+        TPollerToken::TPtr PollerToken;
+
+        TEvPollerRegisterResult(TIntrusivePtr<TSharedDescriptor> socket, TPollerToken::TPtr pollerToken)
+            : Socket(std::move(socket))
+            , PollerToken(std::move(pollerToken))
+        {}
+    };
+
+    // Readiness notification: Read/Write tell which directions are ready.
+    struct TEvPollerReady : TEventLocal<TEvPollerReady, ui32(ENetwork::EvPollerReady)> {
+        TIntrusivePtr<TSharedDescriptor> Socket;
+        const bool Read, Write;
+
+        TEvPollerReady(TIntrusivePtr<TSharedDescriptor> socket, bool read, bool write)
+            : Socket(std::move(socket))
+            , Read(read)
+            , Write(write)
+        {}
+    };
+
+    IActor* CreatePollerActor();
+
+    // Well-known service actor id of the poller (fixed 12-byte name).
+    inline TActorId MakePollerActorId() {
+        char x[12] = {'I', 'C', 'P', 'o', 'l', 'l', 'e', 'r', '\xDE', '\xAD', '\xBE', '\xEF'};
+        return TActorId(0, TStringBuf(std::begin(x), std::end(x)));
+    }
+
+}
diff --git a/library/cpp/actors/interconnect/poller_actor_darwin.h b/library/cpp/actors/interconnect/poller_actor_darwin.h
new file mode 100644
index 0000000000..4cb0a58f8d
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_actor_darwin.h
@@ -0,0 +1,95 @@
+#pragma once
+
+#include <sys/event.h>
+
+namespace NActors {
+
+    // macOS/BSD poller backed by kqueue. Sockets are registered with EV_CLEAR,
+    // i.e. edge-triggered, so Request() needs no re-arming.
+    class TKqueueThread : public TPollerThreadBase<TKqueueThread> {
+        // KQueue file descriptor
+        int KqDescriptor;
+
+        // kevent() wrapper that retries on EINTR and aborts on real errors.
+        void SafeKevent(const struct kevent* ev, int size) {
+            int rc;
+            do {
+                rc = kevent(KqDescriptor, ev, size, nullptr, 0, nullptr);
+            } while (rc == -1 && errno == EINTR);
+            Y_VERIFY(rc != -1, "kevent() failed with %s", strerror(errno));
+        }
+
+    public:
+        TKqueueThread(TActorSystem *actorSystem)
+            : TPollerThreadBase(actorSystem)
+        {
+            // create kqueue
+            KqDescriptor = kqueue();
+            Y_VERIFY(KqDescriptor != -1, "kqueue() failed with %s", strerror(errno));
+
+            // set close-on-exit flag
+            {
+                int flags = fcntl(KqDescriptor, F_GETFD);
+                Y_VERIFY(flags >= 0, "fcntl(F_GETFD) failed with %s", strerror(errno));
+                int rc = fcntl(KqDescriptor, F_SETFD, flags | FD_CLOEXEC);
+                Y_VERIFY(rc != -1, "fcntl(F_SETFD, +FD_CLOEXEC) failed with %s", strerror(errno));
+            }
+
+            // register pipe's read end in poller
+            struct kevent ev;
+            EV_SET(&ev, (int)ReadEnd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, nullptr);
+            SafeKevent(&ev, 1);
+
+            ISimpleThread::Start(); // start poller thread
+        }
+
+        ~TKqueueThread() {
+            Stop();
+            close(KqDescriptor);
+        }
+
+        // Waits for kqueue events and notifies interested actors; events with
+        // null udata belong to the control pipe and are handled by the base loop.
+        void ProcessEventsInLoop() {
+            std::array<struct kevent, 256> events;
+
+            int numReady = kevent(KqDescriptor, nullptr, 0, events.data(), events.size(), nullptr);
+            if (numReady == -1) {
+                if (errno == EINTR) {
+                    return;
+                } else {
+                    Y_FAIL("kevent() failed with %s", strerror(errno));
+                }
+            }
+
+            for (int i = 0; i < numReady; ++i) {
+                const struct kevent& ev = events[i];
+                if (ev.udata) {
+                    TSocketRecord *it = static_cast<TSocketRecord*>(ev.udata);
+                    // EOF/error wakes both directions so actors can observe it
+                    const bool error = ev.flags & (EV_EOF | EV_ERROR);
+                    const bool read = error || ev.filter == EVFILT_READ;
+                    const bool write = error || ev.filter == EVFILT_WRITE;
+                    Notify(it, read, write);
+                }
+            }
+        }
+
+        // Removes both read and write filters for the socket (poller thread only).
+        void UnregisterSocketInLoop(const TIntrusivePtr<TSharedDescriptor>& socket) {
+            struct kevent ev[2];
+            const int fd = socket->GetDescriptor();
+            EV_SET(&ev[0], fd, EVFILT_READ, EV_DELETE, 0, 0, nullptr);
+            EV_SET(&ev[1], fd, EVFILT_WRITE, EV_DELETE, 0, 0, nullptr);
+            SafeKevent(ev, 2);
+        }
+
+        // Adds edge-triggered (EV_CLEAR) read and write filters for the socket.
+        void RegisterSocket(const TIntrusivePtr<TSocketRecord>& record) {
+            int flags = EV_ADD | EV_CLEAR | EV_ENABLE;
+            struct kevent ev[2];
+            const int fd = record->Socket->GetDescriptor();
+            EV_SET(&ev[0], fd, EVFILT_READ, flags, 0, 0, record.Get());
+            EV_SET(&ev[1], fd, EVFILT_WRITE, flags, 0, 0, record.Get());
+            SafeKevent(ev, 2);
+        }
+
+        void Request(const TIntrusivePtr<TSocketRecord>& /*socket*/, bool /*read*/, bool /*write*/)
+        {} // no special processing here as we use kqueue in edge-triggered mode
+    };
+
+    using TPollerThread = TKqueueThread;
+
+}
diff --git a/library/cpp/actors/interconnect/poller_actor_linux.h b/library/cpp/actors/interconnect/poller_actor_linux.h
new file mode 100644
index 0000000000..dd4f7c0124
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_actor_linux.h
@@ -0,0 +1,114 @@
+#pragma once
+
+#include <sys/epoll.h>
+
+namespace NActors {
+
+    // Linux poller backed by epoll in one-shot mode: each socket is armed with
+    // EPOLLONESHOT and the desired direction bits are accumulated in
+    // TSocketRecord::Flags; Request() re-arms, the loop clears fired bits.
+    class TEpollThread : public TPollerThreadBase<TEpollThread> {
+        // epoll file descriptor
+        int EpollDescriptor;
+
+    public:
+        TEpollThread(TActorSystem *actorSystem)
+            : TPollerThreadBase(actorSystem)
+        {
+            EpollDescriptor = epoll_create1(EPOLL_CLOEXEC);
+            Y_VERIFY(EpollDescriptor != -1, "epoll_create1() failed with %s", strerror(errno));
+
+            // control pipe is registered with data.ptr == nullptr to tell it apart
+            epoll_event event;
+            event.data.ptr = nullptr;
+            event.events = EPOLLIN;
+            if (epoll_ctl(EpollDescriptor, EPOLL_CTL_ADD, ReadEnd, &event) == -1) {
+                Y_FAIL("epoll_ctl(EPOLL_CTL_ADD) failed with %s", strerror(errno));
+            }
+
+            ISimpleThread::Start(); // start poller thread
+        }
+
+        ~TEpollThread() {
+            Stop();
+            close(EpollDescriptor);
+        }
+
+        // Waits for events, clears the fired interest bits, re-arms sockets that
+        // still have pending interest, and notifies the interested actors.
+        void ProcessEventsInLoop() {
+            // preallocated array for events
+            std::array<epoll_event, 256> events;
+
+            // wait indefinitely for event to arrive
+            LWPROBE(EpollStartWaitIn);
+            int numReady = epoll_wait(EpollDescriptor, events.data(), events.size(), -1);
+            LWPROBE(EpollFinishWaitIn, numReady);
+
+            // check return status for any errors
+            if (numReady == -1) {
+                if (errno == EINTR) {
+                    return; // restart the call a bit later
+                } else {
+                    Y_FAIL("epoll_wait() failed with %s", strerror(errno));
+                }
+            }
+
+            for (int i = 0; i < numReady; ++i) {
+                const epoll_event& ev = events[i];
+                if (auto *record = static_cast<TSocketRecord*>(ev.data.ptr)) {
+                    // hang-ups and errors wake both directions
+                    const bool read = ev.events & (EPOLLIN | EPOLLHUP | EPOLLRDHUP | EPOLLERR);
+                    const bool write = ev.events & (EPOLLOUT | EPOLLERR);
+
+                    // remove hit flags from the bit set
+                    ui32 flags = record->Flags;
+                    const ui32 remove = (read ? EPOLLIN : 0) | (write ? EPOLLOUT : 0);
+                    while (!record->Flags.compare_exchange_weak(flags, flags & ~remove))
+                    {}
+                    flags &= ~remove;
+
+                    // rearm poller if some flags remain
+                    if (flags) {
+                        epoll_event event;
+                        event.events = EPOLLONESHOT | EPOLLRDHUP | flags;
+                        event.data.ptr = record;
+                        if (epoll_ctl(EpollDescriptor, EPOLL_CTL_MOD, record->Socket->GetDescriptor(), &event) == -1) {
+                            Y_FAIL("epoll_ctl(EPOLL_CTL_MOD) failed with %s", strerror(errno));
+                        }
+                    }
+
+                    // issue notifications
+                    Notify(record, read, write);
+                }
+            }
+        }
+
+        // Removes the socket from epoll (poller thread only).
+        void UnregisterSocketInLoop(const TIntrusivePtr<TSharedDescriptor>& socket) {
+            if (epoll_ctl(EpollDescriptor, EPOLL_CTL_DEL, socket->GetDescriptor(), nullptr) == -1) {
+                Y_FAIL("epoll_ctl(EPOLL_CTL_DEL) failed with %s", strerror(errno));
+            }
+        }
+
+        // Initially registers the socket with no interest bits; Request() arms them.
+        void RegisterSocket(const TIntrusivePtr<TSocketRecord>& record) {
+            epoll_event event;
+            event.events = EPOLLONESHOT | EPOLLRDHUP;
+            event.data.ptr = record.Get();
+            if (epoll_ctl(EpollDescriptor, EPOLL_CTL_ADD, record->Socket->GetDescriptor(), &event) == -1) {
+                Y_FAIL("epoll_ctl(EPOLL_CTL_ADD) failed with %s", strerror(errno));
+            }
+        }
+
+        // Adds the requested direction bits to the record and re-arms the
+        // one-shot registration with the combined interest set.
+        void Request(const TIntrusivePtr<TSocketRecord>& record, bool read, bool write) {
+            const ui32 add = (read ? EPOLLIN : 0) | (write ? EPOLLOUT : 0);
+            ui32 flags = record->Flags;
+            while (!record->Flags.compare_exchange_weak(flags, flags | add))
+            {}
+            flags |= add;
+            if (flags) {
+                epoll_event event;
+                event.events = EPOLLONESHOT | EPOLLRDHUP | flags;
+                event.data.ptr = record.Get();
+                if (epoll_ctl(EpollDescriptor, EPOLL_CTL_MOD, record->Socket->GetDescriptor(), &event) == -1) {
+                    Y_FAIL("epoll_ctl(EPOLL_CTL_MOD) failed with %s", strerror(errno));
+                }
+            }
+        }
+    };
+
+    using TPollerThread = TEpollThread;
+
+} // namespace NActors
diff --git a/library/cpp/actors/interconnect/poller_actor_win.h b/library/cpp/actors/interconnect/poller_actor_win.h
new file mode 100644
index 0000000000..4b4caa0ebd
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_actor_win.h
@@ -0,0 +1,103 @@
+#pragma once
+
+namespace NActors {
+
+    // Windows fallback poller built on select(). The descriptor map entry with
+    // a null record is the control pipe's read end; registration changes are
+    // applied under Mutex and the thread is woken via TPollerWakeup so a new
+    // fd set is built on the next iteration.
+    class TSelectThread : public TPollerThreadBase<TSelectThread> {
+        TMutex Mutex;
+        std::unordered_map<SOCKET, TIntrusivePtr<TSocketRecord>> Descriptors;
+
+        enum {
+            READ = 1,
+            WRITE = 2,
+        };
+
+    public:
+        TSelectThread(TActorSystem *actorSystem)
+            : TPollerThreadBase(actorSystem)
+        {
+            Descriptors.emplace(ReadEnd, nullptr); // control pipe, always read-polled
+            ISimpleThread::Start();
+        }
+
+        ~TSelectThread() {
+            Stop();
+        }
+
+        // Builds fd sets from the registered interest flags, selects, then
+        // clears fired flags and notifies interested actors.
+        void ProcessEventsInLoop() {
+            fd_set readfds, writefds, exceptfds;
+
+            FD_ZERO(&readfds);
+            FD_ZERO(&writefds);
+            FD_ZERO(&exceptfds);
+            int nfds = 0;
+            with_lock (Mutex) {
+                for (const auto& [key, record] : Descriptors) {
+                    const int fd = key;
+                    auto add = [&](auto& set) {
+                        FD_SET(fd, &set);
+                        nfds = Max<int>(nfds, fd + 1);
+                    };
+                    if (!record || (record->Flags & READ)) {
+                        add(readfds);
+                    }
+                    if (!record || (record->Flags & WRITE)) {
+                        add(writefds);
+                    }
+                    add(exceptfds);
+                }
+            }
+
+            int res = select(nfds, &readfds, &writefds, &exceptfds, nullptr);
+            if (res == -1) {
+                const int err = LastSocketError();
+                if (err == EINTR) {
+                    return; // try a bit later
+                } else {
+                    Y_FAIL("select() failed with %s", strerror(err));
+                }
+            }
+
+            with_lock (Mutex) {
+                for (const auto& [fd, record] : Descriptors) {
+                    if (record) {
+                        // exceptional condition wakes both directions
+                        const bool error = FD_ISSET(fd, &exceptfds);
+                        const bool read = error || FD_ISSET(fd, &readfds);
+                        const bool write = error || FD_ISSET(fd, &writefds);
+                        if (read) {
+                            record->Flags &= ~READ;
+                        }
+                        if (write) {
+                            record->Flags &= ~WRITE;
+                        }
+                        Notify(record.Get(), read, write);
+                    }
+                }
+            }
+        }
+
+        // Removes the socket from the map (poller thread only).
+        void UnregisterSocketInLoop(const TIntrusivePtr<TSharedDescriptor>& socket) {
+            with_lock (Mutex) {
+                Descriptors.erase(socket->GetDescriptor());
+            }
+        }
+
+        // Adds the socket and wakes the thread to rebuild its fd sets.
+        void RegisterSocket(const TIntrusivePtr<TSocketRecord>& record) {
+            with_lock (Mutex) {
+                Descriptors.emplace(record->Socket->GetDescriptor(), record);
+            }
+            ExecuteSyncOperation(TPollerWakeup());
+        }
+
+        // Arms read/write interest for an already-registered socket and wakes
+        // the thread so the new interest takes effect.
+        void Request(const TIntrusivePtr<TSocketRecord>& record, bool read, bool write) {
+            with_lock (Mutex) {
+                const auto it = Descriptors.find(record->Socket->GetDescriptor());
+                Y_VERIFY(it != Descriptors.end());
+                it->second->Flags |= (read ? READ : 0) | (write ? WRITE : 0);
+            }
+            ExecuteSyncOperation(TPollerWakeup());
+        }
+    };
+
+    using TPollerThread = TSelectThread;
+
+} // NActors
diff --git a/library/cpp/actors/interconnect/poller_tcp.cpp b/library/cpp/actors/interconnect/poller_tcp.cpp
new file mode 100644
index 0000000000..8267df31ea
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_tcp.cpp
@@ -0,0 +1,35 @@
+#include "poller_tcp.h"
+
+namespace NInterconnect {
+    // Pool of poller units; descriptors are sharded across units by hashing
+    // the socket fd so each descriptor always lands on the same unit.
+    TPollerThreads::TPollerThreads(size_t units, bool useSelect)
+        : Units(units)
+    {
+        Y_VERIFY_DEBUG(!Units.empty());
+        for (auto& unit : Units)
+            unit = TPollerUnit::Make(useSelect);
+    }
+
+    TPollerThreads::~TPollerThreads() {
+    }
+
+    void TPollerThreads::Start() {
+        for (const auto& unit : Units)
+            unit->Start();
+    }
+
+    void TPollerThreads::Stop() {
+        for (const auto& unit : Units)
+            unit->Stop();
+    }
+
+    // Routes the read watch to the unit owning this descriptor's shard.
+    void TPollerThreads::StartRead(const TIntrusivePtr<TSharedDescriptor>& s, TFDDelegate&& operation) {
+        auto& unit = Units[THash<SOCKET>()(s->GetDescriptor()) % Units.size()];
+        unit->StartReadOperation(s, std::move(operation));
+    }
+
+    // Routes the write watch to the unit owning this descriptor's shard.
+    void TPollerThreads::StartWrite(const TIntrusivePtr<TSharedDescriptor>& s, TFDDelegate&& operation) {
+        auto& unit = Units[THash<SOCKET>()(s->GetDescriptor()) % Units.size()];
+        unit->StartWriteOperation(s, std::move(operation));
+    }
+
+}
diff --git a/library/cpp/actors/interconnect/poller_tcp.h b/library/cpp/actors/interconnect/poller_tcp.h
new file mode 100644
index 0000000000..310265eccd
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_tcp.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "poller_tcp_unit.h"
+#include "poller.h"
+
+#include <util/generic/vector.h>
+#include <util/generic/hash.h>
+
+namespace NInterconnect {
+    // IPoller implementation that shards descriptors over a set of
+    // TPollerUnit workers (see poller_tcp.cpp for the hashing scheme).
+    class TPollerThreads: public NActors::IPoller {
+    public:
+        TPollerThreads(size_t units = 1U, bool useSelect = false);
+        ~TPollerThreads();
+
+        void Start();
+        void Stop();
+
+        void StartRead(const TIntrusivePtr<TSharedDescriptor>& s, TFDDelegate&& operation) override;
+        void StartWrite(const TIntrusivePtr<TSharedDescriptor>& s, TFDDelegate&& operation) override;
+
+    private:
+        TVector<TPollerUnit::TPtr> Units; // one shard per unit
+    };
+
+}
diff --git a/library/cpp/actors/interconnect/poller_tcp_unit.cpp b/library/cpp/actors/interconnect/poller_tcp_unit.cpp
new file mode 100644
index 0000000000..59e7dda810
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_tcp_unit.cpp
@@ -0,0 +1,126 @@
+#include "poller_tcp_unit.h"
+
+#if !defined(_win_) && !defined(_darwin_)
+#include "poller_tcp_unit_epoll.h"
+#endif
+
+#include "poller_tcp_unit_select.h"
+#include "poller.h"
+
+#include <library/cpp/actors/prof/tag.h>
+#include <library/cpp/actors/util/intrinsics.h>
+
+#if defined _linux_
+#include <pthread.h>
+#endif
+
+namespace NInterconnect {
+ TPollerUnit::TPtr
+ TPollerUnit::Make(bool useSelect) {
+#if defined(_win_) || defined(_darwin_)
+ Y_UNUSED(useSelect);
+ return TPtr(new TPollerUnitSelect);
+#else
+ return useSelect ? TPtr(new TPollerUnitSelect) : TPtr(new TPollerUnitEpoll);
+#endif
+ }
+
+ TPollerUnit::TPollerUnit()
+ : StopFlag(true)
+ , ReadLoop(TThread::TParams(IdleThread<false>, this).SetName("network read"))
+ , WriteLoop(TThread::TParams(IdleThread<true>, this).SetName("network write"))
+ {
+ }
+
+ TPollerUnit::~TPollerUnit() {
+ if (!AtomicLoad(&StopFlag))
+ Stop();
+ }
+
+ void
+ TPollerUnit::Start() {
+ AtomicStore(&StopFlag, false);
+ ReadLoop.Start();
+ WriteLoop.Start();
+ }
+
+ void
+ TPollerUnit::Stop() {
+ AtomicStore(&StopFlag, true);
+ ReadLoop.Join();
+ WriteLoop.Join();
+ }
+
+ template <>
+ TPollerUnit::TSide&
+ TPollerUnit::GetSide<false>() {
+ return Read;
+ }
+
+ template <>
+ TPollerUnit::TSide&
+ TPollerUnit::GetSide<true>() {
+ return Write;
+ }
+
+ void
+ TPollerUnit::StartReadOperation(
+ const TIntrusivePtr<TSharedDescriptor>& stream,
+ TFDDelegate&& operation) {
+ Y_VERIFY_DEBUG(stream);
+ if (AtomicLoad(&StopFlag))
+ return;
+ GetSide<false>().InputQueue.Push(TSide::TItem(stream, std::move(operation)));
+ }
+
+ void
+ TPollerUnit::StartWriteOperation(
+ const TIntrusivePtr<TSharedDescriptor>& stream,
+ TFDDelegate&& operation) {
+ Y_VERIFY_DEBUG(stream);
+ if (AtomicLoad(&StopFlag))
+ return;
+ GetSide<true>().InputQueue.Push(TSide::TItem(stream, std::move(operation)));
+ }
+
+ template <bool IsWrite>
+ void*
+ TPollerUnit::IdleThread(void* param) {
+ // TODO: musl-libc version of `sched_param` struct is for some reason different from pthread
+ // version in Ubuntu 12.04
+#if defined(_linux_) && !defined(_musl_)
+ pthread_t threadSelf = pthread_self();
+ sched_param sparam = {20};
+ pthread_setschedparam(threadSelf, SCHED_FIFO, &sparam);
+#endif
+
+ static_cast<TPollerUnit*>(param)->RunLoop<IsWrite>();
+ return nullptr;
+ }
+
+ template <>
+ void
+ TPollerUnit::RunLoop<false>() {
+ NProfiling::TMemoryTagScope tag("INTERCONNECT_RECEIVED_DATA");
+ while (!AtomicLoad(&StopFlag))
+ ProcessRead();
+ }
+
+ template <>
+ void
+ TPollerUnit::RunLoop<true>() {
+ NProfiling::TMemoryTagScope tag("INTERCONNECT_SEND_DATA");
+ while (!AtomicLoad(&StopFlag))
+ ProcessWrite();
+ }
+
+ void
+ TPollerUnit::TSide::ProcessInput() {
+ if (!InputQueue.IsEmpty())
+ do {
+ auto sock = InputQueue.Top().first->GetDescriptor();
+ if (!Operations.emplace(sock, std::move(InputQueue.Top())).second)
+ Y_FAIL("Descriptor is already in pooler.");
+ } while (InputQueue.Pop());
+ }
+}
diff --git a/library/cpp/actors/interconnect/poller_tcp_unit.h b/library/cpp/actors/interconnect/poller_tcp_unit.h
new file mode 100644
index 0000000000..692168b968
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_tcp_unit.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <util/system/thread.h>
+#include <library/cpp/actors/util/funnel_queue.h>
+
+#include "interconnect_stream.h"
+
+#include <memory>
+#include <functional>
+#include <unordered_map>
+
namespace NInterconnect {
    using NActors::TFDDelegate;
    using NActors::TSharedDescriptor;

    // One poller unit: a read thread and a write thread, each polling its own
    // set of descriptors. Concrete polling mechanism (epoll/select) is chosen
    // by Make() and implemented in subclasses via ProcessRead/ProcessWrite.
    class TPollerUnit {
    public:
        typedef std::unique_ptr<TPollerUnit> TPtr;

        // Platform-dependent factory (see poller_tcp_unit.cpp).
        static TPtr Make(bool useSelect);

        // Start/stop the two polling threads; Stop() joins them.
        void Start();
        void Stop();

        // Register a one-shot readiness callback; dropped if already stopped.
        virtual void StartReadOperation(
            const TIntrusivePtr<TSharedDescriptor>& stream,
            TFDDelegate&& operation);

        virtual void StartWriteOperation(
            const TIntrusivePtr<TSharedDescriptor>& stream,
            TFDDelegate&& operation);

        virtual ~TPollerUnit();

    private:
        // One polling pass; implemented by the epoll/select subclasses.
        virtual void ProcessRead() = 0;
        virtual void ProcessWrite() = 0;

        template <bool IsWrite>
        static void* IdleThread(void* param);

        template <bool IsWrite>
        void RunLoop();

        // Loop-termination flag, accessed via AtomicLoad/AtomicStore.
        // NOTE(review): `volatile` alone is not a synchronization primitive;
        // relies on the Atomic* helpers for cross-thread visibility — confirm.
        volatile bool StopFlag;
        TThread ReadLoop, WriteLoop;

    protected:
        TPollerUnit();

        // Per-direction state: the map of registered operations (owned by the
        // polling thread) plus a producer queue feeding it from other threads.
        struct TSide {
            using TOperations =
                std::unordered_map<SOCKET,
                                   std::pair<TIntrusivePtr<TSharedDescriptor>, TFDDelegate>>;

            TOperations Operations;
            using TItem = TOperations::mapped_type;
            TFunnelQueue<TItem> InputQueue;

            // Moves queued registrations into Operations (polling thread only).
            void ProcessInput();
        } Read, Write;

        // GetSide<false>() == Read, GetSide<true>() == Write.
        template <bool IsWrite>
        TSide& GetSide();
    };

}
diff --git a/library/cpp/actors/interconnect/poller_tcp_unit_epoll.cpp b/library/cpp/actors/interconnect/poller_tcp_unit_epoll.cpp
new file mode 100644
index 0000000000..c78538b95b
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_tcp_unit_epoll.cpp
@@ -0,0 +1,125 @@
+#include "poller_tcp_unit_epoll.h"
+#if !defined(_win_) && !defined(_darwin_)
+#include <unistd.h>
+#include <sys/epoll.h>
+
+#include <csignal>
+#include <cerrno>
+#include <cstring>
+
namespace NInterconnect {
    namespace {
        // Removes the descriptor from the epoll set; aborts on failure.
        // A non-null event pointer is passed for compatibility with kernels
        // before 2.6.9, where EPOLL_CTL_DEL required one.
        void
        DeleteEpoll(int epoll, SOCKET stream) {
            ::epoll_event event = {0, {.fd = stream}};
            if (::epoll_ctl(epoll, EPOLL_CTL_DEL, stream, &event)) {
                Cerr << "epoll_ctl errno: " << errno << Endl;
                Y_FAIL("epoll delete error!");
            }
        }

        // Adds the descriptor to the epoll set with the given event mask;
        // aborts on failure (e.g. if the fd is already registered).
        template <ui32 Events>
        void
        AddEpoll(int epoll, SOCKET stream) {
            ::epoll_event event = {.events = Events};
            event.data.fd = stream;
            if (::epoll_ctl(epoll, EPOLL_CTL_ADD, stream, &event)) {
                Cerr << "epoll_ctl errno: " << errno << Endl;
                Y_FAIL("epoll add error!");
            }
        }

        // Creates an epoll instance (size hint is ignored by modern kernels).
        // NOTE(review): failure (-1) is only caught in debug builds.
        int
        Initialize() {
            const auto epoll = ::epoll_create(10000);
            Y_VERIFY_DEBUG(epoll > 0);
            return epoll;
        }

    }

    // Two separate epoll instances: one polled by the read thread, one by the
    // write thread. The signal mask blocks SIGPIPE/SIGTERM inside epoll_pwait.
    TPollerUnitEpoll::TPollerUnitEpoll()
        : ReadDescriptor(Initialize())
        , WriteDescriptor(Initialize())
    {
        // Block on the epoll descriptor.
        ::sigemptyset(&sigmask);
        ::sigaddset(&sigmask, SIGPIPE);
        ::sigaddset(&sigmask, SIGTERM);
    }

    TPollerUnitEpoll::~TPollerUnitEpoll() {
        ::close(ReadDescriptor);
        ::close(WriteDescriptor);
    }

    template <>
    int TPollerUnitEpoll::GetDescriptor<false>() const {
        return ReadDescriptor;
    }

    template <>
    int TPollerUnitEpoll::GetDescriptor<true>() const {
        return WriteDescriptor;
    }

    // Registers the callback in the base class, then subscribes the socket
    // for input (and peer-shutdown) events on the read epoll instance.
    void
    TPollerUnitEpoll::StartReadOperation(
        const TIntrusivePtr<TSharedDescriptor>& s,
        TFDDelegate&& operation) {
        TPollerUnit::StartReadOperation(s, std::move(operation));
        AddEpoll<EPOLLRDHUP | EPOLLIN>(ReadDescriptor, s->GetDescriptor());
    }

    // Same as StartReadOperation, but for output readiness.
    void
    TPollerUnitEpoll::StartWriteOperation(
        const TIntrusivePtr<TSharedDescriptor>& s,
        TFDDelegate&& operation) {
        TPollerUnit::StartWriteOperation(s, std::move(operation));
        AddEpoll<EPOLLRDHUP | EPOLLOUT>(WriteDescriptor, s->GetDescriptor());
    }

    constexpr int EVENTS_BUF_SIZE = 128;

    // One polling pass: wait (bounded, so StopFlag is observed), pull in new
    // registrations, then fire callbacks for ready descriptors. A callback
    // returning a finalizer means the operation is complete: the fd is
    // deregistered, the entry erased, and the finalizer invoked.
    template <bool WriteOp>
    void
    TPollerUnitEpoll::Process() {
        ::epoll_event events[EVENTS_BUF_SIZE];

        const int epoll = GetDescriptor<WriteOp>();

        /* Timeout just to check StopFlag sometimes */
        const int result =
            ::epoll_pwait(epoll, events, EVENTS_BUF_SIZE, 200, &sigmask);

        if (result == -1 && errno != EINTR)
            Y_FAIL("epoll wait error!");

        auto& side = GetSide<WriteOp>();
        side.ProcessInput();

        for (int i = 0; i < result; ++i) {
            // Event may refer to a descriptor already removed — skip it.
            const auto it = side.Operations.find(events[i].data.fd);
            if (side.Operations.end() == it)
                continue;
            if (const auto& finalizer = it->second.second(it->second.first)) {
                DeleteEpoll(epoll, it->first);
                side.Operations.erase(it);
                finalizer();
            }
        }
    }

    void
    TPollerUnitEpoll::ProcessRead() {
        Process<false>();
    }

    void
    TPollerUnitEpoll::ProcessWrite() {
        Process<true>();
    }

}
+
+#endif
diff --git a/library/cpp/actors/interconnect/poller_tcp_unit_epoll.h b/library/cpp/actors/interconnect/poller_tcp_unit_epoll.h
new file mode 100644
index 0000000000..ff7893eba2
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_tcp_unit_epoll.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include "poller_tcp_unit.h"
+
namespace NInterconnect {
    // epoll-based poller unit (Linux and friends): keeps one epoll instance
    // per direction and registers/deregisters descriptors as operations are
    // added and completed.
    class TPollerUnitEpoll: public TPollerUnit {
    public:
        TPollerUnitEpoll();
        virtual ~TPollerUnitEpoll();

    private:
        // Registers the callback, then adds the fd to the read epoll set.
        virtual void StartReadOperation(
            const TIntrusivePtr<TSharedDescriptor>& s,
            TFDDelegate&& operation) override;

        virtual void StartWriteOperation(
            const TIntrusivePtr<TSharedDescriptor>& s,
            TFDDelegate&& operation) override;

        virtual void ProcessRead() override;
        virtual void ProcessWrite() override;

        // Shared polling pass; Write selects the direction.
        template <bool Write>
        void Process();

        // GetDescriptor<false>() == ReadDescriptor, <true> == WriteDescriptor.
        template <bool Write>
        int GetDescriptor() const;

        const int ReadDescriptor, WriteDescriptor;
        // Signals blocked while waiting in epoll_pwait (SIGPIPE, SIGTERM).
        ::sigset_t sigmask;
    };

}
diff --git a/library/cpp/actors/interconnect/poller_tcp_unit_select.cpp b/library/cpp/actors/interconnect/poller_tcp_unit_select.cpp
new file mode 100644
index 0000000000..ae7aaad566
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_tcp_unit_select.cpp
@@ -0,0 +1,86 @@
+#include "poller_tcp_unit_select.h"
+
+#include <csignal>
+
+#if defined(_win_)
+#include <winsock2.h>
+#define SOCKET_ERROR_SOURCE ::WSAGetLastError()
+#elif defined(_darwin_)
+#include <cerrno>
+#define SOCKET_ERROR_SOURCE errno
+typedef timeval TIMEVAL;
+#else
+#include <cerrno>
+#define SOCKET_ERROR_SOURCE errno
+#endif
+
namespace NInterconnect {
    TPollerUnitSelect::TPollerUnitSelect() {
    }

    TPollerUnitSelect::~TPollerUnitSelect() {
    }

    // One polling pass over all registered descriptors using select()
    // (pselect on POSIX, with SIGPIPE/SIGTERM blocked during the wait).
    // The ~100 ms timeout bounds the wait so StopFlag is observed regularly.
    template <bool IsWrite>
    void
    TPollerUnitSelect::Process() {
        auto& side = GetSide<IsWrite>();
        side.ProcessInput();

        // Indices into the fd_set triple: read, write, exception.
        enum : size_t { R,
                        W,
                        E };
        // Direction this instantiation polls for (fixed per template arg).
        static const auto O = IsWrite ? W : R;

        ::fd_set sets[3];

        FD_ZERO(&sets[R]);
        FD_ZERO(&sets[W]);
        FD_ZERO(&sets[E]);

        for (const auto& operation : side.Operations) {
            FD_SET(operation.first, &sets[O]);
            FD_SET(operation.first, &sets[E]);
        }

#if defined(_win_)
        // Windows select() fails on empty sets, so just sleep instead.
        ::TIMEVAL timeout = {0L, 99991L};
        const auto numberEvents = !side.Operations.empty() ? ::select(FD_SETSIZE, &sets[R], &sets[W], &sets[E], &timeout)
                                                           : (::Sleep(100), 0);
#elif defined(_darwin_)
        ::TIMEVAL timeout = {0L, 99991L};
        const auto numberEvents = ::select(FD_SETSIZE, &sets[R], &sets[W], &sets[E], &timeout);
#else
        ::sigset_t sigmask;
        ::sigemptyset(&sigmask);
        ::sigaddset(&sigmask, SIGPIPE);
        ::sigaddset(&sigmask, SIGTERM);

        struct ::timespec timeout = {0L, 99999989L};
        const auto numberEvents = ::pselect(FD_SETSIZE, &sets[R], &sets[W], &sets[E], &timeout, &sigmask);
#endif

        Y_VERIFY_DEBUG(numberEvents >= 0);

        // Fire callbacks for ready descriptors; a callback returning a
        // finalizer means the operation is done — erase it and run the
        // finalizer. Post-increment erase keeps the iterator valid.
        for (auto it = side.Operations.cbegin(); side.Operations.cend() != it;) {
            if (FD_ISSET(it->first, &sets[O]) || FD_ISSET(it->first, &sets[E]))
                if (const auto& finalizer = it->second.second(it->second.first)) {
                    side.Operations.erase(it++);
                    finalizer();
                    continue;
                }
            ++it;
        }
    }

    void
    TPollerUnitSelect::ProcessRead() {
        Process<false>();
    }

    void
    TPollerUnitSelect::ProcessWrite() {
        Process<true>();
    }

}
diff --git a/library/cpp/actors/interconnect/poller_tcp_unit_select.h b/library/cpp/actors/interconnect/poller_tcp_unit_select.h
new file mode 100644
index 0000000000..0c15217796
--- /dev/null
+++ b/library/cpp/actors/interconnect/poller_tcp_unit_select.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "poller_tcp_unit.h"
+
namespace NInterconnect {
    // select()-based poller unit: the portable fallback used on Windows and
    // Darwin, or anywhere when explicitly requested via TPollerUnit::Make.
    class TPollerUnitSelect: public TPollerUnit {
    public:
        TPollerUnitSelect();
        virtual ~TPollerUnitSelect();

    private:
        virtual void ProcessRead() override;
        virtual void ProcessWrite() override;

        // Shared polling pass; IsWrite selects the direction.
        template <bool IsWrite>
        void Process();
    };

}
diff --git a/library/cpp/actors/interconnect/profiler.h b/library/cpp/actors/interconnect/profiler.h
new file mode 100644
index 0000000000..77a59e3179
--- /dev/null
+++ b/library/cpp/actors/interconnect/profiler.h
@@ -0,0 +1,142 @@
+#pragma once
+
+#include <library/cpp/actors/util/datetime.h>
+
namespace NActors {

    // Lightweight call-tree profiler: TFunction instances log entry/exit
    // markers with cycle timestamps into a flat event list, which Format()
    // later folds back into a nested per-function duration report.
    class TProfiled {
        enum class EType : ui32 {
            ENTRY,
            EXIT,
        };

        struct TItem {
            EType Type; // entry kind
            int Line;
            const char *Marker; // name of the profiled function/part
            ui64 Timestamp; // cycles
        };

        bool Enable = false;
        // mutable: TFunction logs through a const reference.
        mutable TDeque<TItem> Items;

        friend class TFunction;

        public:
        // RAII scope marker: logs ENTRY on construction, EXIT on destruction.
        class TFunction {
            const TProfiled& Profiled;

        public:
            TFunction(const TProfiled& profiled, const char *name, int line)
                : Profiled(profiled)
            {
                Log(EType::ENTRY, name, line);
            }

            ~TFunction() {
                Log(EType::EXIT, nullptr, 0);
            }

        private:
            // No-op unless profiling has been started via TProfiled::Start().
            void Log(EType type, const char *marker, int line) {
                if (Profiled.Enable) {
                    Profiled.Items.push_back(TItem{
                        type,
                        line,
                        marker,
                        GetCycleCountFast()
                    });
                }
            }
        };

        public:
        void Start() {
            Enable = true;
        }

        // Discards collected events and disables further logging.
        void Finish() {
            Items.clear();
            Enable = false;
        }

        // Wall time between the first and last recorded event (zero if empty).
        TDuration Duration() const {
            return CyclesToDuration(Items ? Items.back().Timestamp - Items.front().Timestamp : 0);
        }

        // Renders the whole event list; requires balanced ENTRY/EXIT pairs.
        TString Format() const {
            TDeque<TItem>::iterator it = Items.begin();
            TString res = FormatLevel(it);
            Y_VERIFY(it == Items.end());
            return res;
        }

        private:
        // Consumes events at one nesting level (recursing for sub-levels) and
        // formats them. With more than 10 records, aggregates by marker instead
        // of listing each call. Durations are cycles scaled to nanoseconds
        // (cycles * 1e6 / cyclesPerMs).
        TString FormatLevel(TDeque<TItem>::iterator& it) const {
            struct TRecord {
                TString Marker;
                ui64 Duration;
                TString Interior;

                bool operator <(const TRecord& other) const {
                    return Duration < other.Duration;
                }
            };
            TVector<TRecord> records;

            while (it != Items.end() && it->Type != EType::EXIT) {
                Y_VERIFY(it->Type == EType::ENTRY);
                const TString marker = Sprintf("%s:%d", it->Marker, it->Line);
                const ui64 begin = it->Timestamp;
                ++it;
                const TString interior = FormatLevel(it);
                Y_VERIFY(it != Items.end());
                Y_VERIFY(it->Type == EType::EXIT);
                const ui64 end = it->Timestamp;
                records.push_back(TRecord{marker, end - begin, interior});
                ++it;
            }

            TStringStream s;
            const ui64 cyclesPerMs = GetCyclesPerMillisecond();

            if (records.size() <= 10) {
                bool first = true;
                for (const TRecord& record : records) {
                    if (first) {
                        first = false;
                    } else {
                        s << " ";
                    }
                    s << record.Marker << "(" << (record.Duration * 1000000 / cyclesPerMs) << "ns)";
                    if (record.Interior) {
                        s << " {" << record.Interior << "}";
                    }
                }
            } else {
                // Aggregate per marker: count, total, and the slowest call
                // (whose interior breakdown is shown).
                TMap<TString, TVector<TRecord>> m;
                for (TRecord& r : records) {
                    const TString key = r.Marker;
                    m[key].push_back(std::move(r));
                }

                s << "unordered ";
                for (auto& [key, value] : m) {
                    auto i = std::max_element(value.begin(), value.end());
                    ui64 sum = 0;
                    for (const auto& item : value) {
                        sum += item.Duration;
                    }
                    sum = sum * 1000000 / cyclesPerMs;
                    s << key << " num# " << value.size() << " sum# " << sum << "ns max# " << (i->Duration * 1000000 / cyclesPerMs) << "ns";
                    if (i->Interior) {
                        s << " {" << i->Interior << "}";
                    }
                }
            }

            return s.Str();
        }
    };

} // NActors
diff --git a/library/cpp/actors/interconnect/slowpoke_actor.h b/library/cpp/actors/interconnect/slowpoke_actor.h
new file mode 100644
index 0000000000..4b02e5da48
--- /dev/null
+++ b/library/cpp/actors/interconnect/slowpoke_actor.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <library/cpp/actors/core/actor_bootstrapped.h>
+
namespace NActors {

    // Debug/load-test actor: for `Duration` total, repeatedly blocks its
    // worker thread (Sleep) for a random interval and reschedules itself,
    // then poisons itself. Presumably used to simulate slow actors hogging
    // executor threads — TODO confirm against callers.
    class TSlowpokeActor : public TActorBootstrapped<TSlowpokeActor> {
        const TDuration Duration;
        const TDuration SleepMin;
        const TDuration SleepMax;
        const TDuration RescheduleMin;
        const TDuration RescheduleMax;

    public:
        static constexpr NKikimrServices::TActivity::EType ActorActivityType() {
            return NKikimrServices::TActivity::INTERCONNECT_COMMON;
        }

        TSlowpokeActor(TDuration duration, TDuration sleepMin, TDuration sleepMax, TDuration rescheduleMin, TDuration rescheduleMax)
            : Duration(duration)
            , SleepMin(sleepMin)
            , SleepMax(sleepMax)
            , RescheduleMin(rescheduleMin)
            , RescheduleMax(rescheduleMax)
        {}

        void Bootstrap(const TActorContext& ctx) {
            // Schedule self-destruction after Duration, then start sleeping.
            Become(&TThis::StateFunc, ctx, Duration, new TEvents::TEvPoisonPill);
            HandleWakeup(ctx);
        }

        void HandleWakeup(const TActorContext& ctx) {
            // Deliberately blocks the executor thread for the sleep interval.
            Sleep(RandomDuration(SleepMin, SleepMax));
            ctx.Schedule(RandomDuration(RescheduleMin, RescheduleMax), new TEvents::TEvWakeup);
        }

        // Uniform random duration in [min, max]; assumes max >= min.
        static TDuration RandomDuration(TDuration min, TDuration max) {
            return min + TDuration::FromValue(RandomNumber<ui64>(max.GetValue() - min.GetValue() + 1));
        }

        STRICT_STFUNC(StateFunc,
            CFunc(TEvents::TSystem::PoisonPill, Die)
            CFunc(TEvents::TSystem::Wakeup, HandleWakeup)
        )
    };

} // NActors
diff --git a/library/cpp/actors/interconnect/types.cpp b/library/cpp/actors/interconnect/types.cpp
new file mode 100644
index 0000000000..979c55f277
--- /dev/null
+++ b/library/cpp/actors/interconnect/types.cpp
@@ -0,0 +1,564 @@
+#include "types.h"
+#include <util/string/printf.h>
+#include <util/generic/vector.h>
+#include <errno.h>
+
namespace NActors {

    // Catalog of every disconnect-reason string this module can produce:
    // the named reasons (matching the factory methods in types.h) followed
    // by the errno names recognized by FromErrno(). Keep in sync with both.
    TVector<const char*> TDisconnectReason::Reasons = {
        "EndOfStream",
        "CloseOnIdle",
        "LostConnection",
        "DeadPeer",
        "NewSession",
        "HandshakeFailTransient",
        "HandshakeFailPermanent",
        "UserRequest",
        "Debug",
        "ChecksumError",
        "FormatError",
        "EventTooLarge",
        "QueueOverload",
        "E2BIG",
        "EACCES",
        "EADDRINUSE",
        "EADDRNOTAVAIL",
        "EADV",
        "EAFNOSUPPORT",
        "EAGAIN",
        "EALREADY",
        "EBADE",
        "EBADF",
        "EBADFD",
        "EBADMSG",
        "EBADR",
        "EBADRQC",
        "EBADSLT",
        "EBFONT",
        "EBUSY",
        "ECANCELED",
        "ECHILD",
        "ECHRNG",
        "ECOMM",
        "ECONNABORTED",
        "ECONNREFUSED",
        "ECONNRESET",
        "EDEADLK",
        "EDEADLOCK",
        "EDESTADDRREQ",
        "EDOM",
        "EDOTDOT",
        "EDQUOT",
        "EEXIST",
        "EFAULT",
        "EFBIG",
        "EHOSTDOWN",
        "EHOSTUNREACH",
        "EHWPOISON",
        "EIDRM",
        "EILSEQ",
        "EINPROGRESS",
        "EINTR",
        "EINVAL",
        "EIO",
        "EISCONN",
        "EISDIR",
        "EISNAM",
        "EKEYEXPIRED",
        "EKEYREJECTED",
        "EKEYREVOKED",
        "EL2HLT",
        "EL2NSYNC",
        "EL3HLT",
        "EL3RST",
        "ELIBACC",
        "ELIBBAD",
        "ELIBEXEC",
        "ELIBMAX",
        "ELIBSCN",
        "ELNRNG",
        "ELOOP",
        "EMEDIUMTYPE",
        "EMFILE",
        "EMLINK",
        "EMSGSIZE",
        "EMULTIHOP",
        "ENAMETOOLONG",
        "ENAVAIL",
        "ENETDOWN",
        "ENETRESET",
        "ENETUNREACH",
        "ENFILE",
        "ENOANO",
        "ENOBUFS",
        "ENOCSI",
        "ENODATA",
        "ENODEV",
        "ENOENT",
        "ENOEXEC",
        "ENOKEY",
        "ENOLCK",
        "ENOLINK",
        "ENOMEDIUM",
        "ENOMEM",
        "ENOMSG",
        "ENONET",
        "ENOPKG",
        "ENOPROTOOPT",
        "ENOSPC",
        "ENOSR",
        "ENOSTR",
        "ENOSYS",
        "ENOTBLK",
        "ENOTCONN",
        "ENOTDIR",
        "ENOTEMPTY",
        "ENOTNAM",
        "ENOTRECOVERABLE",
        "ENOTSOCK",
        "ENOTTY",
        "ENOTUNIQ",
        "ENXIO",
        "EOPNOTSUPP",
        "EOVERFLOW",
        "EOWNERDEAD",
        "EPERM",
        "EPFNOSUPPORT",
        "EPIPE",
        "EPROTO",
        "EPROTONOSUPPORT",
        "EPROTOTYPE",
        "ERANGE",
        "EREMCHG",
        "EREMOTE",
        "EREMOTEIO",
        "ERESTART",
        "ERFKILL",
        "EROFS",
        "ESHUTDOWN",
        "ESOCKTNOSUPPORT",
        "ESPIPE",
        "ESRCH",
        "ESRMNT",
        "ESTALE",
        "ESTRPIPE",
        "ETIME",
        "ETIMEDOUT",
        "ETOOMANYREFS",
        "ETXTBSY",
        "EUCLEAN",
        "EUNATCH",
        "EUSERS",
        "EWOULDBLOCK",
        "EXDEV",
        "EXFULL",
    };
+
+ TDisconnectReason TDisconnectReason::FromErrno(int err) {
+ switch (err) {
+#define REASON(ERRNO) case ERRNO: return TDisconnectReason(TString(#ERRNO))
+#if defined(E2BIG)
+ REASON(E2BIG);
+#endif
+#if defined(EACCES)
+ REASON(EACCES);
+#endif
+#if defined(EADDRINUSE)
+ REASON(EADDRINUSE);
+#endif
+#if defined(EADDRNOTAVAIL)
+ REASON(EADDRNOTAVAIL);
+#endif
+#if defined(EADV)
+ REASON(EADV);
+#endif
+#if defined(EAFNOSUPPORT)
+ REASON(EAFNOSUPPORT);
+#endif
+#if defined(EAGAIN)
+ REASON(EAGAIN);
+#endif
+#if defined(EALREADY)
+ REASON(EALREADY);
+#endif
+#if defined(EBADE)
+ REASON(EBADE);
+#endif
+#if defined(EBADF)
+ REASON(EBADF);
+#endif
+#if defined(EBADFD)
+ REASON(EBADFD);
+#endif
+#if defined(EBADMSG)
+ REASON(EBADMSG);
+#endif
+#if defined(EBADR)
+ REASON(EBADR);
+#endif
+#if defined(EBADRQC)
+ REASON(EBADRQC);
+#endif
+#if defined(EBADSLT)
+ REASON(EBADSLT);
+#endif
+#if defined(EBFONT)
+ REASON(EBFONT);
+#endif
+#if defined(EBUSY)
+ REASON(EBUSY);
+#endif
+#if defined(ECANCELED)
+ REASON(ECANCELED);
+#endif
+#if defined(ECHILD)
+ REASON(ECHILD);
+#endif
+#if defined(ECHRNG)
+ REASON(ECHRNG);
+#endif
+#if defined(ECOMM)
+ REASON(ECOMM);
+#endif
+#if defined(ECONNABORTED)
+ REASON(ECONNABORTED);
+#endif
+#if defined(ECONNREFUSED)
+ REASON(ECONNREFUSED);
+#endif
+#if defined(ECONNRESET)
+ REASON(ECONNRESET);
+#endif
+#if defined(EDEADLK)
+ REASON(EDEADLK);
+#endif
+#if defined(EDEADLOCK) && (!defined(EDEADLK) || EDEADLOCK != EDEADLK)
+ REASON(EDEADLOCK);
+#endif
+#if defined(EDESTADDRREQ)
+ REASON(EDESTADDRREQ);
+#endif
+#if defined(EDOM)
+ REASON(EDOM);
+#endif
+#if defined(EDOTDOT)
+ REASON(EDOTDOT);
+#endif
+#if defined(EDQUOT)
+ REASON(EDQUOT);
+#endif
+#if defined(EEXIST)
+ REASON(EEXIST);
+#endif
+#if defined(EFAULT)
+ REASON(EFAULT);
+#endif
+#if defined(EFBIG)
+ REASON(EFBIG);
+#endif
+#if defined(EHOSTDOWN)
+ REASON(EHOSTDOWN);
+#endif
+#if defined(EHOSTUNREACH)
+ REASON(EHOSTUNREACH);
+#endif
+#if defined(EHWPOISON)
+ REASON(EHWPOISON);
+#endif
+#if defined(EIDRM)
+ REASON(EIDRM);
+#endif
+#if defined(EILSEQ)
+ REASON(EILSEQ);
+#endif
+#if defined(EINPROGRESS)
+ REASON(EINPROGRESS);
+#endif
+#if defined(EINTR)
+ REASON(EINTR);
+#endif
+#if defined(EINVAL)
+ REASON(EINVAL);
+#endif
+#if defined(EIO)
+ REASON(EIO);
+#endif
+#if defined(EISCONN)
+ REASON(EISCONN);
+#endif
+#if defined(EISDIR)
+ REASON(EISDIR);
+#endif
+#if defined(EISNAM)
+ REASON(EISNAM);
+#endif
+#if defined(EKEYEXPIRED)
+ REASON(EKEYEXPIRED);
+#endif
+#if defined(EKEYREJECTED)
+ REASON(EKEYREJECTED);
+#endif
+#if defined(EKEYREVOKED)
+ REASON(EKEYREVOKED);
+#endif
+#if defined(EL2HLT)
+ REASON(EL2HLT);
+#endif
+#if defined(EL2NSYNC)
+ REASON(EL2NSYNC);
+#endif
+#if defined(EL3HLT)
+ REASON(EL3HLT);
+#endif
+#if defined(EL3RST)
+ REASON(EL3RST);
+#endif
+#if defined(ELIBACC)
+ REASON(ELIBACC);
+#endif
+#if defined(ELIBBAD)
+ REASON(ELIBBAD);
+#endif
+#if defined(ELIBEXEC)
+ REASON(ELIBEXEC);
+#endif
+#if defined(ELIBMAX)
+ REASON(ELIBMAX);
+#endif
+#if defined(ELIBSCN)
+ REASON(ELIBSCN);
+#endif
+#if defined(ELNRNG)
+ REASON(ELNRNG);
+#endif
+#if defined(ELOOP)
+ REASON(ELOOP);
+#endif
+#if defined(EMEDIUMTYPE)
+ REASON(EMEDIUMTYPE);
+#endif
+#if defined(EMFILE)
+ REASON(EMFILE);
+#endif
+#if defined(EMLINK)
+ REASON(EMLINK);
+#endif
+#if defined(EMSGSIZE)
+ REASON(EMSGSIZE);
+#endif
+#if defined(EMULTIHOP)
+ REASON(EMULTIHOP);
+#endif
+#if defined(ENAMETOOLONG)
+ REASON(ENAMETOOLONG);
+#endif
+#if defined(ENAVAIL)
+ REASON(ENAVAIL);
+#endif
+#if defined(ENETDOWN)
+ REASON(ENETDOWN);
+#endif
+#if defined(ENETRESET)
+ REASON(ENETRESET);
+#endif
+#if defined(ENETUNREACH)
+ REASON(ENETUNREACH);
+#endif
+#if defined(ENFILE)
+ REASON(ENFILE);
+#endif
+#if defined(ENOANO)
+ REASON(ENOANO);
+#endif
+#if defined(ENOBUFS)
+ REASON(ENOBUFS);
+#endif
+#if defined(ENOCSI)
+ REASON(ENOCSI);
+#endif
+#if defined(ENODATA)
+ REASON(ENODATA);
+#endif
+#if defined(ENODEV)
+ REASON(ENODEV);
+#endif
+#if defined(ENOENT)
+ REASON(ENOENT);
+#endif
+#if defined(ENOEXEC)
+ REASON(ENOEXEC);
+#endif
+#if defined(ENOKEY)
+ REASON(ENOKEY);
+#endif
+#if defined(ENOLCK)
+ REASON(ENOLCK);
+#endif
+#if defined(ENOLINK)
+ REASON(ENOLINK);
+#endif
+#if defined(ENOMEDIUM)
+ REASON(ENOMEDIUM);
+#endif
+#if defined(ENOMEM)
+ REASON(ENOMEM);
+#endif
+#if defined(ENOMSG)
+ REASON(ENOMSG);
+#endif
+#if defined(ENONET)
+ REASON(ENONET);
+#endif
+#if defined(ENOPKG)
+ REASON(ENOPKG);
+#endif
+#if defined(ENOPROTOOPT)
+ REASON(ENOPROTOOPT);
+#endif
+#if defined(ENOSPC)
+ REASON(ENOSPC);
+#endif
+#if defined(ENOSR)
+ REASON(ENOSR);
+#endif
+#if defined(ENOSTR)
+ REASON(ENOSTR);
+#endif
+#if defined(ENOSYS)
+ REASON(ENOSYS);
+#endif
+#if defined(ENOTBLK)
+ REASON(ENOTBLK);
+#endif
+#if defined(ENOTCONN)
+ REASON(ENOTCONN);
+#endif
+#if defined(ENOTDIR)
+ REASON(ENOTDIR);
+#endif
+#if defined(ENOTEMPTY)
+ REASON(ENOTEMPTY);
+#endif
+#if defined(ENOTNAM)
+ REASON(ENOTNAM);
+#endif
+#if defined(ENOTRECOVERABLE)
+ REASON(ENOTRECOVERABLE);
+#endif
+#if defined(ENOTSOCK)
+ REASON(ENOTSOCK);
+#endif
+#if defined(ENOTTY)
+ REASON(ENOTTY);
+#endif
+#if defined(ENOTUNIQ)
+ REASON(ENOTUNIQ);
+#endif
+#if defined(ENXIO)
+ REASON(ENXIO);
+#endif
+#if defined(EOPNOTSUPP)
+ REASON(EOPNOTSUPP);
+#endif
+#if defined(EOVERFLOW)
+ REASON(EOVERFLOW);
+#endif
+#if defined(EOWNERDEAD)
+ REASON(EOWNERDEAD);
+#endif
+#if defined(EPERM)
+ REASON(EPERM);
+#endif
+#if defined(EPFNOSUPPORT)
+ REASON(EPFNOSUPPORT);
+#endif
+#if defined(EPIPE)
+ REASON(EPIPE);
+#endif
+#if defined(EPROTO)
+ REASON(EPROTO);
+#endif
+#if defined(EPROTONOSUPPORT)
+ REASON(EPROTONOSUPPORT);
+#endif
+#if defined(EPROTOTYPE)
+ REASON(EPROTOTYPE);
+#endif
+#if defined(ERANGE)
+ REASON(ERANGE);
+#endif
+#if defined(EREMCHG)
+ REASON(EREMCHG);
+#endif
+#if defined(EREMOTE)
+ REASON(EREMOTE);
+#endif
+#if defined(EREMOTEIO)
+ REASON(EREMOTEIO);
+#endif
+#if defined(ERESTART)
+ REASON(ERESTART);
+#endif
+#if defined(ERFKILL)
+ REASON(ERFKILL);
+#endif
+#if defined(EROFS)
+ REASON(EROFS);
+#endif
+#if defined(ESHUTDOWN)
+ REASON(ESHUTDOWN);
+#endif
+#if defined(ESOCKTNOSUPPORT)
+ REASON(ESOCKTNOSUPPORT);
+#endif
+#if defined(ESPIPE)
+ REASON(ESPIPE);
+#endif
+#if defined(ESRCH)
+ REASON(ESRCH);
+#endif
+#if defined(ESRMNT)
+ REASON(ESRMNT);
+#endif
+#if defined(ESTALE)
+ REASON(ESTALE);
+#endif
+#if defined(ESTRPIPE)
+ REASON(ESTRPIPE);
+#endif
+#if defined(ETIME)
+ REASON(ETIME);
+#endif
+#if defined(ETIMEDOUT)
+ REASON(ETIMEDOUT);
+#endif
+#if defined(ETOOMANYREFS)
+ REASON(ETOOMANYREFS);
+#endif
+#if defined(ETXTBSY)
+ REASON(ETXTBSY);
+#endif
+#if defined(EUCLEAN)
+ REASON(EUCLEAN);
+#endif
+#if defined(EUNATCH)
+ REASON(EUNATCH);
+#endif
+#if defined(EUSERS)
+ REASON(EUSERS);
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || EWOULDBLOCK != EAGAIN)
+ REASON(EWOULDBLOCK);
+#endif
+#if defined(EXDEV)
+ REASON(EXDEV);
+#endif
+#if defined(EXFULL)
+ REASON(EXFULL);
+#endif
+ default:
+ return TDisconnectReason(Sprintf("errno=%d", errno));
+ }
+ }
+
+} // NActors
diff --git a/library/cpp/actors/interconnect/types.h b/library/cpp/actors/interconnect/types.h
new file mode 100644
index 0000000000..2662c50c22
--- /dev/null
+++ b/library/cpp/actors/interconnect/types.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <util/generic/string.h>
+
namespace NActors {

    // Immutable tagged string describing why an interconnect session was torn
    // down. Instances are created only through the named factory methods or
    // FromErrno(); the full set of possible tags is listed in Reasons.
    class TDisconnectReason {
        TString Text;

    private:
        explicit TDisconnectReason(TString text)
            : Text(std::move(text))
        {}

    public:
        // Default-constructed reason carries an empty text.
        TDisconnectReason() = default;
        TDisconnectReason(const TDisconnectReason&) = default;
        TDisconnectReason(TDisconnectReason&&) = default;

        // Maps an errno value to its symbolic name (see types.cpp).
        static TDisconnectReason FromErrno(int err);

        static TDisconnectReason EndOfStream() { return TDisconnectReason("EndOfStream"); }
        static TDisconnectReason CloseOnIdle() { return TDisconnectReason("CloseOnIdle"); }
        static TDisconnectReason LostConnection() { return TDisconnectReason("LostConnection"); }
        static TDisconnectReason DeadPeer() { return TDisconnectReason("DeadPeer"); }
        static TDisconnectReason NewSession() { return TDisconnectReason("NewSession"); }
        static TDisconnectReason HandshakeFailTransient() { return TDisconnectReason("HandshakeFailTransient"); }
        static TDisconnectReason HandshakeFailPermanent() { return TDisconnectReason("HandshakeFailPermanent"); }
        static TDisconnectReason UserRequest() { return TDisconnectReason("UserRequest"); }
        static TDisconnectReason Debug() { return TDisconnectReason("Debug"); }
        static TDisconnectReason ChecksumError() { return TDisconnectReason("ChecksumError"); }
        static TDisconnectReason FormatError() { return TDisconnectReason("FormatError"); }
        static TDisconnectReason EventTooLarge() { return TDisconnectReason("EventTooLarge"); }
        static TDisconnectReason QueueOverload() { return TDisconnectReason("QueueOverload"); }

        TString ToString() const {
            return Text;
        }

        // All reason strings this class can produce; defined in types.cpp.
        static TVector<const char*> Reasons;
    };

} // NActors
diff --git a/library/cpp/actors/interconnect/ut/channel_scheduler_ut.cpp b/library/cpp/actors/interconnect/ut/channel_scheduler_ut.cpp
new file mode 100644
index 0000000000..565a511859
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/channel_scheduler_ut.cpp
@@ -0,0 +1,115 @@
+#include <library/cpp/actors/interconnect/channel_scheduler.h>
+#include <library/cpp/actors/interconnect/events_local.h>
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NActors;
+
Y_UNIT_TEST_SUITE(ChannelScheduler) {

    // Feeds events of different sizes into several channels and verifies the
    // scheduler shares output bandwidth fairly: over a sliding window of 32
    // packets, per-channel byte counts must stay close to their mean
    // (relative standard deviation < 1).
    Y_UNIT_TEST(PriorityTraffic) {
        auto common = MakeIntrusive<TInterconnectProxyCommon>();
        common->MonCounters = MakeIntrusive<NMonitoring::TDynamicCounters>();
        std::shared_ptr<IInterconnectMetrics> ctr = CreateInterconnectCounters(common);
        ctr->SetPeerInfo("peer", "1");
        auto callback = [](THolder<IEventBase>) {};
        TEventHolderPool pool(common, callback);
        TSessionParams p;
        TChannelScheduler scheduler(1, {}, ctr, pool, 64 << 20, p);

        ui32 numEvents = 0;

        // Enqueue one serialized event of `size` bytes into `channel`,
        // registering the channel with the scheduler on first use.
        auto pushEvent = [&](size_t size, int channel) {
            TString payload(size, 'X');
            auto ev = MakeHolder<IEventHandle>(1, 0, TActorId(), TActorId(), MakeIntrusive<TEventSerializedData>(payload, false), 0);
            auto& ch = scheduler.GetOutputChannel(channel);
            const bool wasWorking = ch.IsWorking();
            ch.Push(*ev);
            if (!wasWorking) {
                scheduler.AddToHeap(ch, 0);
            }
            ++numEvents;
        };

        // Initial load: 100 large events on ch1, 1000 small ones on ch2.
        for (ui32 i = 0; i < 100; ++i) {
            pushEvent(10000, 1);
        }

        for (ui32 i = 0; i < 1000; ++i) {
            pushEvent(1000, 2);
        }

        // run: per-channel byte totals over the sliding window of packets.
        std::map<ui16, ui32> run;
        ui32 step = 0;

        std::deque<std::map<ui16, ui32>> window;

        for (; numEvents; ++step) {
            TTcpPacketOutTask task(p);

            // Inject a third channel mid-run to check rebalancing.
            if (step == 100) {
                for (ui32 i = 0; i < 200; ++i) {
                    pushEvent(1000, 3);
                }
            }

            std::map<ui16, ui32> ch;

            // Fill one packet, always drawing from the least-consumed channel.
            while (numEvents) {
                TEventOutputChannel *channel = scheduler.PickChannelWithLeastConsumedWeight();
                ui32 before = task.GetDataSize();
                ui64 weightConsumed = 0;
                numEvents -= channel->FeedBuf(task, 0, &weightConsumed);
                ui32 after = task.GetDataSize();
                Y_VERIFY(after >= before);
                scheduler.FinishPick(weightConsumed, 0);
                const ui32 bytesAdded = after - before;
                if (!bytesAdded) {
                    break;
                }
                ch[channel->ChannelId] += bytesAdded;
            }

            scheduler.Equalize();

            for (const auto& [key, value] : ch) {
                run[key] += value;
            }
            window.push_back(ch);

            // Slide the window: retire the oldest packet's contribution.
            if (window.size() == 32) {
                for (const auto& [key, value] : window.front()) {
                    run[key] -= value;
                    if (!run[key]) {
                        run.erase(key);
                    }
                }
                window.pop_front();
            }

            // Fairness metric: relative standard deviation across channels.
            double mean = 0.0;
            for (const auto& [key, value] : run) {
                mean += value;
            }
            mean /= run.size();

            double dev = 0.0;
            for (const auto& [key, value] : run) {
                dev += (value - mean) * (value - mean);
            }
            dev = sqrt(dev / run.size());

            double devToMean = dev / mean;

            Cerr << step << ": ";
            for (const auto& [key, value] : run) {
                Cerr << "ch" << key << "=" << value << " ";
            }
            Cerr << "mean# " << mean << " dev# " << dev << " part# " << devToMean;

            Cerr << Endl;

            UNIT_ASSERT(devToMean < 1);
        }
    }

}
diff --git a/library/cpp/actors/interconnect/ut/dynamic_proxy_ut.cpp b/library/cpp/actors/interconnect/ut/dynamic_proxy_ut.cpp
new file mode 100644
index 0000000000..3c474979dc
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/dynamic_proxy_ut.cpp
@@ -0,0 +1,179 @@
+#include <library/cpp/actors/interconnect/ut/lib/node.h>
+#include <library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h>
+#include <library/cpp/testing/unittest/registar.h>
+
+TActorId MakeResponderServiceId(ui32 nodeId) { // well-known service id: same 12-byte name registered on every node
+ return TActorId(nodeId, TStringBuf("ResponderAct", 12));
+}
+
+class TArriveQueue { // thread-safe sink recording replies/undelivery notices in arrival order
+ struct TArrivedItem {
+ ui32 QueueId; // high 32 bits of the cookie: id of the sending queue
+ ui32 Index; // low 32 bits of the cookie: sequence number within that queue
+ bool Success; // true for a delivered ping, false for TEvUndelivered
+ };
+
+ TMutex Lock; // guards Counter and Items
+ std::size_t Counter = 0; // number of results recorded so far
+ std::vector<TArrivedItem> Items; // preallocated to full expected capacity in the ctor
+
+public:
+ TArriveQueue(size_t capacity)
+ : Items(capacity)
+ {}
+
+ bool Done() const { // true once every expected message has been accounted for
+ with_lock (Lock) {
+ return Counter == Items.size();
+ }
+ }
+
+ void Push(ui64 cookie, bool success) { // cookie layout: (queueId << 32) | index
+ with_lock (Lock) {
+ const size_t pos = Counter++;
+ TArrivedItem item{.QueueId = static_cast<ui32>(cookie >> 32), .Index = static_cast<ui32>(cookie & 0xffff'ffff),
+ .Success = success};
+ memcpy(&Items[pos], &item, sizeof(TArrivedItem));
+ }
+ }
+
+ void Check() { // verifies per-queue ordering: all Ok indexes first, then Error, each strictly increasing
+ struct TPerQueueState {
+ std::vector<ui32> Ok, Error;
+ };
+ std::unordered_map<ui32, TPerQueueState> state;
+ for (const TArrivedItem& item : Items) {
+ auto& st = state[item.QueueId];
+ auto& v = item.Success ? st.Ok : st.Error;
+ v.push_back(item.Index);
+ }
+ for (const auto& [queueId, st] : state) {
+ ui32 expected = 0;
+ for (const ui32 index : st.Ok) {
+ Y_VERIFY(index == expected); // successful deliveries must arrive in send order
+ ++expected;
+ }
+ for (const ui32 index : st.Error) {
+ Y_VERIFY(index == expected); // undelivered notices follow, also in order
+ ++expected;
+ }
+ if (st.Error.size()) {
+ Cerr << "Error.size# " << st.Error.size() << Endl;
+ }
+ }
+ }
+};
+
+class TResponder : public TActor<TResponder> { // service actor: records every received ping as a successful arrival
+ TArriveQueue& ArriveQueue;
+
+public:
+ TResponder(TArriveQueue& arriveQueue)
+ : TActor(&TResponder::StateFunc)
+ , ArriveQueue(arriveQueue)
+ {}
+
+ STRICT_STFUNC(StateFunc,
+ hFunc(TEvents::TEvPing, Handle);
+ )
+
+ void Handle(TEvents::TEvPing::TPtr ev) {
+ ArriveQueue.Push(ev->Cookie, true); // record success under the sender-chosen cookie
+ }
+};
+
+class TSender : public TActor<TSender> { // per-thread sender actor: records delivery failures
+ TArriveQueue& ArriveQueue;
+
+public:
+ TSender(TArriveQueue& arriveQueue)
+ : TActor(&TThis::StateFunc)
+ , ArriveQueue(arriveQueue)
+ {}
+
+ STRICT_STFUNC(StateFunc,
+ hFunc(TEvents::TEvUndelivered, Handle);
+ )
+
+ void Handle(TEvents::TEvUndelivered::TPtr ev) {
+ ArriveQueue.Push(ev->Cookie, false); // delivery failed; still accounted for so Done() can complete
+ }
+};
+
+void SenderThread(TMutex& lock, TActorSystem *as, ui32 nodeId, ui32 queueId, ui32 count, TArriveQueue& arriveQueue) { // worker body: sends `count` tracked pings to node `nodeId`
+ const TActorId sender = as->Register(new TSender(arriveQueue));
+ with_lock(lock) {} // start barrier: blocks until the spawner releases the lock, so all threads begin racing together
+ const TActorId target = MakeResponderServiceId(nodeId);
+ for (ui32 i = 0; i < count; ++i) {
+ const ui32 flags = IEventHandle::FlagTrackDelivery;
+ as->Send(new IEventHandle(TEvents::THelloWorld::Ping, flags, target, sender, nullptr, ((ui64)queueId << 32) | i));
+ }
+}
+
+void RaceTestIter(ui32 numThreads, ui32 count) { // one race iteration: 6 nodes (3 dynamic), 10 sender threads per directed node pair
+ TPortManager portman;
+ THashMap<ui32, ui16> nodeToPort;
+ const ui32 numNodes = 6; // total
+ const ui32 numDynamicNodes = 3;
+ for (ui32 i = 1; i <= numNodes; ++i) {
+ nodeToPort.emplace(i, portman.GetPort());
+ }
+
+ NMonitoring::TDynamicCounterPtr counters = new NMonitoring::TDynamicCounters;
+ std::list<TNode> nodes;
+ for (ui32 i = 1; i <= numNodes; ++i) {
+ nodes.emplace_back(i, numNodes, nodeToPort, "127.1.0.0", counters->GetSubgroup("nodeId", TStringBuilder() << i),
+ TDuration::Seconds(10), TChannelsConfig(), numDynamicNodes, numThreads);
+ }
+
+ const ui32 numSenders = 10;
+ TArriveQueue arriveQueue(numSenders * numNodes * (numNodes - 1) * count); // every message must arrive or bounce
+ for (TNode& node : nodes) {
+ node.RegisterServiceActor(MakeResponderServiceId(node.GetActorSystem()->NodeId), new TResponder(arriveQueue));
+ }
+
+ TMutex lock;
+ std::list<TThread> threads;
+ ui32 queueId = 0;
+ with_lock(lock) { // hold the start barrier while spawning; workers all begin sending on release
+ for (TNode& from : nodes) {
+ for (ui32 toId = 1; toId <= numNodes; ++toId) {
+ if (toId == from.GetActorSystem()->NodeId) {
+ continue; // no self-sends
+ }
+ for (ui32 i = 0; i < numSenders; ++i) {
+ threads.emplace_back([=, &lock, &from, &arriveQueue] {
+ SenderThread(lock, from.GetActorSystem(), toId, queueId, count, arriveQueue);
+ });
+ ++queueId;
+ }
+ }
+ }
+ for (auto& thread : threads) {
+ thread.Start();
+ }
+ }
+ for (auto& thread : threads) {
+ thread.Join();
+ }
+
+ for (THPTimer timer; !arriveQueue.Done(); Sleep(TDuration::MilliSeconds(10))) { // FIX: actually sleep between polls — the original constructed and discarded a TDuration temporary, busy-spinning
+ Y_VERIFY(timer.Passed() < 10); // fail the test if delivery does not complete within 10 seconds
+ }
+
+ nodes.clear();
+ arriveQueue.Check(); // validate per-queue ordering after all actor systems have stopped
+}
+
+Y_UNIT_TEST_SUITE(DynamicProxy) {
+ Y_UNIT_TEST(RaceCheck1) { // one message per sender, thread-pool size cycling 1..5
+ for (ui32 iteration = 0; iteration < 100; ++iteration) {
+ RaceTestIter(1 + iteration % 5, 1);
+ }
+ }
+ Y_UNIT_TEST(RaceCheck10) { // ten messages per sender to stress per-queue ordering
+ for (ui32 iteration = 0; iteration < 100; ++iteration) {
+ RaceTestIter(1 + iteration % 5, 10);
+ }
+ }
+}
diff --git a/library/cpp/actors/interconnect/ut/event_holder_pool_ut.cpp b/library/cpp/actors/interconnect/ut/event_holder_pool_ut.cpp
new file mode 100644
index 0000000000..e6b2bd4e4c
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/event_holder_pool_ut.cpp
@@ -0,0 +1,59 @@
+#include <library/cpp/testing/unittest/registar.h>
+#include <library/cpp/actors/core/events.h>
+#include <library/cpp/actors/core/event_local.h>
+#include <library/cpp/actors/interconnect/interconnect_common.h>
+#include <library/cpp/monlib/dynamic_counters/counters.h>
+#include <library/cpp/actors/interconnect/event_holder_pool.h>
+
+#include <atomic>
+
+using namespace NActors;
+
+template<typename T>
+TEventHolderPool Setup(T&& callback) { // builds a pool with a 1 MiB destructor-queue limit; `callback` receives events released by Trim
+ auto common = MakeIntrusive<TInterconnectProxyCommon>();
+ common->DestructorQueueSize = std::make_shared<std::atomic<TAtomicBase>>();
+ common->MaxDestructorQueueSize = 1024 * 1024;
+ return TEventHolderPool(common, callback);
+}
+
+Y_UNIT_TEST_SUITE(EventHolderPool) {
+
+ Y_UNIT_TEST(Overflow) { // exercises destructor-queue trimming once buffered event data crosses the limit
+ TDeque<THolder<IEventBase>> freeQ; // events handed back by the pool's destructor callback
+ auto callback = [&](THolder<IEventBase> event) {
+ freeQ.push_back(std::move(event));
+ };
+ auto pool = Setup(std::move(callback));
+
+ std::list<TEventHolder> q;
+
+ auto& ev1 = pool.Allocate(q);
+ ev1.Buffer = MakeIntrusive<TEventSerializedData>(TString::Uninitialized(512 * 1024), true); // each event carries 512 KiB — half the 1 MiB limit
+
+ auto& ev2 = pool.Allocate(q);
+ ev2.Buffer = MakeIntrusive<TEventSerializedData>(TString::Uninitialized(512 * 1024), true);
+
+ auto& ev3 = pool.Allocate(q);
+ ev3.Buffer = MakeIntrusive<TEventSerializedData>(TString::Uninitialized(512 * 1024), true);
+
+ auto& ev4 = pool.Allocate(q);
+ ev4.Buffer = MakeIntrusive<TEventSerializedData>(TString::Uninitialized(512 * 1024), true);
+
+ pool.Release(q, q.begin());
+ pool.Release(q, q.begin()); // 1 MiB now queued for destruction
+ pool.Trim();
+ UNIT_ASSERT_VALUES_EQUAL(freeQ.size(), 1); // Trim delivers the pending destructors as a single callback
+
+ pool.Release(q, q.begin());
+ UNIT_ASSERT_VALUES_EQUAL(freeQ.size(), 1); // no Trim yet — nothing new delivered
+
+ freeQ.clear();
+ pool.Release(q, q.begin());
+ pool.Trim();
+ UNIT_ASSERT_VALUES_EQUAL(freeQ.size(), 1);
+
+ freeQ.clear(); // if we don't do this, we may crash due to the order of object destruction
+ }
+
+}
diff --git a/library/cpp/actors/interconnect/ut/interconnect_ut.cpp b/library/cpp/actors/interconnect/ut/interconnect_ut.cpp
new file mode 100644
index 0000000000..8ef0b1507c
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/interconnect_ut.cpp
@@ -0,0 +1,177 @@
+#include <library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h>
+#include <library/cpp/testing/unittest/registar.h>
+#include <library/cpp/digest/md5/md5.h>
+#include <util/random/fast.h>
+
+using namespace NActors;
+
+class TSenderActor : public TActorBootstrapped<TSenderActor> { // floods Recipient with random payloads and checks MD5-hashed pongs across session breaks
+ const TActorId Recipient;
+ using TSessionToCookie = std::unordered_multimap<TActorId, ui64, THash<TActorId>>;
+ TSessionToCookie SessionToCookie; // cookies issued within each interconnect session
+ std::unordered_map<ui64, std::pair<TSessionToCookie::iterator, TString>> InFlight; // cookie -> (session link, expected MD5 digest)
+ std::unordered_map<ui64, TString> Tentative; // requests whose session died; a reply may or may not still arrive
+ ui64 NextCookie = 0;
+ TActorId SessionId; // current interconnect session actor; empty while disconnected
+ bool SubscribeInFlight = false;
+
+public:
+ TSenderActor(TActorId recipient)
+ : Recipient(recipient)
+ {}
+
+ void Bootstrap() {
+ Become(&TThis::StateFunc);
+ Subscribe();
+ }
+
+ void Subscribe() { // (re)subscribe to the proxy's session state notifications
+ Cerr << (TStringBuilder() << "Subscribe" << Endl);
+ Y_VERIFY(!SubscribeInFlight);
+ SubscribeInFlight = true;
+ Send(TActivationContext::InterconnectProxy(Recipient.NodeId()), new TEvents::TEvSubscribe);
+ }
+
+ void IssueQueries() { // keep up to 10 random-payload pings in flight
+ if (!SessionId) {
+ return;
+ }
+ while (InFlight.size() < 10) {
+ size_t len = RandomNumber<size_t>(65536) + 1;
+ TString data = TString::Uninitialized(len);
+ TReallyFastRng32 rng(RandomNumber<ui32>());
+ char *p = data.Detach();
+ for (size_t i = 0; i < len; ++i) {
+ p[i] = rng();
+ }
+ const TSessionToCookie::iterator s2cIt = SessionToCookie.emplace(SessionId, NextCookie);
+ InFlight.emplace(NextCookie, std::make_tuple(s2cIt, MD5::CalcRaw(data))); // remember the digest the pong must echo
+ TActivationContext::Send(new IEventHandle(TEvents::THelloWorld::Ping, IEventHandle::FlagTrackDelivery, Recipient,
+ SelfId(), MakeIntrusive<TEventSerializedData>(std::move(data), false), NextCookie));
+// Cerr << (TStringBuilder() << "Send# " << NextCookie << Endl);
+ ++NextCookie;
+ }
+ }
+
+ void HandlePong(TAutoPtr<IEventHandle> ev) { // a pong must match either an in-flight or a tentative request
+// Cerr << (TStringBuilder() << "Receive# " << ev->Cookie << Endl);
+ if (const auto it = InFlight.find(ev->Cookie); it != InFlight.end()) {
+ auto& [s2cIt, hash] = it->second;
+ Y_VERIFY(hash == ev->GetChainBuffer()->GetString()); // digest must match what we computed at send time
+ SessionToCookie.erase(s2cIt);
+ InFlight.erase(it);
+ } else if (const auto it = Tentative.find(ev->Cookie); it != Tentative.end()) {
+ Y_VERIFY(it->second == ev->GetChainBuffer()->GetString());
+ Tentative.erase(it);
+ } else {
+ Y_FAIL("Cookie# %" PRIu64, ev->Cookie); // a pong for a cookie we never tracked
+ }
+ IssueQueries();
+ }
+
+ void Handle(TEvInterconnect::TEvNodeConnected::TPtr ev) {
+ Cerr << (TStringBuilder() << "TEvNodeConnected" << Endl);
+ Y_VERIFY(SubscribeInFlight);
+ SubscribeInFlight = false;
+ Y_VERIFY(!SessionId);
+ SessionId = ev->Sender; // the session actor id arrives as the notification's sender
+ IssueQueries();
+ }
+
+ void Handle(TEvInterconnect::TEvNodeDisconnected::TPtr ev) { // demote everything sent via the dead session to Tentative
+ Cerr << (TStringBuilder() << "TEvNodeDisconnected" << Endl);
+ SubscribeInFlight = false;
+ if (SessionId) {
+ Y_VERIFY(SessionId == ev->Sender);
+ auto r = SessionToCookie.equal_range(SessionId);
+ for (auto it = r.first; it != r.second; ++it) {
+ const auto inFlightIt = InFlight.find(it->second);
+ Y_VERIFY(inFlightIt != InFlight.end());
+ Tentative.emplace(inFlightIt->first, inFlightIt->second.second); // reply may still arrive later
+ InFlight.erase(it->second);
+ }
+ SessionToCookie.erase(r.first, r.second);
+ SessionId = TActorId();
+ }
+ Schedule(TDuration::MilliSeconds(100), new TEvents::TEvWakeup); // retry the subscription shortly
+ }
+
+ void Handle(TEvents::TEvUndelivered::TPtr ev) { // ping bounced: demote to Tentative and keep the pipeline full
+ Cerr << (TStringBuilder() << "TEvUndelivered Cookie# " << ev->Cookie << Endl);
+ if (const auto it = InFlight.find(ev->Cookie); it != InFlight.end()) {
+ auto& [s2cIt, hash] = it->second;
+ Tentative.emplace(it->first, hash);
+ SessionToCookie.erase(s2cIt);
+ InFlight.erase(it);
+ IssueQueries();
+ }
+ }
+
+ STRICT_STFUNC(StateFunc,
+ fFunc(TEvents::THelloWorld::Pong, HandlePong);
+ hFunc(TEvInterconnect::TEvNodeConnected, Handle);
+ hFunc(TEvInterconnect::TEvNodeDisconnected, Handle);
+ hFunc(TEvents::TEvUndelivered, Handle);
+ cFunc(TEvents::TSystem::Wakeup, Subscribe);
+ )
+};
+
+class TRecipientActor : public TActor<TRecipientActor> { // replies to each ping with the MD5 digest of its payload
+public:
+ TRecipientActor()
+ : TActor(&TThis::StateFunc)
+ {}
+
+ void HandlePing(TAutoPtr<IEventHandle>& ev) {
+ const TString& data = ev->GetChainBuffer()->GetString();
+ const TString& response = MD5::CalcRaw(data); // raw digest; the sender compares it with its own hash
+ TActivationContext::Send(new IEventHandle(TEvents::THelloWorld::Pong, 0, ev->Sender, SelfId(),
+ MakeIntrusive<TEventSerializedData>(response, false), ev->Cookie));
+ }
+
+ STRICT_STFUNC(StateFunc,
+ fFunc(TEvents::THelloWorld::Ping, HandlePing);
+ )
+};
+
+Y_UNIT_TEST_SUITE(Interconnect) {
+
+ Y_UNIT_TEST(SessionContinuation) { // randomly disrupts the link in three ways; traffic must survive every break
+ TTestICCluster cluster(2);
+ const TActorId recipient = cluster.RegisterActor(new TRecipientActor, 1);
+ cluster.RegisterActor(new TSenderActor(recipient), 2);
+ for (ui32 i = 0; i < 100; ++i) {
+ const ui32 nodeId = 1 + RandomNumber(2u);
+ const ui32 peerNodeId = 3 - nodeId; // the other node of the pair
+ const ui32 action = RandomNumber(3u);
+ auto *node = cluster.GetNode(nodeId);
+ TActorId proxyId = node->InterconnectProxy(peerNodeId);
+
+ switch (action) {
+ case 0:
+ node->Send(proxyId, new TEvInterconnect::TEvClosePeerSocket);
+ Cerr << (TStringBuilder() << "nodeId# " << nodeId << " peerNodeId# " << peerNodeId
+ << " TEvClosePeerSocket" << Endl);
+ break;
+
+ case 1:
+ node->Send(proxyId, new TEvInterconnect::TEvCloseInputSession);
+ Cerr << (TStringBuilder() << "nodeId# " << nodeId << " peerNodeId# " << peerNodeId
+ << " TEvCloseInputSession" << Endl);
+ break;
+
+ case 2:
+ node->Send(proxyId, new TEvInterconnect::TEvPoisonSession);
+ Cerr << (TStringBuilder() << "nodeId# " << nodeId << " peerNodeId# " << peerNodeId
+ << " TEvPoisonSession" << Endl);
+ break;
+
+ default:
+ Y_FAIL();
+ }
+
+ Sleep(TDuration::MilliSeconds(RandomNumber<ui32>(500) + 100)); // let traffic flow between disruptions
+ }
+ }
+
+}
diff --git a/library/cpp/actors/interconnect/ut/large.cpp b/library/cpp/actors/interconnect/ut/large.cpp
new file mode 100644
index 0000000000..ba2a50c6f6
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/large.cpp
@@ -0,0 +1,85 @@
+#include "lib/ic_test_cluster.h"
+#include "lib/test_events.h"
+#include "lib/test_actors.h"
+
+#include <library/cpp/actors/interconnect/interconnect_tcp_proxy.h>
+
+#include <library/cpp/testing/unittest/tests_data.h>
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/system/event.h>
+#include <util/system/sanitizers.h>
+
+Y_UNIT_TEST_SUITE(LargeMessage) {
+ using namespace NActors;
+
+ class TProducer: public TActorBootstrapped<TProducer> { // sends a small, a huge (128 MiB), then another small message
+ const TActorId RecipientActorId;
+
+ public:
+ TProducer(const TActorId& recipientActorId)
+ : RecipientActorId(recipientActorId)
+ {}
+
+ void Bootstrap(const TActorContext& ctx) {
+ Become(&TThis::StateFunc);
+ ctx.Send(RecipientActorId, new TEvTest(1, "hello"), IEventHandle::FlagTrackDelivery, 1);
+ ctx.Send(RecipientActorId, new TEvTest(2, TString(128 * 1024 * 1024, 'X')), IEventHandle::FlagTrackDelivery, 2); // 128 MiB payload — expected to bounce
+ }
+
+ void Handle(TEvents::TEvUndelivered::TPtr ev, const TActorContext& ctx) {
+ if (ev->Cookie == 2) { // only the oversized message should come back undelivered
+ Cerr << "TEvUndelivered\n";
+ ctx.Send(RecipientActorId, new TEvTest(3, "hello"), IEventHandle::FlagTrackDelivery, 3);
+ }
+ }
+
+ STRICT_STFUNC(StateFunc,
+ HFunc(TEvents::TEvUndelivered, Handle)
+ )
+ };
+
+ class TConsumer : public TActorBootstrapped<TConsumer> { // expects messages 1 and 3 to arrive via different sessions
+ TManualEvent& Done;
+ TActorId SessionId;
+
+ public:
+ TConsumer(TManualEvent& done)
+ : Done(done)
+ {
+ }
+
+ void Bootstrap(const TActorContext& /*ctx*/) {
+ Become(&TThis::StateFunc);
+ }
+
+ void Handle(TEvTest::TPtr ev, const TActorContext& /*ctx*/) {
+ const auto& record = ev->Get()->Record;
+ Cerr << "RECEIVED TEvTest\n";
+ if (record.GetSequenceNumber() == 1) {
+ Y_VERIFY(!SessionId);
+ SessionId = ev->InterconnectSession; // remember which session carried the first message
+ } else if (record.GetSequenceNumber() == 3) {
+ Y_VERIFY(SessionId != ev->InterconnectSession); // message 3 must arrive via a fresh session
+ Done.Signal();
+ } else {
+ Y_FAIL("incorrect sequence number"); // message 2 must never be delivered
+ }
+ }
+
+ STRICT_STFUNC(StateFunc,
+ HFunc(TEvTest, Handle)
+ )
+ };
+
+ Y_UNIT_TEST(Test) { // oversized-message handling: session is dropped and re-established transparently
+ TTestICCluster testCluster(2);
+
+ TManualEvent done;
+ TConsumer* consumer = new TConsumer(done);
+ const TActorId recp = testCluster.RegisterActor(consumer, 1);
+ testCluster.RegisterActor(new TProducer(recp), 2);
+ done.WaitI(); // block until the consumer sees message 3 on a new session
+ }
+
+}
diff --git a/library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h b/library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h
new file mode 100644
index 0000000000..2b6d27cd3f
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h
@@ -0,0 +1,84 @@
+#pragma once
+
+#include "node.h"
+#include "interrupter.h"
+
+#include <library/cpp/actors/interconnect/interconnect_tcp_proxy.h>
+#include <library/cpp/actors/core/events.h>
+#include <library/cpp/testing/unittest/tests_data.h>
+
+#include <util/generic/noncopyable.h>
+
+class TTestICCluster: public TNonCopyable {
+public:
+ struct TTrafficInterrupterSettings {
+ TDuration RejectingTrafficTimeout;
+ double BandWidth;
+ bool Disconnect;
+ };
+
+private:
+ const ui32 NumNodes;
+ const TString Address = "::1";
+ TDuration DeadPeerTimeout = TDuration::Seconds(2);
+ NMonitoring::TDynamicCounterPtr Counters;
+ THashMap<ui32, THolder<TNode>> Nodes;
+ TList<TTrafficInterrupter> interrupters;
+ NActors::TChannelsConfig ChannelsConfig;
+ TPortManager PortManager;
+
+public:
+ TTestICCluster(ui32 numNodes = 1, NActors::TChannelsConfig channelsConfig = NActors::TChannelsConfig(),
+ TTrafficInterrupterSettings* tiSettings = nullptr)
+ : NumNodes(numNodes)
+ , Counters(new NMonitoring::TDynamicCounters)
+ , ChannelsConfig(channelsConfig)
+ {
+ THashMap<ui32, ui16> nodeToPortMap;
+ THashMap<ui32, THashMap<ui32, ui16>> specificNodePortMap;
+
+ for (ui32 i = 1; i <= NumNodes; ++i) {
+ nodeToPortMap.emplace(i, PortManager.GetPort());
+ }
+
+ if (tiSettings) {
+ ui32 nodeId;
+ ui16 listenPort;
+ ui16 forwardPort;
+ for (auto& item : nodeToPortMap) {
+ nodeId = item.first;
+ listenPort = item.second;
+ forwardPort = PortManager.GetPort();
+
+ specificNodePortMap[nodeId] = nodeToPortMap;
+ specificNodePortMap[nodeId].at(nodeId) = forwardPort;
+ interrupters.emplace_back(Address, listenPort, forwardPort, tiSettings->RejectingTrafficTimeout, tiSettings->BandWidth, tiSettings->Disconnect);
+ interrupters.back().Start();
+ }
+ }
+
+ for (ui32 i = 1; i <= NumNodes; ++i) {
+ auto& portMap = tiSettings ? specificNodePortMap[i] : nodeToPortMap;
+ Nodes.emplace(i, MakeHolder<TNode>(i, NumNodes, portMap, Address, Counters, DeadPeerTimeout, ChannelsConfig));
+ }
+ }
+
+ TNode* GetNode(ui32 id) {
+ return Nodes[id].Get();
+ }
+
+ ~TTestICCluster() {
+ }
+
+ TActorId RegisterActor(NActors::IActor* actor, ui32 nodeId) {
+ return Nodes[nodeId]->RegisterActor(actor);
+ }
+
+ TActorId InterconnectProxy(ui32 peerNodeId, ui32 nodeId) {
+ return Nodes[nodeId]->InterconnectProxy(peerNodeId);
+ }
+
+ void KillActor(ui32 nodeId, const TActorId& id) {
+ Nodes[nodeId]->Send(id, new NActors::TEvents::TEvPoisonPill);
+ }
+};
diff --git a/library/cpp/actors/interconnect/ut/lib/interrupter.h b/library/cpp/actors/interconnect/ut/lib/interrupter.h
new file mode 100644
index 0000000000..48851de2c5
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/lib/interrupter.h
@@ -0,0 +1,249 @@
+#pragma once
+
+#include <library/cpp/testing/unittest/tests_data.h>
+
+#include <util/network/sock.h>
+#include <util/network/poller.h>
+#include <util/system/thread.h>
+#include <util/system/hp_timer.h>
+#include <util/generic/list.h>
+#include <util/generic/set.h>
+#include <util/generic/vector.h>
+#include <util/generic/deque.h>
+#include <util/random/random.h>
+
+#include <iterator>
+
+class TTrafficInterrupter
+ : public ISimpleThread {
+ const TString Address;
+ const ui16 ForwardPort;
+ TInet6StreamSocket ListenSocket;
+
+ struct TConnectionDescriptor;
+ struct TDelayedPacket {
+ TInet6StreamSocket* ForwardSocket = nullptr;
+ TVector<char> Data;
+ };
+ struct TCompare {
+ bool operator()(const std::pair<TInstant, TDelayedPacket>& x, const std::pair<TInstant, TDelayedPacket>& y) const {
+ return x.first > y.first;
+ };
+ };
+
+ struct TDirectedConnection {
+ TInet6StreamSocket* Source = nullptr;
+ TInet6StreamSocket* Destination = nullptr;
+ TList<TConnectionDescriptor>::iterator ListIterator;
+ TInstant Timestamp;
+ TPriorityQueue<std::pair<TInstant, TDelayedPacket>, TVector<std::pair<TInstant, TDelayedPacket>>, TCompare> DelayedQueue;
+
+ TDirectedConnection(TInet6StreamSocket* source, TInet6StreamSocket* destination)
+ : Source(source)
+ , Destination(destination)
+ {
+ }
+ };
+
+ struct TConnectionDescriptor {
+ std::unique_ptr<TInet6StreamSocket> FirstSocket;
+ std::unique_ptr<TInet6StreamSocket> SecondSocket;
+ TDirectedConnection ForwardConnection;
+ TDirectedConnection BackwardConnection;
+
+ TConnectionDescriptor(std::unique_ptr<TInet6StreamSocket> firstSock,
+ std::unique_ptr<TInet6StreamSocket> secondSock)
+ : FirstSocket(std::move(firstSock))
+ , SecondSocket(std::move(secondSock))
+ , ForwardConnection(FirstSocket.get(), SecondSocket.get())
+ , BackwardConnection(SecondSocket.get(), FirstSocket.get())
+ {
+ }
+ };
+
+ template <class It = TList<TConnectionDescriptor>::iterator>
+ class TCustomListIteratorCompare {
+ public:
+ bool operator()(const It& it1, const It& it2) const {
+ return (&(*it1) < &(*it2));
+ }
+ };
+
+ TList<TConnectionDescriptor> Connections;
+ TSet<TList<TConnectionDescriptor>::iterator, TCustomListIteratorCompare<>> DroppedConnections;
+
+public:
+ TTrafficInterrupter(TString address, ui16 listenPort, ui16 forwardPort, TDuration rejectingTrafficTimeout, double bandwidth, bool disconnect = true)
+ : Address(std::move(address))
+ , ForwardPort(forwardPort)
+ , ListenSocket()
+ , RejectingTrafficTimeout(rejectingTrafficTimeout)
+ , CurrentRejectingTimeout(rejectingTrafficTimeout)
+ , RejectingStateTimer()
+ , Bandwidth(bandwidth)
+ , Disconnect(disconnect)
+ , RejectingTraffic(false)
+ {
+ SetReuseAddressAndPort(ListenSocket);
+ TSockAddrInet6 addr(Address.data(), listenPort);
+ Y_VERIFY(ListenSocket.Bind(&addr) == 0);
+ Y_VERIFY(ListenSocket.Listen(5) == 0);
+
+ DelayTraffic = (Bandwidth == 0.0) ? false : true;
+
+ ForwardAddrress.Reset(new TSockAddrInet6(Address.data(), ForwardPort));
+ const ui32 BufSize = DelayTraffic ? 4096 : 65536 + 4096;
+ Buf.resize(BufSize);
+ }
+
+ ~TTrafficInterrupter() {
+ AtomicSet(Running, 0);
+ this->Join();
+ }
+
+private:
+ TAtomic Running = 1;
+ TVector<char> Buf;
+ TSocketPoller SocketPoller;
+ THolder<TSockAddrInet6> ForwardAddrress;
+ TVector<void*> Events;
+ TDuration RejectingTrafficTimeout;
+ TDuration CurrentRejectingTimeout;
+ TDuration DefaultPollTimeout = TDuration::MilliSeconds(100);
+ TDuration DisconnectTimeout = TDuration::MilliSeconds(100);
+ THPTimer RejectingStateTimer;
+ THPTimer DisconnectTimer;
+ double Bandwidth;
+ const bool Disconnect;
+ bool RejectingTraffic;
+ bool DelayTraffic;
+
+ void UpdateRejectingState() {
+ if (TDuration::Seconds(std::abs(RejectingStateTimer.Passed())) > CurrentRejectingTimeout) {
+ RejectingStateTimer.Reset();
+ CurrentRejectingTimeout = (RandomNumber<ui32>(1) ? RejectingTrafficTimeout + TDuration::Seconds(1.0) : RejectingTrafficTimeout - TDuration::Seconds(0.2));
+ RejectingTraffic = !RejectingTraffic;
+ }
+ }
+
+ void RandomlyDisconnect() {
+ if (TDuration::Seconds(std::abs(DisconnectTimer.Passed())) > DisconnectTimeout) {
+ DisconnectTimer.Reset();
+ if (RandomNumber<ui32>(100) > 90) {
+ if (!Connections.empty()) {
+ auto it = Connections.begin();
+ std::advance(it, RandomNumber<ui32>(Connections.size()));
+ SocketPoller.Unwait(static_cast<SOCKET>(*it->FirstSocket.get()));
+ SocketPoller.Unwait(static_cast<SOCKET>(*it->SecondSocket.get()));
+ Connections.erase(it);
+ }
+ }
+ }
+ }
+
+ void* ThreadProc() override {
+ int pollReadyCount = 0;
+ SocketPoller.WaitRead(static_cast<SOCKET>(ListenSocket), &ListenSocket);
+ Events.resize(10);
+
+ while (AtomicGet(Running)) {
+ if (RejectingTrafficTimeout != TDuration::Zero()) {
+ UpdateRejectingState();
+ }
+ if (Disconnect) {
+ RandomlyDisconnect();
+ }
+ if (!RejectingTraffic) {
+ TDuration timeout = DefaultPollTimeout;
+ auto updateTimout = [&timeout](TDirectedConnection& conn) {
+ if (conn.DelayedQueue) {
+ timeout = Min(timeout, conn.DelayedQueue.top().first - TInstant::Now());
+ }
+ };
+ for (auto& it : Connections) {
+ updateTimout(it.ForwardConnection);
+ updateTimout(it.BackwardConnection);
+ }
+ pollReadyCount = SocketPoller.WaitT(Events.data(), Events.size(), timeout);
+ if (pollReadyCount > 0) {
+ for (int i = 0; i < pollReadyCount; i++) {
+ HandleSocketPollEvent(Events[i]);
+ }
+ for (auto it : DroppedConnections) {
+ Connections.erase(it);
+ }
+ DroppedConnections.clear();
+ }
+ }
+ if (DelayTraffic) { // process packets from DelayQueues
+ auto processDelayedPackages = [](TDirectedConnection& conn) {
+ while (!conn.DelayedQueue.empty()) {
+ auto& frontPackage = conn.DelayedQueue.top();
+ if (TInstant::Now() >= frontPackage.first) {
+ TInet6StreamSocket* sock = frontPackage.second.ForwardSocket;
+ if (sock) {
+ sock->Send(frontPackage.second.Data.data(), frontPackage.second.Data.size());
+ }
+ conn.DelayedQueue.pop();
+ } else {
+ break;
+ }
+ }
+ };
+ for (auto& it : Connections) {
+ processDelayedPackages(it.ForwardConnection);
+ processDelayedPackages(it.BackwardConnection);
+ }
+ }
+ }
+ ListenSocket.Close();
+ return nullptr;
+ }
+
+ void HandleSocketPollEvent(void* ev) {
+ if (ev == static_cast<void*>(&ListenSocket)) {
+ TSockAddrInet6 origin;
+ Connections.emplace_back(TConnectionDescriptor(std::unique_ptr<TInet6StreamSocket>(new TInet6StreamSocket), std::unique_ptr<TInet6StreamSocket>(new TInet6StreamSocket)));
+ int err = ListenSocket.Accept(Connections.back().FirstSocket.get(), &origin);
+ if (!err) {
+ err = Connections.back().SecondSocket->Connect(ForwardAddrress.Get());
+ if (!err) {
+ Connections.back().ForwardConnection.ListIterator = --Connections.end();
+ Connections.back().BackwardConnection.ListIterator = --Connections.end();
+ SocketPoller.WaitRead(static_cast<SOCKET>(*Connections.back().FirstSocket), &Connections.back().ForwardConnection);
+ SocketPoller.WaitRead(static_cast<SOCKET>(*Connections.back().SecondSocket), &Connections.back().BackwardConnection);
+ } else {
+ Connections.back().FirstSocket->Close();
+ }
+ } else {
+ Connections.pop_back();
+ }
+ } else {
+ TDirectedConnection* directedConnection = static_cast<TDirectedConnection*>(ev);
+ int recvSize = 0;
+ do {
+ recvSize = directedConnection->Source->Recv(Buf.data(), Buf.size());
+ } while (recvSize == -EINTR);
+
+ if (recvSize > 0) {
+ if (DelayTraffic) {
+ // put packet into DelayQueue
+ const TDuration baseDelay = TDuration::MicroSeconds(recvSize * 1e6 / Bandwidth);
+ const TInstant now = TInstant::Now();
+ directedConnection->Timestamp = Max(now, directedConnection->Timestamp) + baseDelay;
+ TDelayedPacket pkt;
+ pkt.ForwardSocket = directedConnection->Destination;
+ pkt.Data.resize(recvSize);
+ memcpy(pkt.Data.data(), Buf.data(), recvSize);
+ directedConnection->DelayedQueue.emplace(directedConnection->Timestamp, std::move(pkt));
+ } else {
+ directedConnection->Destination->Send(Buf.data(), recvSize);
+ }
+ } else {
+ SocketPoller.Unwait(static_cast<SOCKET>(*directedConnection->Source));
+ SocketPoller.Unwait(static_cast<SOCKET>(*directedConnection->Destination));
+ DroppedConnections.emplace(directedConnection->ListIterator);
+ }
+ }
+ }
+};
diff --git a/library/cpp/actors/interconnect/ut/lib/node.h b/library/cpp/actors/interconnect/ut/lib/node.h
new file mode 100644
index 0000000000..ff30b1445e
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/lib/node.h
@@ -0,0 +1,137 @@
+#pragma once
+
+#include <library/cpp/actors/core/actorsystem.h>
+#include <library/cpp/actors/core/executor_pool_basic.h>
+#include <library/cpp/actors/core/scheduler_basic.h>
+#include <library/cpp/actors/core/mailbox.h>
+#include <library/cpp/actors/dnsresolver/dnsresolver.h>
+
+#include <library/cpp/actors/interconnect/interconnect_tcp_server.h>
+#include <library/cpp/actors/interconnect/interconnect_tcp_proxy.h>
+#include <library/cpp/actors/interconnect/interconnect_proxy_wrapper.h>
+
+using namespace NActors;
+
+class TNode {
+ THolder<TActorSystem> ActorSystem;
+
+public:
+ TNode(ui32 nodeId, ui32 numNodes, const THashMap<ui32, ui16>& nodeToPort, const TString& address,
+ NMonitoring::TDynamicCounterPtr counters, TDuration deadPeerTimeout,
+ TChannelsConfig channelsSettings = TChannelsConfig(),
+ ui32 numDynamicNodes = 0, ui32 numThreads = 1) {
+ TActorSystemSetup setup;
+ setup.NodeId = nodeId;
+ setup.ExecutorsCount = 1;
+ setup.Executors.Reset(new TAutoPtr<IExecutorPool>[setup.ExecutorsCount]);
+ for (ui32 i = 0; i < setup.ExecutorsCount; ++i) {
+ setup.Executors[i].Reset(new TBasicExecutorPool(i, numThreads, 20 /* magic number */));
+ }
+ setup.Scheduler.Reset(new TBasicSchedulerThread());
+ const ui32 interconnectPoolId = 0;
+
+ auto common = MakeIntrusive<TInterconnectProxyCommon>();
+ common->NameserviceId = GetNameserviceActorId();
+ common->MonCounters = counters->GetSubgroup("nodeId", ToString(nodeId));
+ common->ChannelsConfig = channelsSettings;
+ common->ClusterUUID = "cluster";
+ common->AcceptUUID = {common->ClusterUUID};
+ common->TechnicalSelfHostName = address;
+ common->Settings.Handshake = TDuration::Seconds(1);
+ common->Settings.DeadPeer = deadPeerTimeout;
+ common->Settings.CloseOnIdle = TDuration::Minutes(1);
+ common->Settings.SendBufferDieLimitInMB = 512;
+ common->Settings.TotalInflightAmountOfData = 512 * 1024;
+ common->Settings.TCPSocketBufferSize = 2048 * 1024;
+
+ setup.Interconnect.ProxyActors.resize(numNodes + 1 - numDynamicNodes);
+ setup.Interconnect.ProxyWrapperFactory = CreateProxyWrapperFactory(common, interconnectPoolId);
+
+ for (ui32 i = 1; i <= numNodes; ++i) {
+ if (i == nodeId) {
+ // create listener actor for local node "nodeId"
+ setup.LocalServices.emplace_back(TActorId(), TActorSetupCmd(new TInterconnectListenerTCP(address,
+ nodeToPort.at(nodeId), common), TMailboxType::ReadAsFilled, interconnectPoolId));
+ } else if (i <= numNodes - numDynamicNodes) {
+ // create proxy actor to reach node "i"
+ setup.Interconnect.ProxyActors[i] = {new TInterconnectProxyTCP(i, common),
+ TMailboxType::ReadAsFilled, interconnectPoolId};
+ }
+ }
+
+ setup.LocalServices.emplace_back(MakePollerActorId(), TActorSetupCmd(CreatePollerActor(),
+ TMailboxType::ReadAsFilled, 0));
+
+ const TActorId loggerActorId(0, "logger");
+ constexpr ui32 LoggerComponentId = 410; // NKikimrServices::LOGGER
+
+ auto loggerSettings = MakeIntrusive<NLog::TSettings>(
+ loggerActorId,
+ (NLog::EComponent)LoggerComponentId,
+ NLog::PRI_INFO,
+ NLog::PRI_DEBUG,
+ 0U);
+
+ loggerSettings->Append(
+ NActorsServices::EServiceCommon_MIN,
+ NActorsServices::EServiceCommon_MAX,
+ NActorsServices::EServiceCommon_Name
+ );
+
+ constexpr ui32 WilsonComponentId = 430; // NKikimrServices::WILSON
+ static const TString WilsonComponentName = "WILSON";
+
+ loggerSettings->Append(
+ (NLog::EComponent)WilsonComponentId,
+ (NLog::EComponent)WilsonComponentId + 1,
+ [](NLog::EComponent) -> const TString & { return WilsonComponentName; });
+
+ // register nameserver table
+ auto names = MakeIntrusive<TTableNameserverSetup>();
+ for (ui32 i = 1; i <= numNodes; ++i) {
+ names->StaticNodeTable[i] = TTableNameserverSetup::TNodeInfo(address, address, nodeToPort.at(i));
+ }
+ setup.LocalServices.emplace_back(
+ NDnsResolver::MakeDnsResolverActorId(),
+ TActorSetupCmd(
+ NDnsResolver::CreateOnDemandDnsResolver(),
+ TMailboxType::ReadAsFilled, interconnectPoolId));
+ setup.LocalServices.emplace_back(GetNameserviceActorId(), TActorSetupCmd(
+ CreateNameserverTable(names, interconnectPoolId), TMailboxType::ReadAsFilled,
+ interconnectPoolId));
+
+ // register logger
+ setup.LocalServices.emplace_back(loggerActorId, TActorSetupCmd(new TLoggerActor(loggerSettings,
+ CreateStderrBackend(), counters->GetSubgroup("subsystem", "logger")),
+ TMailboxType::ReadAsFilled, interconnectPoolId));
+
+ auto sp = MakeHolder<TActorSystemSetup>(std::move(setup));
+ ActorSystem.Reset(new TActorSystem(sp, nullptr, loggerSettings));
+ ActorSystem->Start();
+ }
+
+ ~TNode() {
+ ActorSystem->Stop();
+ }
+
+ bool Send(const TActorId& recipient, IEventBase* ev) {
+ return ActorSystem->Send(recipient, ev);
+ }
+
+ TActorId RegisterActor(IActor* actor) {
+ return ActorSystem->Register(actor);
+ }
+
+ TActorId InterconnectProxy(ui32 peerNodeId) {
+ return ActorSystem->InterconnectProxy(peerNodeId);
+ }
+
+ void RegisterServiceActor(const TActorId& serviceId, IActor* actor) {
+ const TActorId actorId = ActorSystem->Register(actor);
+ ActorSystem->RegisterLocalService(serviceId, actorId);
+ }
+
+ TActorSystem *GetActorSystem() const {
+ return ActorSystem.Get();
+ }
+};
diff --git a/library/cpp/actors/interconnect/ut/lib/test_actors.h b/library/cpp/actors/interconnect/ut/lib/test_actors.h
new file mode 100644
index 0000000000..7591200471
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/lib/test_actors.h
@@ -0,0 +1,83 @@
+#pragma once
+
+namespace NActors {
+    class TSenderBaseActor: public TActorBootstrapped<TSenderBaseActor> { // base for load-generating test actors
+    protected:
+        const TActorId RecipientActorId; // peer actor the generated messages are sent to
+        const ui32 Preload; // maximum number of unconfirmed (in-fly) messages
+        ui64 SequenceNumber = 0; // id of the next message to send
+        ui32 InFlySize = 0; // unconfirmed messages; derived classes maintain it
+
+    public:
+        TSenderBaseActor(const TActorId& recipientActorId, ui32 preload = 1)
+            : RecipientActorId(recipientActorId)
+            , Preload(preload)
+        {
+        }
+
+        virtual ~TSenderBaseActor() {
+        }
+
+        virtual void Bootstrap(const TActorContext& ctx) { // ask the proxy for a session; sending starts on TEvNodeConnected
+            Become(&TSenderBaseActor::StateFunc);
+            ctx.Send(ctx.ExecutorThread.ActorSystem->InterconnectProxy(RecipientActorId.NodeId()), new TEvInterconnect::TEvConnectNode);
+        }
+
+        virtual void SendMessagesIfPossible(const TActorContext& ctx) { // NOTE(review): base SendMessage() leaves InFlySize unchanged — a derived override must grow it or this loop never terminates; confirm
+            while (InFlySize < Preload) {
+                SendMessage(ctx);
+            }
+        }
+
+        virtual void SendMessage(const TActorContext& /*ctx*/) { // base only advances the sequence; derived classes do the actual Send
+            ++SequenceNumber;
+        }
+
+        virtual void Handle(TEvents::TEvUndelivered::TPtr& /*ev*/, const TActorContext& ctx) { // resend on delivery failure
+            SendMessage(ctx);
+        }
+
+        virtual void Handle(TEvTestResponse::TPtr& /*ev*/, const TActorContext& ctx) { // a confirmation frees an in-fly slot
+            SendMessagesIfPossible(ctx);
+        }
+
+        void Handle(TEvInterconnect::TEvNodeConnected::TPtr& /*ev*/, const TActorContext& ctx) { // session is up — start sending
+            SendMessagesIfPossible(ctx);
+        }
+
+        void Handle(TEvInterconnect::TEvNodeDisconnected::TPtr& /*ev*/, const TActorContext& /*ctx*/) { // ignored; TEvUndelivered drives resending
+        }
+
+        virtual void Handle(TEvents::TEvPoisonPill::TPtr& /*ev*/, const TActorContext& ctx) {
+            Die(ctx);
+        }
+
+        virtual STRICT_STFUNC(StateFunc,
+            HFunc(TEvTestResponse, Handle)
+            HFunc(TEvents::TEvUndelivered, Handle)
+            HFunc(TEvents::TEvPoisonPill, Handle)
+            HFunc(TEvInterconnect::TEvNodeConnected, Handle)
+            HFunc(TEvInterconnect::TEvNodeDisconnected, Handle)
+        )
+    };
+
+    class TReceiverBaseActor: public TActor<TReceiverBaseActor> { // base for message-sink test actors
+    protected:
+        ui64 ReceivedCount = 0; // total TEvTest messages seen; updated by derived handlers
+
+    public:
+        TReceiverBaseActor()
+            : TActor(&TReceiverBaseActor::StateFunc)
+        {
+        }
+
+        virtual ~TReceiverBaseActor() {
+        }
+
+        virtual STRICT_STFUNC(StateFunc,
+            HFunc(TEvTest, Handle)
+        )
+
+        virtual void Handle(TEvTest::TPtr& /*ev*/, const TActorContext& /*ctx*/) {} // base drops messages; override to count/confirm
+    };
+}
diff --git a/library/cpp/actors/interconnect/ut/lib/test_events.h b/library/cpp/actors/interconnect/ut/lib/test_events.h
new file mode 100644
index 0000000000..cd0d9e0152
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/lib/test_events.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <library/cpp/actors/interconnect/ut/protos/interconnect_test.pb.h>
+
+namespace NActors {
+    enum { // private event ids for the interconnect tests
+        EvTest = EventSpaceBegin(TEvents::ES_PRIVATE), // first id in the private event space
+        EvTestChan,
+        EvTestSmall,
+        EvTestLarge,
+        EvTestResponse,
+    };
+
+    struct TEvTest : TEventPB<TEvTest, NInterconnectTest::TEvTest, EvTest> { // generic sequence+payload test event
+        TEvTest() = default; // required for deserialization on the receiving side
+
+        TEvTest(ui64 sequenceNumber, const TString& payload) {
+            Record.SetSequenceNumber(sequenceNumber);
+            Record.SetPayload(payload);
+        }
+    };
+
+    struct TEvTestLarge : TEventPB<TEvTestLarge, NInterconnectTest::TEvTestLarge, EvTestLarge> { // same shape as TEvTest; distinct type/id for large-payload scenarios
+        TEvTestLarge() = default; // required for deserialization
+
+        TEvTestLarge(ui64 sequenceNumber, const TString& payload) {
+            Record.SetSequenceNumber(sequenceNumber);
+            Record.SetPayload(payload);
+        }
+    };
+
+    struct TEvTestSmall : TEventPB<TEvTestSmall, NInterconnectTest::TEvTestSmall, EvTestSmall> { // same shape as TEvTest; distinct type/id for small-payload scenarios
+        TEvTestSmall() = default; // required for deserialization
+
+        TEvTestSmall(ui64 sequenceNumber, const TString& payload) {
+            Record.SetSequenceNumber(sequenceNumber);
+            Record.SetPayload(payload);
+        }
+    };
+
+    struct TEvTestResponse : TEventPB<TEvTestResponse, NInterconnectTest::TEvTestResponse, EvTestResponse> { // receiver -> sender confirmation
+        TEvTestResponse() = default; // required for deserialization
+
+        TEvTestResponse(ui64 confirmedSequenceNumber) { // NOTE(review): single-arg ctor not marked explicit — allows implicit ui64 conversion
+            Record.SetConfirmedSequenceNumber(confirmedSequenceNumber);
+        }
+    };
+
+}
diff --git a/library/cpp/actors/interconnect/ut/lib/ya.make b/library/cpp/actors/interconnect/ut/lib/ya.make
new file mode 100644
index 0000000000..80f45f364f
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/lib/ya.make
@@ -0,0 +1,12 @@
+LIBRARY()
+
+OWNER(vkanaev)
+
+SRCS(
+    node.h             # single-node actor-system harness
+    test_events.h      # protobuf-backed test events
+    test_actors.h      # base sender/receiver test actors
+    ic_test_cluster.h  # multi-node interconnect cluster harness
+)
+
+END()
diff --git a/library/cpp/actors/interconnect/ut/poller_actor_ut.cpp b/library/cpp/actors/interconnect/ut/poller_actor_ut.cpp
new file mode 100644
index 0000000000..23d846a2fd
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/poller_actor_ut.cpp
@@ -0,0 +1,264 @@
+#include <library/cpp/actors/interconnect/poller_actor.h>
+#include <library/cpp/actors/testlib/test_runtime.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/network/pair.h>
+#include <util/network/socket.h>
+
+using namespace NActors;
+
+class TTestSocket: public TSharedDescriptor { // thin TSharedDescriptor over a raw fd; never closes it (no dtor)
+public:
+    explicit TTestSocket(SOCKET fd)
+        : Fd_(fd)
+    {
+    }
+
+    int GetDescriptor() override {
+        return Fd_;
+    }
+
+private:
+    SOCKET Fd_;
+};
+using TTestSocketPtr = TIntrusivePtr<TTestSocket>;
+
+// create pair of connected, non-blocking sockets
+std::pair<TTestSocketPtr, TTestSocketPtr> NonBlockSockets() {
+    SOCKET fds[2];
+    Y_VERIFY(SocketPair(fds) == 0, "SocketPair() failed with %s", strerror(errno)); // fix: result was ignored; a failure would surface later as confusing poll errors
+    SetNonBlock(fds[0]);
+    SetNonBlock(fds[1]);
+    return {MakeIntrusive<TTestSocket>(fds[0]), MakeIntrusive<TTestSocket>(fds[1])};
+}
+
+std::pair<TTestSocketPtr, TTestSocketPtr> TcpSockets() {
+    // create server (listening) socket
+    SOCKET server = socket(AF_INET, SOCK_STREAM, 0);
+    Y_VERIFY(server != -1, "socket() failed with %s", strerror(errno));
+
+    // bind it to local address with automatically picked port
+    sockaddr_in addr = {}; // fix: zero-initialize — sin_zero/padding was left uninitialized before bind()
+    addr.sin_family = AF_INET;
+    addr.sin_port = 0;
+    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+    if (bind(server, (sockaddr*)&addr, sizeof(addr)) == -1) {
+        Y_FAIL("bind() failed with %s", strerror(errno));
+    } else if (listen(server, 1) == -1) {
+        Y_FAIL("listen() failed with %s", strerror(errno));
+    }
+
+    // obtain local address for client
+    socklen_t len = sizeof(addr);
+    if (getsockname(server, (sockaddr*)&addr, &len) == -1) {
+        Y_FAIL("getsockname() failed with %s", strerror(errno));
+    }
+
+    // create client socket
+    SOCKET client = socket(AF_INET, SOCK_STREAM, 0);
+    Y_VERIFY(client != -1, "socket() failed with %s", strerror(errno));
+
+    // connect to server (blocking connect; accept below cannot hang because the handshake completes in the kernel)
+    if (connect(client, (sockaddr*)&addr, len) == -1) {
+        Y_FAIL("connect() failed with %s", strerror(errno));
+    }
+
+    // accept connection from the other side
+    SOCKET accepted = accept(server, nullptr, nullptr);
+    Y_VERIFY(accepted != -1, "accept() failed with %s", strerror(errno));
+
+    // close server socket
+    closesocket(server);
+
+    return std::make_pair(MakeIntrusive<TTestSocket>(client), MakeIntrusive<TTestSocket>(accepted));
+}
+
+class TPollerActorTest: public TTestBase { // covers poller actor: registration, read/write readiness, peer hangup
+    UNIT_TEST_SUITE(TPollerActorTest);
+    UNIT_TEST(Registration)
+    UNIT_TEST(ReadNotification)
+    UNIT_TEST(WriteNotification)
+    UNIT_TEST(HangupNotification)
+    UNIT_TEST_SUITE_END();
+
+public:
+    void SetUp() override { // fresh test runtime and poller actor for every test
+        ActorSystem_ = MakeHolder<TTestActorRuntimeBase>();
+        ActorSystem_->Initialize();
+
+        PollerId_ = ActorSystem_->Register(CreatePollerActor());
+
+        TDispatchOptions opts;
+        opts.FinalEvents.emplace_back(TEvents::TSystem::Bootstrap, 1); // wait until the poller actor has bootstrapped
+        ActorSystem_->DispatchEvents(opts);
+    }
+
+    void Registration() { // reader and writer both receive TEvPollerRegisterResult with the same token
+        auto [s1, s2] = NonBlockSockets();
+        auto readerId = ActorSystem_->AllocateEdgeActor();
+        auto writerId = ActorSystem_->AllocateEdgeActor();
+
+        RegisterSocket(s1, readerId, writerId);
+
+        // reader should receive event after socket registration
+        TPollerToken::TPtr token;
+        {
+            auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerRegisterResult>(readerId);
+            token = ev->Get()->PollerToken;
+        }
+
+        // writer should receive event after socket registration
+        {
+            auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerRegisterResult>(writerId);
+            UNIT_ASSERT_EQUAL(token, ev->Get()->PollerToken);
+        }
+    }
+
+    void ReadNotification() { // TEvPollerReady(Read) arrives only after data is written to the peer end
+        auto [r, w] = NonBlockSockets();
+        auto clientId = ActorSystem_->AllocateEdgeActor();
+        RegisterSocket(r, clientId, {});
+
+        // notification after registration
+        TPollerToken::TPtr token;
+        {
+            auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerRegisterResult>(clientId);
+            token = ev->Get()->PollerToken;
+        }
+
+        char buf;
+
+        // data not ready yet for read
+        UNIT_ASSERT(read(r->GetDescriptor(), &buf, sizeof(buf)) == -1);
+        UNIT_ASSERT(errno == EWOULDBLOCK);
+
+        // request read poll
+        token->Request(true, false);
+
+        // write data
+        UNIT_ASSERT(write(w->GetDescriptor(), "x", 1) == 1);
+
+        // notification after socket become readable
+        {
+            auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerReady>(clientId);
+            UNIT_ASSERT_EQUAL(ev->Get()->Socket, r);
+            UNIT_ASSERT(ev->Get()->Read);
+            UNIT_ASSERT(!ev->Get()->Write);
+        }
+
+        // read data
+        UNIT_ASSERT(read(r->GetDescriptor(), &buf, sizeof(buf)) == 1);
+        UNIT_ASSERT_EQUAL('x', buf);
+
+        // no more data to read
+        UNIT_ASSERT(read(r->GetDescriptor(), &buf, sizeof(buf)) == -1);
+        UNIT_ASSERT(errno == EWOULDBLOCK);
+    }
+
+    void WriteNotification() { // fill the send buffer until EWOULDBLOCK, drain it, expect a write-readiness event
+        auto [r, w] = TcpSockets();
+        auto clientId = ActorSystem_->AllocateEdgeActor();
+        SetNonBlock(w->GetDescriptor()); // only the write side needs to be non-blocking here
+        RegisterSocket(w, TActorId{}, clientId);
+
+        // notification after registration
+        TPollerToken::TPtr token;
+        {
+            auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerRegisterResult>(clientId);
+            token = ev->Get()->PollerToken;
+        }
+
+        char buffer[4096];
+        memset(buffer, 'x', sizeof(buffer));
+
+        for (int i = 0; i < 1000; ++i) {
+            // write as much as possible to send buffer
+            ssize_t written = 0;
+            for (;;) {
+                ssize_t res = send(w->GetDescriptor(), buffer, sizeof(buffer), 0);
+                if (res > 0) {
+                    written += res;
+                } else if (res == 0) {
+                    UNIT_FAIL("unexpected zero return from send()");
+                } else {
+                    UNIT_ASSERT(res == -1);
+                    if (errno == EINTR) {
+                        continue;
+                    } else if (errno == EWOULDBLOCK || errno == EAGAIN) {
+                        token->Request(false, true); // buffer full — ask poller for write-readiness
+                        break;
+                    } else {
+                        UNIT_FAIL("unexpected error from send()");
+                    }
+                }
+            }
+            Cerr << "written " << written << " bytes" << Endl;
+
+            // read all written data from the read end
+            for (;;) {
+                char buffer[4096]; // NOTE(review): shadows the outer buffer; harmless but worth renaming
+                ssize_t res = recv(r->GetDescriptor(), buffer, sizeof(buffer), 0);
+                if (res > 0) {
+                    UNIT_ASSERT(written >= res);
+                    written -= res;
+                    if (!written) {
+                        break;
+                    }
+                } else if (res == 0) {
+                    UNIT_FAIL("unexpected zero return from recv()");
+                } else {
+                    UNIT_ASSERT(res == -1);
+                    if (errno == EINTR) {
+                        continue;
+                    } else {
+                        UNIT_FAIL("unexpected error from recv()");
+                    }
+                }
+            }
+
+            // wait for notification after socket becomes writable again
+            {
+                auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerReady>(clientId);
+                UNIT_ASSERT_EQUAL(ev->Get()->Socket, w);
+                UNIT_ASSERT(!ev->Get()->Read);
+                UNIT_ASSERT(ev->Get()->Write);
+            }
+        }
+    }
+
+    void HangupNotification() { // a peer shutdown is reported as read-readiness (EOF becomes readable)
+        auto [r, w] = NonBlockSockets();
+        auto clientId = ActorSystem_->AllocateEdgeActor();
+        RegisterSocket(r, clientId, TActorId{});
+
+        // notification after registration
+        TPollerToken::TPtr token;
+        {
+            auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerRegisterResult>(clientId);
+            token = ev->Get()->PollerToken;
+        }
+
+        token->Request(true, false);
+        ShutDown(w->GetDescriptor(), SHUT_RDWR);
+
+        // notification after peer shuts down its socket
+        {
+            auto ev = ActorSystem_->GrabEdgeEvent<TEvPollerReady>(clientId);
+            UNIT_ASSERT_EQUAL(ev->Get()->Socket, r);
+            UNIT_ASSERT(ev->Get()->Read);
+        }
+    }
+
+private:
+    void RegisterSocket(TTestSocketPtr socket, TActorId readActorId, TActorId writeActorId) { // either actor id may be empty if that side is unused
+        auto ev = new TEvPollerRegister{socket, readActorId, writeActorId};
+        ActorSystem_->Send(new IEventHandle(PollerId_, TActorId{}, ev));
+    }
+
+private:
+    THolder<TTestActorRuntimeBase> ActorSystem_; // owns the test runtime for the whole suite instance
+    TActorId PollerId_; // id of the poller actor under test
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TPollerActorTest);
diff --git a/library/cpp/actors/interconnect/ut/protos/interconnect_test.proto b/library/cpp/actors/interconnect/ut/protos/interconnect_test.proto
new file mode 100644
index 0000000000..b9b2bd6a4e
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/protos/interconnect_test.proto
@@ -0,0 +1,25 @@
+package NInterconnectTest;
+
+message TEvTest { // generic test message: sender-side sequence id plus opaque payload
+    optional uint64 SequenceNumber = 1;
+    optional bytes Payload = 2;
+}
+
+message TEvTestChan { // channel test message; payload is numeric rather than bytes
+    optional uint64 SequenceNumber = 1;
+    optional uint64 Payload = 2;
+}
+
+message TEvTestLarge { // same shape as TEvTest; separate type for large-payload scenarios
+    optional uint64 SequenceNumber = 1;
+    optional bytes Payload = 2;
+}
+
+message TEvTestSmall { // same shape as TEvTest; separate type for small-payload scenarios
+    optional uint64 SequenceNumber = 1;
+    optional bytes Payload = 2;
+}
+
+message TEvTestResponse { // receiver -> sender confirmation of a processed sequence number
+    optional uint64 ConfirmedSequenceNumber = 1;
+}
diff --git a/library/cpp/actors/interconnect/ut/protos/ya.make b/library/cpp/actors/interconnect/ut/protos/ya.make
new file mode 100644
index 0000000000..48a8cc129f
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/protos/ya.make
@@ -0,0 +1,11 @@
+PROTO_LIBRARY()
+
+OWNER(vkanaev)
+
+SRCS(
+    interconnect_test.proto  # test event payload definitions
+)
+
+EXCLUDE_TAGS(GO_PROTO)  # no Go bindings needed for these C++-only tests
+
+END()
diff --git a/library/cpp/actors/interconnect/ut/ya.make b/library/cpp/actors/interconnect/ut/ya.make
new file mode 100644
index 0000000000..2f5b13352e
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut/ya.make
@@ -0,0 +1,36 @@
+UNITTEST()
+
+OWNER(
+    alexvru
+    g:kikimr
+)
+
+IF (SANITIZER_TYPE == "thread")
+    TIMEOUT(1200)  # tsan builds are slower: larger timeout, marked fat
+    SIZE(LARGE)
+    TAG(ya:fat)
+ELSE()
+    TIMEOUT(600)
+    SIZE(MEDIUM)
+ENDIF()
+
+SRCS(
+    channel_scheduler_ut.cpp
+    event_holder_pool_ut.cpp
+    interconnect_ut.cpp
+    large.cpp
+    poller_actor_ut.cpp
+    dynamic_proxy_ut.cpp
+)
+
+PEERDIR(
+    library/cpp/actors/core
+    library/cpp/actors/interconnect
+    library/cpp/actors/interconnect/ut/lib
+    library/cpp/actors/interconnect/ut/protos
+    library/cpp/actors/testlib
+    library/cpp/digest/md5
+    library/cpp/testing/unittest
+)
+
+END()
diff --git a/library/cpp/actors/interconnect/ut_fat/main.cpp b/library/cpp/actors/interconnect/ut_fat/main.cpp
new file mode 100644
index 0000000000..5d19bc3003
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut_fat/main.cpp
@@ -0,0 +1,133 @@
+
+#include <library/cpp/actors/interconnect/interconnect_tcp_proxy.h>
+#include <library/cpp/actors/interconnect/ut/protos/interconnect_test.pb.h>
+#include <library/cpp/actors/interconnect/ut/lib/ic_test_cluster.h>
+#include <library/cpp/actors/interconnect/ut/lib/interrupter.h>
+#include <library/cpp/actors/interconnect/ut/lib/test_events.h>
+#include <library/cpp/actors/interconnect/ut/lib/test_actors.h>
+#include <library/cpp/actors/interconnect/ut/lib/node.h>
+
+#include <library/cpp/testing/unittest/tests_data.h>
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/network/sock.h>
+#include <util/network/poller.h>
+#include <util/system/atomic.h>
+#include <util/generic/set.h>
+
+Y_UNIT_TEST_SUITE(InterconnectUnstableConnection) {
+ using namespace NActors;
+
+    class TSenderActor: public TSenderBaseActor { // sends TEvTest messages; tracks confirmations via cookies
+        TDeque<ui64> InFly; // sequence numbers sent but not yet confirmed
+        ui16 SendFlags;
+
+    public:
+        TSenderActor(const TActorId& recipientActorId, ui16 sendFlags)
+            : TSenderBaseActor(recipientActorId, 32)
+            , SendFlags(sendFlags)
+        {
+        }
+
+        ~TSenderActor() override {
+            Cerr << "Sent " << SequenceNumber << " messages\n";
+        }
+
+        void SendMessage(const TActorContext& ctx) override {
+            const ui32 flags = IEventHandle::MakeFlags(0, SendFlags);
+            const ui64 cookie = SequenceNumber; // cookie links TEvUndelivered back to the message
+            const TString payload(RandomNumber<size_t>(65536) + 4096, '@'); // fix: fill ctor is (count, ch); original ('@', count) built a ~64-char string of a random character instead of a 4..68 KiB '@' payload
+            ctx.Send(RecipientActorId, new TEvTest(SequenceNumber, payload), flags, cookie);
+            InFly.push_back(SequenceNumber);
+            ++InFlySize;
+            ++SequenceNumber;
+        }
+
+        void Handle(TEvents::TEvUndelivered::TPtr& ev, const TActorContext& ctx) override {
+            auto record = std::find(InFly.begin(), InFly.end(), ev->Cookie);
+            if (SendFlags & IEventHandle::FlagGenerateUnsureUndelivered) {
+                if (record != InFly.end()) { // may be a duplicate "unsure" notification; erase only once
+                    InFly.erase(record);
+                    --InFlySize;
+                    SendMessage(ctx);
+                }
+            } else {
+                Y_VERIFY(record != InFly.end());
+            }
+        }
+
+        void Handle(TEvTestResponse::TPtr& ev, const TActorContext& ctx) override {
+            Y_VERIFY(InFly); // non-empty check — presumably via container operator bool; verify
+            const NInterconnectTest::TEvTestResponse& record = ev->Get()->Record;
+            Y_VERIFY(record.HasConfirmedSequenceNumber());
+            if (!(SendFlags & IEventHandle::FlagGenerateUnsureUndelivered)) {
+                while (record.GetConfirmedSequenceNumber() != InFly.front()) { // drop entries lost without an undelivered notification
+                    InFly.pop_front();
+                    --InFlySize;
+                }
+            }
+            Y_VERIFY(record.GetConfirmedSequenceNumber() == InFly.front(), "got# %" PRIu64 " expected# %" PRIu64,
+                record.GetConfirmedSequenceNumber(), InFly.front());
+            InFly.pop_front();
+            --InFlySize;
+            SendMessagesIfPossible(ctx);
+        }
+    };
+
+    class TReceiverActor: public TReceiverBaseActor { // counts received TEvTest messages and confirms each one
+        // fix: uses TReceiverBaseActor::ReceivedCount; a local "ui64 ReceivedCount = 0;" here shadowed the base member
+        TNode* SenderNode = nullptr; // node hosting the sender; used to route confirmations back
+
+    public:
+        TReceiverActor(TNode* senderNode)
+            : TReceiverBaseActor()
+            , SenderNode(senderNode)
+        {
+        }
+
+        void Handle(TEvTest::TPtr& ev, const TActorContext& /*ctx*/) override {
+            const NInterconnectTest::TEvTest& m = ev->Get()->Record;
+            Y_VERIFY(m.HasSequenceNumber());
+            Y_VERIFY(m.GetSequenceNumber() >= ReceivedCount, "got #%" PRIu64 " expected at least #%" PRIu64,
+                m.GetSequenceNumber(), ReceivedCount);
+            ++ReceivedCount;
+            SenderNode->Send(ev->Sender, new TEvTestResponse(m.GetSequenceNumber()));
+        }
+
+        ~TReceiverActor() override {
+            Cerr << "Received " << ReceivedCount << " messages\n";
+        }
+    };
+
+    Y_UNIT_TEST(InterconnectTestWithProxyUnsureUndelivered) { // lossy link + FlagGenerateUnsureUndelivered: undelivered messages are resent
+        ui32 numNodes = 2;
+        double bandWidth = 1000000; // bandwidth shaped by the traffic interrupter
+        ui16 flags = IEventHandle::FlagTrackDelivery | IEventHandle::FlagGenerateUnsureUndelivered;
+        TTestICCluster::TTrafficInterrupterSettings interrupterSettings{TDuration::Seconds(2), bandWidth, true};
+
+        TTestICCluster testCluster(numNodes, TChannelsConfig(), &interrupterSettings);
+
+        TReceiverActor* receiverActor = new TReceiverActor(testCluster.GetNode(1)); // receiver lives on node 2, replies via node 1
+        const TActorId recipient = testCluster.RegisterActor(receiverActor, 2);
+        TSenderActor* senderActor = new TSenderActor(recipient, flags);
+        testCluster.RegisterActor(senderActor, 1);
+
+        NanoSleep(30ULL * 1000 * 1000 * 1000); // let traffic flow for 30s; actors Y_VERIFY invariants meanwhile
+    }
+
+    Y_UNIT_TEST(InterconnectTestWithProxy) { // lossy link with plain delivery tracking (no unsure-undelivered)
+        ui32 numNodes = 2;
+        double bandWidth = 1000000; // bandwidth shaped by the traffic interrupter
+        ui16 flags = IEventHandle::FlagTrackDelivery;
+        TTestICCluster::TTrafficInterrupterSettings interrupterSettings{TDuration::Seconds(2), bandWidth, true};
+
+        TTestICCluster testCluster(numNodes, TChannelsConfig(), &interrupterSettings);
+
+        TReceiverActor* receiverActor = new TReceiverActor(testCluster.GetNode(1)); // receiver lives on node 2, replies via node 1
+        const TActorId recipient = testCluster.RegisterActor(receiverActor, 2);
+        TSenderActor* senderActor = new TSenderActor(recipient, flags);
+        testCluster.RegisterActor(senderActor, 1);
+
+        NanoSleep(30ULL * 1000 * 1000 * 1000); // let traffic flow for 30s; actors Y_VERIFY invariants meanwhile
+    }
+}
diff --git a/library/cpp/actors/interconnect/ut_fat/ya.make b/library/cpp/actors/interconnect/ut_fat/ya.make
new file mode 100644
index 0000000000..6e58d08154
--- /dev/null
+++ b/library/cpp/actors/interconnect/ut_fat/ya.make
@@ -0,0 +1,25 @@
+UNITTEST()
+
+OWNER(
+    vkanaev
+    alexvru
+)
+
+SIZE(LARGE)  # each test sleeps 30s while traffic flows
+
+TAG(ya:fat)
+
+SRCS(
+    main.cpp
+)
+
+PEERDIR(
+    library/cpp/actors/core
+    library/cpp/actors/interconnect
+    library/cpp/actors/interconnect/mock
+    library/cpp/actors/interconnect/ut/lib
+    library/cpp/actors/interconnect/ut/protos
+    library/cpp/testing/unittest
+)
+
+END()
diff --git a/library/cpp/actors/interconnect/watchdog_timer.h b/library/cpp/actors/interconnect/watchdog_timer.h
new file mode 100644
index 0000000000..c190105a59
--- /dev/null
+++ b/library/cpp/actors/interconnect/watchdog_timer.h
@@ -0,0 +1,68 @@
+#pragma once
+
+namespace NActors {
+    template <typename TEvent>
+    class TWatchdogTimer { // fires Callback when no Reset() happened within Timeout; TEvent is the self-scheduled tick event
+        using TCallback = std::function<void()>;
+
+        const TDuration Timeout;
+        const TCallback Callback;
+
+        TInstant LastResetTimestamp; // time of the last activity notification
+        TEvent* ExpectedEvent = nullptr; // identity of the pending tick; stale ticks are ignored
+        ui32 Iteration = 0; // grace rounds granted after the deadline has passed
+
+        static constexpr ui32 NumIterationsBeforeFiring = 2;
+
+    public:
+        TWatchdogTimer(TDuration timeout, TCallback callback)
+            : Timeout(timeout)
+            , Callback(std::move(callback))
+        {
+        }
+
+        void Arm(const TActorIdentity& actor) { // zero/max timeout means the watchdog stays disabled
+            if (Timeout != TDuration::Zero() && Timeout != TDuration::Max()) {
+                Schedule(Timeout, actor);
+                Reset();
+            }
+        }
+
+        void Reset() { // call on activity to push the deadline forward
+            LastResetTimestamp = TActivationContext::Now();
+        }
+
+        void Disarm() { // any in-flight tick no longer matches ExpectedEvent and is dropped
+            ExpectedEvent = nullptr;
+        }
+
+        void operator()(typename TEvent::TPtr& ev) { // invoked by the owning actor for each tick event
+            if (ev->Get() == ExpectedEvent) { // ignore ticks from earlier Arm()/Schedule() calls
+                const TInstant now = TActivationContext::Now();
+                const TInstant barrier = LastResetTimestamp + Timeout;
+                if (now < barrier) {
+                    // the time hasn't come yet
+                    Schedule(barrier - now, TActorIdentity(ev->Recipient));
+                } else if (Iteration < NumIterationsBeforeFiring) {
+                    // time has come, but we will still give actor a chance to process some messages and rearm timer
+                    ++Iteration;
+                    TActivationContext::Send(ev.Release()); // send this event into queue once more
+                } else {
+                    // no chance to disarm, fire callback
+                    Callback();
+                    ExpectedEvent = nullptr;
+                    Iteration = 0;
+                }
+            }
+        }
+
+    private:
+        void Schedule(TDuration timeout, const TActorIdentity& actor) {
+            auto ev = MakeHolder<TEvent>();
+            ExpectedEvent = ev.Get(); // raw pointer kept only for identity comparison, never dereferenced
+            Iteration = 0;
+            actor.Schedule(timeout, ev.Release());
+        }
+    };
+
+}
diff --git a/library/cpp/actors/interconnect/ya.make b/library/cpp/actors/interconnect/ya.make
new file mode 100644
index 0000000000..60d29b0fc0
--- /dev/null
+++ b/library/cpp/actors/interconnect/ya.make
@@ -0,0 +1,94 @@
+LIBRARY()
+
+OWNER(
+    ddoarn
+    alexvru
+    g:kikimr
+)
+
+NO_WSHADOW()  # interconnect sources rely on intentional shadowing in macros
+
+IF (PROFILE_MEMORY_ALLOCATIONS)
+    CFLAGS(-DPROFILE_MEMORY_ALLOCATIONS)  # propagate the build flag into the code
+ENDIF()
+
+SRCS(
+    channel_scheduler.h
+    event_filter.h
+    event_holder_pool.h
+    events_local.h
+    interconnect_address.cpp
+    interconnect_address.h
+    interconnect_channel.cpp
+    interconnect_channel.h
+    interconnect_common.h
+    interconnect_counters.cpp
+    interconnect.h
+    interconnect_handshake.cpp
+    interconnect_handshake.h
+    interconnect_impl.h
+    interconnect_mon.cpp
+    interconnect_mon.h
+    interconnect_nameserver_dynamic.cpp
+    interconnect_nameserver_table.cpp
+    interconnect_proxy_wrapper.cpp
+    interconnect_proxy_wrapper.h
+    interconnect_resolve.cpp
+    interconnect_stream.cpp
+    interconnect_stream.h
+    interconnect_tcp_input_session.cpp
+    interconnect_tcp_proxy.cpp
+    interconnect_tcp_proxy.h
+    interconnect_tcp_server.cpp
+    interconnect_tcp_server.h
+    interconnect_tcp_session.cpp
+    interconnect_tcp_session.h
+    load.cpp
+    load.h
+    logging.h
+    packet.cpp
+    packet.h
+    poller_actor.cpp
+    poller_actor.h
+    poller.h
+    poller_tcp.cpp
+    poller_tcp.h
+    poller_tcp_unit.cpp
+    poller_tcp_unit.h
+    poller_tcp_unit_select.cpp
+    poller_tcp_unit_select.h
+    profiler.h
+    slowpoke_actor.h
+    types.cpp
+    types.h
+    watchdog_timer.h
+)
+
+IF (OS_LINUX)
+    SRCS(
+        poller_tcp_unit_epoll.cpp  # epoll backend is Linux-only; select backend above is portable
+        poller_tcp_unit_epoll.h
+    )
+ENDIF()
+
+PEERDIR(
+    contrib/libs/libc_compat
+    contrib/libs/openssl
+    library/cpp/actors/core
+    library/cpp/actors/dnscachelib
+    library/cpp/actors/dnsresolver
+    library/cpp/actors/helpers
+    library/cpp/actors/prof
+    library/cpp/actors/protos
+    library/cpp/actors/util
+    library/cpp/digest/crc32c
+    library/cpp/json
+    library/cpp/lwtrace
+    library/cpp/monlib/dynamic_counters
+    library/cpp/monlib/metrics
+    library/cpp/monlib/service/pages/tablesorter
+    library/cpp/openssl/init
+    library/cpp/packedtypes
+)
+
+END()