1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
|
#pragma once
#if defined(OS_LINUX)
#include <functional>
#include <queue>
#include <optional>
#include <Client/HedgedConnectionsFactory.h>
#include <Client/IConnections.h>
#include <Client/PacketReceiver.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/** To receive data from multiple replicas (connections) from one shard asynchronously.
* The principe of Hedged Connections is used to reduce tail latency:
* if we don't receive data from replica and there is no progress in query execution
* for a long time, we try to get new replica and send query to it,
* without cancelling working with previous replica. This class
* supports all functionality that MultipleConnections has.
*/
class HedgedConnections : public IConnections
{
public:
using PacketReceiverPtr = std::unique_ptr<PacketReceiver>;
struct ReplicaState
{
explicit ReplicaState(Connection * connection_) : connection(connection_), packet_receiver(std::make_unique<PacketReceiver>(connection_))
{
}
Connection * connection = nullptr;
PacketReceiverPtr packet_receiver;
TimerDescriptor change_replica_timeout;
bool is_change_replica_timeout_expired = false;
};
struct OffsetState
{
/// Replicas with the same offset.
std::vector<ReplicaState> replicas;
/// An amount of active replicas. When can_change_replica is false,
/// active_connection_count is always <= 1 (because we stopped working with
/// other replicas with the same offset)
size_t active_connection_count = 0;
bool can_change_replica = true;
/// This flag is true when this offset is in queue for
/// new replicas. It's needed to process receive timeout
/// (throw an exception when receive timeout expired and there is no
/// new replica in process)
bool next_replica_in_process = false;
};
/// We process events in epoll, so we need to determine replica by it's
/// file descriptor. We store map fd -> replica location. To determine
/// where replica is, we need a replica offset
/// (the same as parallel_replica_offset), and index, which is needed because
/// we can have many replicas with same offset (when receive_data_timeout has expired).
struct ReplicaLocation
{
size_t offset;
size_t index;
};
HedgedConnections(const ConnectionPoolWithFailoverPtr & pool_,
ContextPtr context_,
const ConnectionTimeouts & timeouts_,
const ThrottlerPtr & throttler,
PoolMode pool_mode,
std::shared_ptr<QualifiedTableName> table_to_check_ = nullptr,
AsyncCallback async_callback = {});
void sendScalarsData(Scalars & data) override;
void sendExternalTablesData(std::vector<ExternalTablesData> & data) override;
void sendQuery(
const ConnectionTimeouts & timeouts,
const String & query,
const String & query_id,
UInt64 stage,
ClientInfo & client_info,
bool with_pending_data) override;
void sendReadTaskResponse(const String &) override
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "sendReadTaskResponse in not supported with HedgedConnections");
}
void sendMergeTreeReadTaskResponse(const ParallelReadResponse &) override
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "sendMergeTreeReadTaskResponse in not supported with HedgedConnections");
}
Packet receivePacket() override;
Packet receivePacketUnlocked(AsyncCallback async_callback) override;
void disconnect() override;
void sendCancel() override;
void sendIgnoredPartUUIDs(const std::vector<UUID> & uuids) override;
Packet drain() override;
std::string dumpAddresses() const override;
size_t size() const override { return offset_states.size(); }
bool hasActiveConnections() const override { return active_connection_count > 0; }
void setReplicaInfo(ReplicaInfo value) override { replica_info = value; }
void setAsyncCallback(AsyncCallback async_callback) override;
private:
/// If we don't receive data from replica and there is no progress in query
/// execution for receive_data_timeout, we are trying to get new
/// replica and send query to it. Beside sending query, there are some
/// additional actions like sendScalarsData or sendExternalTablesData and we need
/// to perform these actions in the same order on the new replica. So, we will
/// save actions with replicas in pipeline to perform them on the new replicas.
class Pipeline
{
public:
void add(std::function<void(ReplicaState &)> send_function);
void run(ReplicaState & replica);
private:
std::vector<std::function<void(ReplicaState &)>> pipeline;
};
Packet receivePacketFromReplica(const ReplicaLocation & replica_location);
ReplicaLocation getReadyReplicaLocation(AsyncCallback async_callback = {});
bool resumePacketReceiver(const ReplicaLocation & replica_location);
void disableChangingReplica(const ReplicaLocation & replica_location);
void startNewReplica();
void checkNewReplica();
void processNewReplicaState(HedgedConnectionsFactory::State state, Connection * connection);
void finishProcessReplica(ReplicaState & replica, bool disconnect);
int getReadyFileDescriptor(AsyncCallback async_callback = {});
HedgedConnectionsFactory hedged_connections_factory;
/// All replicas in offset_states[offset] is responsible for process query
/// with setting parallel_replica_offset = offset. In common situations
/// replica_states[offset].replicas.size() = 1 (like in MultiplexedConnections).
std::vector<OffsetState> offset_states;
/// Map socket file descriptor to replica location (it's offset and index in OffsetState.replicas).
std::unordered_map<int, ReplicaLocation> fd_to_replica_location;
/// Map receive data timeout file descriptor to replica location.
std::unordered_map<int, ReplicaLocation> timeout_fd_to_replica_location;
/// A queue of offsets for new replicas. When we get RECEIVE_DATA_TIMEOUT from
/// the replica, we push it's offset to this queue and start trying to get
/// new replica.
std::queue<int> offsets_queue;
/// The current number of valid connections to the replicas of this shard.
size_t active_connection_count;
/// We count offsets in which we can't change replica anymore,
/// it's needed to cancel choosing new replicas when we
/// disabled replica changing in all offsets.
size_t offsets_with_disabled_changing_replica;
Pipeline pipeline_for_new_replicas;
/// New replica may not support two-level aggregation due to version incompatibility.
/// If we didn't disabled it, we need to skip this replica.
bool disable_two_level_aggregation = false;
/// We will save replica with last received packet
/// (except cases when packet type is EndOfStream or Exception)
/// to resume it's packet receiver when new packet is needed.
std::optional<ReplicaLocation> replica_with_last_received_packet;
Packet last_received_packet;
Epoll epoll;
ContextPtr context;
const Settings & settings;
ThrottlerPtr throttler;
bool sent_query = false;
bool cancelled = false;
ReplicaInfo replica_info;
mutable std::mutex cancel_mutex;
};
}
#endif
|