1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
|
#pragma once
#include <time.h>
#include <cstdlib>
#include <climits>
#include <random>
#include <functional>
#include <base/types.h>
#include <base/scope_guard.h>
#include <base/sort.h>
#include <Common/PoolBase.h>
#include <Common/ProfileEvents.h>
#include <Common/NetException.h>
#include <Common/Exception.h>
#include <Common/randomSeed.h>
#include <Common/Priority.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ALL_CONNECTION_TRIES_FAILED;
extern const int ALL_REPLICAS_ARE_STALE;
extern const int LOGICAL_ERROR;
}
}
namespace ProfileEvents
{
extern const Event DistributedConnectionFailTry;
extern const Event DistributedConnectionFailAtAll;
}
/// This class provides a pool with fault tolerance. It is used for pooling of connections to replicated DB.
/// Initialized by several PoolBase objects.
/// When a connection is requested, tries to create or choose an alive connection from one of the nested pools.
/// Pools are tried in the order consistent with lexicographical order of (error count, slowdown count, config priority, priority, random number) tuples.
/// Number of tries for a single pool is limited by max_tries parameter.
/// The client can set nested pool priority by passing a GetPriority functor.
///
/// NOTE: if one of the nested pools blocks because it is empty, this pool will also block.
///
/// The client must provide a TryGetEntryFunc functor, which should perform a single try to get a connection from a nested pool.
/// This functor can also check if the connection satisfies some eligibility criterion (e.g. check if
/// the replica is up-to-date).
template <typename TNestedPool>
class PoolWithFailoverBase : private boost::noncopyable
{
public:
using NestedPool = TNestedPool;
using NestedPoolPtr = std::shared_ptr<NestedPool>;
using Entry = typename NestedPool::Entry;
using NestedPools = std::vector<NestedPoolPtr>;
PoolWithFailoverBase(
NestedPools nested_pools_,
time_t decrease_error_period_,
size_t max_error_cap_,
Poco::Logger * log_)
: nested_pools(std::move(nested_pools_))
, decrease_error_period(decrease_error_period_)
, max_error_cap(max_error_cap_)
, shared_pool_states(nested_pools.size())
, log(log_)
{
for (size_t i = 0;i < nested_pools.size(); ++i)
shared_pool_states[i].config_priority = nested_pools[i]->getPriority();
}
struct TryResult
{
TryResult() = default;
explicit TryResult(Entry entry_)
: entry(std::move(entry_))
, is_usable(true)
, is_up_to_date(true)
{
}
void reset()
{
entry = Entry();
is_usable = false;
is_up_to_date = false;
staleness = 0.0;
}
Entry entry;
bool is_usable = false; /// If false, the entry is unusable for current request
/// (but may be usable for other requests, so error counts are not incremented)
bool is_up_to_date = false; /// If true, the entry is a connection to up-to-date replica.
double staleness = 0.0; /// Helps choosing the "least stale" option when all replicas are stale.
};
struct PoolState;
using PoolStates = std::vector<PoolState>;
struct ShuffledPool
{
NestedPool * pool{};
const PoolState * state{}; // WARNING: valid only during initial ordering, dangling
size_t index = 0;
size_t error_count = 0;
size_t slowdown_count = 0;
};
/// This functor must be provided by a client. It must perform a single try that takes a connection
/// from the provided pool and checks that it is good.
using TryGetEntryFunc = std::function<TryResult(NestedPool & pool, std::string & fail_message)>;
/// The client can provide this functor to affect load balancing - the index of a pool is passed to
/// this functor. The pools with lower result value will be tried first.
using GetPriorityFunc = std::function<Priority(size_t index)>;
/// Returns at least min_entries and at most max_entries connections (at most one connection per nested pool).
/// The method will throw if it is unable to get min_entries alive connections or
/// if fallback_to_stale_replicas is false and it is unable to get min_entries connections to up-to-date replicas.
std::vector<TryResult> getMany(
size_t min_entries, size_t max_entries, size_t max_tries,
size_t max_ignored_errors,
bool fallback_to_stale_replicas,
const TryGetEntryFunc & try_get_entry,
const GetPriorityFunc & get_priority = GetPriorityFunc());
protected:
/// Returns a single connection.
Entry get(size_t max_ignored_errors, bool fallback_to_stale_replicas,
const TryGetEntryFunc & try_get_entry, const GetPriorityFunc & get_priority = GetPriorityFunc());
/// This function returns a copy of pool states to avoid race conditions when modifying shared pool states.
PoolStates updatePoolStates(size_t max_ignored_errors);
void updateErrorCounts(PoolStates & states, time_t & last_decrease_time) const;
std::vector<ShuffledPool> getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority);
inline void updateSharedErrorCounts(std::vector<ShuffledPool> & shuffled_pools);
auto getPoolExtendedStates() const
{
std::lock_guard lock(pool_states_mutex);
return std::make_tuple(shared_pool_states, nested_pools, last_error_decrease_time);
}
NestedPools nested_pools;
const time_t decrease_error_period;
const size_t max_error_cap;
mutable std::mutex pool_states_mutex;
PoolStates shared_pool_states;
/// The time when error counts were last decreased.
time_t last_error_decrease_time = 0;
Poco::Logger * log;
};
template <typename TNestedPool>
std::vector<typename PoolWithFailoverBase<TNestedPool>::ShuffledPool>
PoolWithFailoverBase<TNestedPool>::getShuffledPools(
size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority)
{
/// Update random numbers and error counts.
PoolStates pool_states = updatePoolStates(max_ignored_errors);
if (get_priority)
{
for (size_t i = 0; i < pool_states.size(); ++i)
pool_states[i].priority = get_priority(i);
}
/// Sort the pools into order in which they will be tried (based on respective PoolStates).
/// Note that `error_count` and `slowdown_count` are used for ordering, but set to zero in the resulting ShuffledPool
std::vector<ShuffledPool> shuffled_pools;
shuffled_pools.reserve(nested_pools.size());
for (size_t i = 0; i < nested_pools.size(); ++i)
shuffled_pools.push_back(ShuffledPool{nested_pools[i].get(), &pool_states[i], i, /* error_count = */ 0, /* slowdown_count = */ 0});
::sort(
shuffled_pools.begin(), shuffled_pools.end(),
[](const ShuffledPool & lhs, const ShuffledPool & rhs)
{
return PoolState::compare(*lhs.state, *rhs.state);
});
return shuffled_pools;
}
template <typename TNestedPool>
inline void PoolWithFailoverBase<TNestedPool>::updateSharedErrorCounts(std::vector<ShuffledPool> & shuffled_pools)
{
std::lock_guard lock(pool_states_mutex);
for (const ShuffledPool & pool: shuffled_pools)
{
auto & pool_state = shared_pool_states[pool.index];
pool_state.error_count = std::min<UInt64>(max_error_cap, pool_state.error_count + pool.error_count);
pool_state.slowdown_count += pool.slowdown_count;
}
}
template <typename TNestedPool>
typename TNestedPool::Entry
PoolWithFailoverBase<TNestedPool>::get(size_t max_ignored_errors, bool fallback_to_stale_replicas,
const TryGetEntryFunc & try_get_entry, const GetPriorityFunc & get_priority)
{
std::vector<TryResult> results = getMany(
1 /* min entries */, 1 /* max entries */, 1 /* max tries */,
max_ignored_errors, fallback_to_stale_replicas,
try_get_entry, get_priority);
if (results.empty() || results[0].entry.isNull())
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR,
"PoolWithFailoverBase::getMany() returned less than min_entries entries.");
return results[0].entry;
}
template <typename TNestedPool>
std::vector<typename PoolWithFailoverBase<TNestedPool>::TryResult>
PoolWithFailoverBase<TNestedPool>::getMany(
size_t min_entries, size_t max_entries, size_t max_tries,
size_t max_ignored_errors,
bool fallback_to_stale_replicas,
const TryGetEntryFunc & try_get_entry,
const GetPriorityFunc & get_priority)
{
std::vector<ShuffledPool> shuffled_pools = getShuffledPools(max_ignored_errors, get_priority);
/// Limit `max_tries` value by `max_error_cap` to avoid unlimited number of retries
if (max_tries > max_error_cap)
max_tries = max_error_cap;
/// We will try to get a connection from each pool until a connection is produced or max_tries is reached.
std::vector<TryResult> try_results(shuffled_pools.size());
size_t entries_count = 0;
size_t usable_count = 0;
size_t up_to_date_count = 0;
size_t failed_pools_count = 0;
/// At exit update shared error counts with error counts occurred during this call.
SCOPE_EXIT(
{
updateSharedErrorCounts(shuffled_pools);
});
std::string fail_messages;
bool finished = false;
while (!finished)
{
for (size_t i = 0; i < shuffled_pools.size(); ++i)
{
if (up_to_date_count >= max_entries /// Already enough good entries.
|| entries_count + failed_pools_count >= nested_pools.size()) /// No more good entries will be produced.
{
finished = true;
break;
}
ShuffledPool & shuffled_pool = shuffled_pools[i];
TryResult & result = try_results[i];
if (max_tries && (shuffled_pool.error_count >= max_tries || !result.entry.isNull()))
continue;
std::string fail_message;
result = try_get_entry(*shuffled_pool.pool, fail_message);
if (!fail_message.empty())
fail_messages += fail_message + '\n';
if (!result.entry.isNull())
{
++entries_count;
if (result.is_usable)
{
++usable_count;
if (result.is_up_to_date)
++up_to_date_count;
}
}
else
{
LOG_WARNING(log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), fail_message);
ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry);
shuffled_pool.error_count = std::min(max_error_cap, shuffled_pool.error_count + 1);
if (shuffled_pool.error_count >= max_tries)
{
++failed_pools_count;
ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll);
}
}
}
}
if (usable_count < min_entries)
throw DB::NetException(DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED,
"All connection tries failed. Log: \n\n{}\n", fail_messages);
std::erase_if(try_results, [](const TryResult & r) { return r.entry.isNull() || !r.is_usable; });
/// Sort so that preferred items are near the beginning.
std::stable_sort(
try_results.begin(), try_results.end(),
[](const TryResult & left, const TryResult & right)
{
return std::forward_as_tuple(!left.is_up_to_date, left.staleness)
< std::forward_as_tuple(!right.is_up_to_date, right.staleness);
});
if (fallback_to_stale_replicas)
{
/// There is not enough up-to-date entries but we are allowed to return stale entries.
/// Gather all up-to-date ones and least-bad stale ones.
size_t size = std::min(try_results.size(), max_entries);
try_results.resize(size);
}
else if (up_to_date_count >= min_entries)
{
/// There is enough up-to-date entries.
try_results.resize(up_to_date_count);
}
else
throw DB::Exception(DB::ErrorCodes::ALL_REPLICAS_ARE_STALE,
"Could not find enough connections to up-to-date replicas. Got: {}, needed: {}", up_to_date_count, max_entries);
return try_results;
}
template <typename TNestedPool>
struct PoolWithFailoverBase<TNestedPool>::PoolState
{
UInt64 error_count = 0;
/// The number of slowdowns that led to changing replica in HedgedRequestsFactory
UInt64 slowdown_count = 0;
/// Priority from the <remote_server> configuration.
Priority config_priority{1};
/// Priority from the GetPriorityFunc.
Priority priority{0};
UInt64 random = 0;
void randomize()
{
random = rng();
}
static bool compare(const PoolState & lhs, const PoolState & rhs)
{
return std::forward_as_tuple(lhs.error_count, lhs.slowdown_count, lhs.config_priority, lhs.priority, lhs.random)
< std::forward_as_tuple(rhs.error_count, rhs.slowdown_count, rhs.config_priority, rhs.priority, rhs.random);
}
private:
std::minstd_rand rng = std::minstd_rand(static_cast<uint_fast32_t>(randomSeed()));
};
template <typename TNestedPool>
typename PoolWithFailoverBase<TNestedPool>::PoolStates
PoolWithFailoverBase<TNestedPool>::updatePoolStates(size_t max_ignored_errors)
{
PoolStates result;
result.reserve(nested_pools.size());
{
std::lock_guard lock(pool_states_mutex);
for (auto & state : shared_pool_states)
state.randomize();
updateErrorCounts(shared_pool_states, last_error_decrease_time);
result.assign(shared_pool_states.begin(), shared_pool_states.end());
}
/// distributed_replica_max_ignored_errors
for (auto & state : result)
state.error_count = state.error_count > max_ignored_errors ? state.error_count - max_ignored_errors : 0;
return result;
}
template <typename TNestedPool>
void PoolWithFailoverBase<TNestedPool>::updateErrorCounts(PoolWithFailoverBase<TNestedPool>::PoolStates & states, time_t & last_decrease_time) const
{
time_t current_time = time(nullptr);
if (last_decrease_time)
{
time_t delta = current_time - last_decrease_time;
if (delta >= 0)
{
const UInt64 max_bits = sizeof(UInt64) * CHAR_BIT;
size_t shift_amount = max_bits;
/// Divide error counts by 2 every decrease_error_period seconds.
if (decrease_error_period)
shift_amount = delta / decrease_error_period;
/// Update time but don't do it more often than once a period.
/// Else if the function is called often enough, error count will never decrease.
if (shift_amount)
last_decrease_time = current_time;
if (shift_amount >= max_bits)
{
for (auto & state : states)
{
state.error_count = 0;
state.slowdown_count = 0;
}
}
else if (shift_amount)
{
for (auto & state : states)
{
state.error_count >>= shift_amount;
state.slowdown_count >>= shift_amount;
}
}
}
}
else
last_decrease_time = current_time;
}
|