path: contrib/clickhouse/src/Backups/WithRetries.h

#pragma once

#include <Storages/MergeTree/ZooKeeperRetries.h>
#include <Common/ZooKeeper/Common.h>
#include <Common/ZooKeeper/ZooKeeperWithFaultInjection.h>

#include <functional>

namespace DB
{

/// In backups every request to [Zoo]Keeper should be retryable,
/// and this tiny class encapsulates all the machinery to make that possible:
/// a [Zoo]Keeper client which injects faults with configurable probability,
/// and a retries controller which performs retries with growing backoff.
class WithRetries
{
public:
    using FaultyKeeper = Coordination::ZooKeeperWithFaultInjection::Ptr;
    using RenewerCallback = std::function<void(FaultyKeeper &)>;

    struct KeeperSettings
    {
        UInt64 keeper_max_retries{0};
        UInt64 keeper_retry_initial_backoff_ms{0};
        UInt64 keeper_retry_max_backoff_ms{0};
        UInt64 batch_size_for_keeper_multiread{10000};
        Float64 keeper_fault_injection_probability{0};
        UInt64 keeper_fault_injection_seed{42};
        UInt64 keeper_value_max_size{1048576};
    };
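
    /// A minimal sketch of how these settings might be filled from the server
    /// configuration. The setting names and defaults below are assumptions made
    /// for illustration; they are not defined by this header:
    ///
    ///     WithRetries::KeeperSettings keeper_settings
    ///     {
    ///         .keeper_max_retries = config.getUInt64("backup_restore_keeper_max_retries", 20),
    ///         .keeper_retry_initial_backoff_ms = config.getUInt64("backup_restore_keeper_retry_initial_backoff_ms", 100),
    ///         .keeper_retry_max_backoff_ms = config.getUInt64("backup_restore_keeper_retry_max_backoff_ms", 5000),
    ///     };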

    /// For simplicity, a separate ZooKeeperRetriesInfo and a faulty [Zoo]Keeper client
    /// are stored in one place.
    /// This helps to avoid writing too much boilerplate each time we need to
    /// execute some operation (a set of requests) over [Zoo]Keeper with retries.
    /// Why is ZooKeeperRetriesInfo separate for each operation?
    /// Because a backup usually takes a long time to finish, and it makes no sense
    /// to limit the overall number of retries (for example, to 1000) for the whole backup
    /// and have a continuously growing backoff.
    class RetriesControlHolder
    {
    public:
        ZooKeeperRetriesInfo info;
        ZooKeeperRetriesControl retries_ctl;
        FaultyKeeper faulty_zookeeper;

    private:
        friend class WithRetries;
        RetriesControlHolder(const WithRetries * parent, const String & name);
    };
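
    /// A minimal usage sketch, assuming `with_retries` is an instance of this
    /// class and `zookeeper_path` is some node path; the operation name
    /// "createRootNodes" is purely illustrative:
    ///
    ///     auto holder = with_retries.createRetriesControlHolder("createRootNodes");
    ///     holder.retries_ctl.retryLoop([&, &zk = holder.faulty_zookeeper]()
    ///     {
    ///         /// Re-establish the session first: it may have expired on a previous attempt.
    ///         with_retries.renewZooKeeper(zk);
    ///         zk->createAncestors(zookeeper_path);
    ///     });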

    RetriesControlHolder createRetriesControlHolder(const String & name);
    WithRetries(Poco::Logger * log, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings, RenewerCallback callback);

    /// Used to re-establish a new connection inside a retry loop.
    void renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const;
private:
    /// This provides a special fault-injecting wrapper, which is useful for testing.
    FaultyKeeper getFaultyZooKeeper() const;

    Poco::Logger * log;
    zkutil::GetZooKeeper get_zookeeper;
    KeeperSettings settings;
    /// This callback is called each time a new [Zoo]Keeper session is created.
    /// In backups it is primarily used to re-create an ephemeral node to signal the coordinator
    /// that the host is alive and able to continue writing the backup.
    /// The coordinator (or initiator) of the backup also retries when it doesn't find the ephemeral node
    /// for a particular host.
    /// Again, this scheme is not ideal. False positives are still possible, but in the worst case
    /// they lead only to a failed backup which could possibly have succeeded
    /// with a few more retries.
    RenewerCallback callback;
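
    /// A minimal sketch of such a callback; `alive_node_path` is hypothetical:
    ///
    ///     WithRetries::RenewerCallback callback = [alive_node_path](WithRetries::FaultyKeeper & zk)
    ///     {
    ///         /// Re-create the ephemeral "alive" node after the session was lost.
    ///         zk->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral);
    ///     };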
    ZooKeeperRetriesInfo global_zookeeper_retries_info;

    /// This mutex is needed only to protect the zookeeper object below.
    mutable std::mutex zookeeper_mutex;
    mutable zkutil::ZooKeeperPtr zookeeper;
};

}