aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/IO/ISchedulerNode.h
blob: 1c33c0337446be4d0c660607fe8c29c859236dd7 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#pragma once

#include <Common/ErrorCodes.h>
#include <Common/Exception.h>
#include <Common/Priority.h>

#include <IO/ResourceRequest.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <Poco/Util/XMLConfiguration.h>

#include <boost/noncopyable.hpp>

#include <deque>
#include <functional>
#include <memory>
#include <mutex>


namespace DB
{

namespace ErrorCodes
{
    extern const int INVALID_SCHEDULER_NODE;
}

class ISchedulerNode;

inline const Poco::Util::AbstractConfiguration & emptyConfig()
{
    static Poco::AutoPtr<Poco::Util::XMLConfiguration> config = new Poco::Util::XMLConfiguration();
    return *config;
}

/*
 * Info read and write for scheduling purposes by parent
 */
struct SchedulerNodeInfo
{
    double weight = 1.0; /// Weight of this node among it's siblings
    Priority priority; /// Priority of this node among it's siblings (lower value means higher priority)

    /// Arbitrary data accessed/stored by parent
    union {
        size_t idx;
        void * ptr;
    } parent;

    SchedulerNodeInfo() = default;

    explicit SchedulerNodeInfo(const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {})
    {
        setWeight(config.getDouble(config_prefix + ".weight", weight));
        setPriority(config.getInt64(config_prefix + ".priority", priority));
    }

    void setWeight(double value)
    {
        if (value <= 0 || !isfinite(value))
            throw Exception(
                ErrorCodes::INVALID_SCHEDULER_NODE,
                "Negative and non-finite node weights are not allowed: {}",
                value);
        weight = value;
    }

    void setPriority(Int64 value)
    {
        priority.value = value;
    }
};

/*
 * Simple waitable thread-safe FIFO task queue.
 * Intended to hold postponed events for later handling (usually by scheduler thread).
 */
class EventQueue
{
public:
    using Event = std::function<void()>;

    void enqueue(Event&& event)
    {
        std::unique_lock lock{mutex};
        bool was_empty = queue.empty();
        queue.emplace_back(event);
        if (was_empty)
            pending.notify_one();
    }

    /// Process single event if it exists
    /// Returns `true` iff event has been processed
    bool tryProcess()
    {
        std::unique_lock lock{mutex};
        if (queue.empty())
            return false;
        Event event = std::move(queue.front());
        queue.pop_front();
        lock.unlock(); // do not hold queue mutext while processing events
        event();
        return true;
    }

    /// Wait for single event (if not available) and process it
    void process()
    {
        std::unique_lock lock{mutex};
        pending.wait(lock, [&] { return !queue.empty(); });
        Event event = std::move(queue.front());
        queue.pop_front();
        lock.unlock(); // do not hold queue mutext while processing events
        event();
    }

private:
    std::mutex mutex;
    std::condition_variable pending;
    std::deque<Event> queue;
};

/*
 * Node of hierarchy for scheduling requests for resource. Base class for all
 * kinds of scheduling elements (queues, policies, constraints and schedulers).
 *
 * Root node is a scheduler, which has it's thread to dequeue requests,
 * execute requests (see ResourceRequest) and process events in a thread-safe manner.
 * Immediate children of the scheduler represent independent resources.
 * Each resource has it's own hierarchy to achieve required scheduling policies.
 * Non-leaf nodes do not hold requests, but keep scheduling state
 * (e.g. consumption history, amount of in-flight requests, etc).
 * Leafs of hierarchy are queues capable of holding pending requests.
 *
 *        scheduler         (SchedulerRoot)
 *         /     \
 *  constraint  constraint  (SemaphoreConstraint)
 *      |           |
 *   policy      policy     (PriorityPolicy)
 *   /    \      /    \
 *  q1    q2    q3    q4    (FifoQueue)
 *
 * Dequeueing request from an inner node will dequeue request from one of active leaf-queues in its subtree.
 * Node is considered to be active iff:
 *  - it has at least one pending request in one of leaves of it's subtree;
 *  - and enforced constraints, if any, are satisfied
 *    (e.g. amount of concurrent requests is not greater than some number).
 *
 * All methods must be called only from scheduler thread for thread-safety.
 */
class ISchedulerNode : private boost::noncopyable
{
public:
    ISchedulerNode(EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {})
        : event_queue(event_queue_)
        , info(config, config_prefix)
    {}

    virtual ~ISchedulerNode() {}

    // Checks if two nodes configuration is equal
    virtual bool equals(ISchedulerNode * other) = 0;

    /// Attach new child
    virtual void attachChild(const std::shared_ptr<ISchedulerNode> & child) = 0;

    /// Detach and destroy child
    virtual void removeChild(ISchedulerNode * child) = 0;

    /// Get attached child by name
    virtual ISchedulerNode * getChild(const String & child_name) = 0;

    /// Activation of child due to the first pending request
    /// Should be called on leaf node (i.e. queue) to propagate activation signal through chain to the root
    virtual void activateChild(ISchedulerNode * child) = 0;

    /// Returns true iff node is active
    virtual bool isActive() = 0;

    /// Returns the first request to be executed as the first component of resuting pair.
    /// The second pair component is `true` iff node is still active after dequeueing.
    virtual std::pair<ResourceRequest *, bool> dequeueRequest() = 0;

    /// Returns full path string using names of every parent
    String getPath()
    {
        String result;
        ISchedulerNode * ptr = this;
        while (ptr->parent)
        {
            result = "/" + ptr->basename + result;
            ptr = ptr->parent;
        }
        return result.empty() ? "/" : result;
    }

    /// Attach to a parent (used by attachChild)
    virtual void setParent(ISchedulerNode * parent_)
    {
        parent = parent_;
    }

protected:
    /// Notify parents about the first pending request or constraint becoming satisfied.
    /// Postponed to be handled in scheduler thread, so it is intended to be called from outside.
    void scheduleActivation()
    {
        if (likely(parent))
        {
            event_queue->enqueue([this] { parent->activateChild(this); });
        }
    }

public:
    EventQueue * const event_queue;
    String basename;
    SchedulerNodeInfo info;
    ISchedulerNode * parent = nullptr;
};

using SchedulerNodePtr = std::shared_ptr<ISchedulerNode>;

}