1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
|
#include "ThreadProfileEvents.h"
#if defined(OS_LINUX)
#include "NetlinkMetricsProvider.h"
#include "ProcfsMetricsProvider.h"
#include "hasLinuxCapability.h"
#include <filesystem>
#include <fstream>
#include <optional>
#include <unordered_set>
#include <fcntl.h>
#include <unistd.h>
#include <linux/perf_event.h>
#include <syscall.h>
#include <sys/ioctl.h>
#include <cerrno>
#include <sys/types.h>
#include <dirent.h>
#include <boost/algorithm/string/split.hpp>
#include <base/errnoToString.h>
#include <Common/logger_useful.h>
namespace ProfileEvents
{
extern const Event OSIOWaitMicroseconds;
extern const Event OSCPUWaitMicroseconds;
extern const Event OSCPUVirtualTimeMicroseconds;
extern const Event OSReadChars;
extern const Event OSWriteChars;
extern const Event OSReadBytes;
extern const Event OSWriteBytes;
extern const Event PerfCpuCycles;
extern const Event PerfInstructions;
extern const Event PerfCacheReferences;
extern const Event PerfCacheMisses;
extern const Event PerfBranchInstructions;
extern const Event PerfBranchMisses;
extern const Event PerfBusCycles;
extern const Event PerfStalledCyclesFrontend;
extern const Event PerfStalledCyclesBackend;
extern const Event PerfRefCpuCycles;
extern const Event PerfCpuClock;
extern const Event PerfTaskClock;
extern const Event PerfContextSwitches;
extern const Event PerfCpuMigrations;
extern const Event PerfAlignmentFaults;
extern const Event PerfEmulationFaults;
extern const Event PerfMinEnabledTime;
extern const Event PerfMinEnabledRunningTime;
extern const Event PerfDataTLBReferences;
extern const Event PerfDataTLBMisses;
extern const Event PerfInstructionTLBReferences;
extern const Event PerfInstructionTLBMisses;
extern const Event PerfLocalMemoryReferences;
extern const Event PerfLocalMemoryMisses;
}
namespace DB
{
const char * TasksStatsCounters::metricsProviderString(MetricsProvider provider)
{
switch (provider)
{
case MetricsProvider::None:
return "none";
case MetricsProvider::Procfs:
return "procfs";
case MetricsProvider::Netlink:
return "netlink";
}
UNREACHABLE();
}
bool TasksStatsCounters::checkIfAvailable()
{
return findBestAvailableProvider() != MetricsProvider::None;
}
std::unique_ptr<TasksStatsCounters> TasksStatsCounters::create(UInt64 tid)
{
std::unique_ptr<TasksStatsCounters> instance;
if (checkIfAvailable())
instance.reset(new TasksStatsCounters(tid, findBestAvailableProvider()));
return instance;
}
TasksStatsCounters::MetricsProvider TasksStatsCounters::findBestAvailableProvider()
{
/// This initialization is thread-safe and executed once since C++11
static std::optional<MetricsProvider> provider =
[]() -> MetricsProvider
{
if (NetlinkMetricsProvider::checkPermissions())
{
return MetricsProvider::Netlink;
}
else if (ProcfsMetricsProvider::isAvailable())
{
return MetricsProvider::Procfs;
}
return MetricsProvider::None;
}();
return *provider;
}
TasksStatsCounters::TasksStatsCounters(const UInt64 tid, const MetricsProvider provider)
{
switch (provider)
{
case MetricsProvider::Netlink:
stats_getter = [metrics_provider = std::make_shared<NetlinkMetricsProvider>(), tid]()
{
::taskstats result{};
metrics_provider->getStat(result, static_cast<pid_t>(tid));
return result;
};
break;
case MetricsProvider::Procfs:
stats_getter = [metrics_provider = std::make_shared<ProcfsMetricsProvider>(tid)]()
{
::taskstats result{};
metrics_provider->getTaskStats(result);
return result;
};
break;
case MetricsProvider::None:
;
}
}
void TasksStatsCounters::reset()
{
if (stats_getter)
stats = stats_getter();
}
void TasksStatsCounters::updateCounters(ProfileEvents::Counters & profile_events)
{
if (!stats_getter)
return;
const auto new_stats = stats_getter();
incrementProfileEvents(stats, new_stats, profile_events);
stats = new_stats;
}
void TasksStatsCounters::incrementProfileEvents(const ::taskstats & prev, const ::taskstats & curr, ProfileEvents::Counters & profile_events)
{
profile_events.increment(ProfileEvents::OSCPUWaitMicroseconds,
safeDiff(prev.cpu_delay_total, curr.cpu_delay_total) / 1000U);
profile_events.increment(ProfileEvents::OSIOWaitMicroseconds,
safeDiff(prev.blkio_delay_total, curr.blkio_delay_total) / 1000U);
profile_events.increment(ProfileEvents::OSCPUVirtualTimeMicroseconds,
safeDiff(prev.cpu_run_virtual_total, curr.cpu_run_virtual_total) / 1000U);
/// Since TASKSTATS_VERSION = 3 extended accounting and IO accounting is available.
if (curr.version < 3)
return;
profile_events.increment(ProfileEvents::OSReadChars, safeDiff(prev.read_char, curr.read_char));
profile_events.increment(ProfileEvents::OSWriteChars, safeDiff(prev.write_char, curr.write_char));
profile_events.increment(ProfileEvents::OSReadBytes, safeDiff(prev.read_bytes, curr.read_bytes));
profile_events.increment(ProfileEvents::OSWriteBytes, safeDiff(prev.write_bytes, curr.write_bytes));
}
}
#endif
#if defined(OS_LINUX)
namespace DB
{
thread_local PerfEventsCounters current_thread_counters;
#define SOFTWARE_EVENT(PERF_NAME, LOCAL_NAME) \
PerfEventInfo \
{ \
.event_type = perf_type_id::PERF_TYPE_SOFTWARE, \
.event_config = (PERF_NAME), \
.profile_event = ProfileEvents::LOCAL_NAME, \
.settings_name = #LOCAL_NAME \
}
#define HARDWARE_EVENT(PERF_NAME, LOCAL_NAME) \
PerfEventInfo \
{ \
.event_type = perf_type_id::PERF_TYPE_HARDWARE, \
.event_config = (PERF_NAME), \
.profile_event = ProfileEvents::LOCAL_NAME, \
.settings_name = #LOCAL_NAME \
}
// One event for cache accesses and one for cache misses.
// Type is ACCESS or MISS
#define CACHE_EVENT(PERF_NAME, LOCAL_NAME, TYPE) \
PerfEventInfo \
{ \
.event_type = perf_type_id::PERF_TYPE_HW_CACHE, \
.event_config = (PERF_NAME) \
| (PERF_COUNT_HW_CACHE_OP_READ << 8) \
| (PERF_COUNT_HW_CACHE_RESULT_ ## TYPE << 16), \
.profile_event = ProfileEvents::LOCAL_NAME, \
.settings_name = #LOCAL_NAME \
}
// descriptions' source: http://man7.org/linux/man-pages/man2/perf_event_open.2.html
static const PerfEventInfo raw_events_info[] = {
HARDWARE_EVENT(PERF_COUNT_HW_CPU_CYCLES, PerfCpuCycles),
HARDWARE_EVENT(PERF_COUNT_HW_INSTRUCTIONS, PerfInstructions),
HARDWARE_EVENT(PERF_COUNT_HW_CACHE_REFERENCES, PerfCacheReferences),
HARDWARE_EVENT(PERF_COUNT_HW_CACHE_MISSES, PerfCacheMisses),
HARDWARE_EVENT(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, PerfBranchInstructions),
HARDWARE_EVENT(PERF_COUNT_HW_BRANCH_MISSES, PerfBranchMisses),
HARDWARE_EVENT(PERF_COUNT_HW_BUS_CYCLES, PerfBusCycles),
HARDWARE_EVENT(PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, PerfStalledCyclesFrontend),
HARDWARE_EVENT(PERF_COUNT_HW_STALLED_CYCLES_BACKEND, PerfStalledCyclesBackend),
HARDWARE_EVENT(PERF_COUNT_HW_REF_CPU_CYCLES, PerfRefCpuCycles),
// `cpu-clock` is a bit broken according to this: https://stackoverflow.com/a/56967896
SOFTWARE_EVENT(PERF_COUNT_SW_CPU_CLOCK, PerfCpuClock),
SOFTWARE_EVENT(PERF_COUNT_SW_TASK_CLOCK, PerfTaskClock),
SOFTWARE_EVENT(PERF_COUNT_SW_CONTEXT_SWITCHES, PerfContextSwitches),
SOFTWARE_EVENT(PERF_COUNT_SW_CPU_MIGRATIONS, PerfCpuMigrations),
SOFTWARE_EVENT(PERF_COUNT_SW_ALIGNMENT_FAULTS, PerfAlignmentFaults),
SOFTWARE_EVENT(PERF_COUNT_SW_EMULATION_FAULTS, PerfEmulationFaults),
// Don't add them -- they are the same as SoftPageFaults and HardPageFaults,
// match well numerically.
// SOFTWARE_EVENT(PERF_COUNT_SW_PAGE_FAULTS_MIN, PerfPageFaultsMinor),
// SOFTWARE_EVENT(PERF_COUNT_SW_PAGE_FAULTS_MAJ, PerfPageFaultsMajor),
CACHE_EVENT(PERF_COUNT_HW_CACHE_DTLB, PerfDataTLBReferences, ACCESS),
CACHE_EVENT(PERF_COUNT_HW_CACHE_DTLB, PerfDataTLBMisses, MISS),
// Apparently it doesn't make sense to treat these values as relative:
// https://stackoverflow.com/questions/49933319/how-to-interpret-perf-itlb-loads-itlb-load-misses
CACHE_EVENT(PERF_COUNT_HW_CACHE_ITLB, PerfInstructionTLBReferences, ACCESS),
CACHE_EVENT(PERF_COUNT_HW_CACHE_ITLB, PerfInstructionTLBMisses, MISS),
CACHE_EVENT(PERF_COUNT_HW_CACHE_NODE, PerfLocalMemoryReferences, ACCESS),
CACHE_EVENT(PERF_COUNT_HW_CACHE_NODE, PerfLocalMemoryMisses, MISS),
};
static_assert(sizeof(raw_events_info) / sizeof(raw_events_info[0]) == NUMBER_OF_RAW_EVENTS);
#undef HARDWARE_EVENT
#undef SOFTWARE_EVENT
#undef CACHE_EVENT
// A map of event name -> event index, to parse event list in settings.
static std::unordered_map<std::string_view, size_t> populateEventMap()
{
std::unordered_map<std::string_view, size_t> name_to_index;
name_to_index.reserve(NUMBER_OF_RAW_EVENTS);
for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
{
name_to_index.emplace(raw_events_info[i].settings_name, i);
}
return name_to_index;
}
static const auto event_name_to_index = populateEventMap();
static int openPerfEvent(perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, UInt64 flags)
{
return static_cast<int>(syscall(SYS_perf_event_open, hw_event, pid, cpu, group_fd, flags));
}
static int openPerfEventDisabled(Int32 perf_event_paranoid, bool has_cap_sys_admin, UInt32 perf_event_type, UInt64 perf_event_config)
{
perf_event_attr pe{};
pe.type = perf_event_type;
pe.size = sizeof(struct perf_event_attr);
pe.config = perf_event_config;
// disable by default to add as little extra time as possible
pe.disabled = 1;
// can record kernel only when `perf_event_paranoid` <= 1 or have CAP_SYS_ADMIN
pe.exclude_kernel = perf_event_paranoid >= 2 && !has_cap_sys_admin;
pe.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
return openPerfEvent(&pe, /* measure the calling thread */ 0, /* on any cpu */ -1, -1, 0);
}
static void enablePerfEvent(int event_fd)
{
if (ioctl(event_fd, PERF_EVENT_IOC_ENABLE, 0))
{
LOG_WARNING(&Poco::Logger::get("PerfEvents"),
"Can't enable perf event with file descriptor {}: '{}' ({})",
event_fd, errnoToString(), errno);
}
}
static void disablePerfEvent(int event_fd)
{
if (ioctl(event_fd, PERF_EVENT_IOC_DISABLE, 0))
{
LOG_WARNING(&Poco::Logger::get("PerfEvents"),
"Can't disable perf event with file descriptor {}: '{}' ({})",
event_fd, errnoToString(), errno);
}
}
static void releasePerfEvent(int event_fd)
{
if (close(event_fd))
{
LOG_WARNING(&Poco::Logger::get("PerfEvents"),
"Can't close perf event file descriptor {}: {} ({})",
event_fd, errnoToString(), errno);
}
}
static bool validatePerfEventDescriptor(int & fd)
{
if (fcntl(fd, F_GETFL) != -1)
return true;
if (errno == EBADF)
{
LOG_WARNING(&Poco::Logger::get("PerfEvents"),
"Event descriptor {} was closed from the outside; reopening", fd);
}
else
{
LOG_WARNING(&Poco::Logger::get("PerfEvents"),
"Error while checking availability of event descriptor {}: {} ({})",
fd, errnoToString(), errno);
disablePerfEvent(fd);
releasePerfEvent(fd);
}
fd = -1;
return false;
}
bool PerfEventsCounters::processThreadLocalChanges(const std::string & needed_events_list)
{
const auto valid_event_indices = eventIndicesFromString(needed_events_list);
// find state changes (if there are any)
bool old_state[NUMBER_OF_RAW_EVENTS];
for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
old_state[i] = thread_events_descriptors_holder.descriptors[i] != -1;
bool new_state[NUMBER_OF_RAW_EVENTS];
std::fill_n(new_state, NUMBER_OF_RAW_EVENTS, false);
for (size_t opened_index : valid_event_indices)
new_state[opened_index] = true;
std::vector<size_t> events_to_open;
std::vector<size_t> events_to_release;
for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
{
bool old_one = old_state[i];
bool new_one = new_state[i];
if (old_one == new_one)
{
if (old_one
&& !validatePerfEventDescriptor(
thread_events_descriptors_holder.descriptors[i]))
{
events_to_open.push_back(i);
}
continue;
}
if (new_one)
events_to_open.push_back(i);
else
events_to_release.push_back(i);
}
// release unused descriptors
for (size_t i : events_to_release)
{
int & fd = thread_events_descriptors_holder.descriptors[i];
disablePerfEvent(fd);
releasePerfEvent(fd);
fd = -1;
}
if (events_to_open.empty())
{
return true;
}
// check permissions
// cat /proc/sys/kernel/perf_event_paranoid
// -1: Allow use of (almost) all events by all users
// >=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
// >=1: Disallow CPU event access by users without CAP_SYS_ADMIN
// >=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
// >=3: Disallow all event access by users without CAP_SYS_ADMIN
Int32 perf_event_paranoid = 0;
std::ifstream paranoid_file("/proc/sys/kernel/perf_event_paranoid");
paranoid_file >> perf_event_paranoid;
bool has_cap_sys_admin = hasLinuxCapability(CAP_SYS_ADMIN);
if (perf_event_paranoid >= 3 && !has_cap_sys_admin)
{
LOG_WARNING(&Poco::Logger::get("PerfEvents"),
"Not enough permissions to record perf events: "
"perf_event_paranoid = {} and CAP_SYS_ADMIN = 0",
perf_event_paranoid);
return false;
}
// Open descriptors for new events.
// Theoretically, we can run out of file descriptors. Threads go up to 10k,
// and there might be a dozen perf events per thread, so we're looking at
// 100k open files. In practice, this is not likely -- perf events are
// mostly used in performance tests or other kinds of testing, and the
// number of threads stays below hundred.
// We used to check the number of open files by enumerating /proc/self/fd,
// but listing all open files before opening more files is obviously
// quadratic, and quadraticity never ends well.
for (size_t i : events_to_open)
{
const PerfEventInfo & event_info = raw_events_info[i];
int & fd = thread_events_descriptors_holder.descriptors[i];
// disable by default to add as little extra time as possible
fd = openPerfEventDisabled(perf_event_paranoid, has_cap_sys_admin, event_info.event_type, event_info.event_config);
if (fd == -1 && errno != ENOENT)
{
// ENOENT means that the event is not supported. Don't log it, because
// this is called for each thread and would be too verbose. Log other
// error codes because they might signify an error.
LOG_WARNING(&Poco::Logger::get("PerfEvents"),
"Failed to open perf event {} (event_type={}, event_config={}): "
"'{}' ({})", event_info.settings_name, event_info.event_type,
event_info.event_config, errnoToString(), errno);
}
}
return true;
}
// Parse comma-separated list of event names. Empty means all available events.
std::vector<size_t> PerfEventsCounters::eventIndicesFromString(const std::string & events_list)
{
std::vector<size_t> result;
result.reserve(NUMBER_OF_RAW_EVENTS);
if (events_list.empty())
{
for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
{
result.push_back(i);
}
return result;
}
std::vector<std::string> event_names;
boost::split(event_names, events_list, [](char c) { return c == ','; });
for (auto & event_name : event_names)
{
// Allow spaces at the beginning of the token, so that you can write 'a, b'.
event_name.erase(0, event_name.find_first_not_of(' '));
auto entry = event_name_to_index.find(event_name);
if (entry != event_name_to_index.end())
{
result.push_back(entry->second);
}
else
{
LOG_ERROR(&Poco::Logger::get("PerfEvents"),
"Unknown perf event name '{}' specified in settings", event_name);
}
}
return result;
}
void PerfEventsCounters::initializeProfileEvents(const std::string & events_list)
{
if (!processThreadLocalChanges(events_list))
return;
for (int fd : thread_events_descriptors_holder.descriptors)
{
if (fd == -1)
continue;
// We don't reset the event, because the time_running and time_enabled
// can't be reset anyway and we have to calculate deltas.
enablePerfEvent(fd);
}
}
void PerfEventsCounters::finalizeProfileEvents(ProfileEvents::Counters & profile_events)
{
// Disable all perf events.
for (auto fd : thread_events_descriptors_holder.descriptors)
{
if (fd == -1)
continue;
disablePerfEvent(fd);
}
// Read the counter values.
PerfEventValue current_values[NUMBER_OF_RAW_EVENTS];
for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
{
int fd = thread_events_descriptors_holder.descriptors[i];
if (fd == -1)
continue;
constexpr ssize_t bytes_to_read = sizeof(current_values[0]);
const ssize_t bytes_read = read(fd, ¤t_values[i], bytes_to_read);
if (bytes_read != bytes_to_read)
{
LOG_WARNING(&Poco::Logger::get("PerfEvents"),
"Can't read event value from file descriptor {}: '{}' ({})",
fd, errnoToString(), errno);
current_values[i] = {};
}
}
// Actually process counters' values. Track the minimal time that a performance
// counter was enabled, and the corresponding running time, to give some idea
// about the amount of counter multiplexing.
UInt64 min_enabled_time = -1;
UInt64 running_time_for_min_enabled_time = 0;
for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
{
int fd = thread_events_descriptors_holder.descriptors[i];
if (fd == -1)
continue;
const PerfEventInfo & info = raw_events_info[i];
const PerfEventValue & previous_value = previous_values[i];
const PerfEventValue & current_value = current_values[i];
// Account for counter multiplexing. time_running and time_enabled are
// not reset by PERF_EVENT_IOC_RESET, so we don't use it and calculate
// deltas from old values.
const auto enabled = current_value.time_enabled - previous_value.time_enabled;
const auto running = current_value.time_running - previous_value.time_running;
const UInt64 delta = static_cast<UInt64>(
(current_value.value - previous_value.value) * enabled / std::max(1.f, float(running)));
if (min_enabled_time > enabled)
{
min_enabled_time = enabled;
running_time_for_min_enabled_time = running;
}
profile_events.increment(info.profile_event, delta);
}
// If we had at least one enabled event, also show multiplexing-related
// statistics.
if (min_enabled_time != UInt64(-1))
{
profile_events.increment(ProfileEvents::PerfMinEnabledTime,
min_enabled_time);
profile_events.increment(ProfileEvents::PerfMinEnabledRunningTime,
running_time_for_min_enabled_time);
}
// Store current counter values for the next profiling period.
memcpy(previous_values, current_values, sizeof(current_values));
}
void PerfEventsCounters::closeEventDescriptors()
{
thread_events_descriptors_holder.releaseResources();
}
PerfDescriptorsHolder::PerfDescriptorsHolder()
{
for (int & descriptor : descriptors)
descriptor = -1;
}
PerfDescriptorsHolder::~PerfDescriptorsHolder()
{
releaseResources();
}
void PerfDescriptorsHolder::releaseResources()
{
for (int & descriptor : descriptors)
{
if (descriptor == -1)
continue;
disablePerfEvent(descriptor);
releasePerfEvent(descriptor);
descriptor = -1;
}
}
}
#else
namespace DB
{
// the functionality is disabled when we are not running on Linux.
PerfEventsCounters current_thread_counters;
}
#endif
|