1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
|
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef Y_ABSL_RANDOM_INTERNAL_NANOBENCHMARK_H_
#define Y_ABSL_RANDOM_INTERNAL_NANOBENCHMARK_H_
// Benchmarks functions of a single integer argument with realistic branch
// prediction hit rates. Uses a robust estimator to summarize the measurements.
// The precision is about 0.2%.
//
// Examples: see nanobenchmark_test.cc.
//
// Background: Microbenchmarks such as http://github.com/google/benchmark
// can measure elapsed times on the order of a microsecond. Shorter functions
// are typically measured by repeating them thousands of times and dividing
// the total elapsed time by this count. Unfortunately, repetition (especially
// with the same input parameter!) influences the runtime. In time-critical
// code, it is reasonable to expect warm instruction/data caches and TLBs,
// but a perfect record of which branches will be taken is unrealistic.
// Unless the application also repeatedly invokes the measured function with
// the same parameter, the benchmark is measuring something very different -
// a best-case result, almost as if the parameter were made a compile-time
// constant. This may lead to erroneous conclusions about branch-heavy
// algorithms outperforming branch-free alternatives.
//
// Our approach differs in three ways. Adding fences to the timer functions
// reduces variability due to instruction reordering, improving the timer
// resolution to about 40 CPU cycles. However, shorter functions must still
// be invoked repeatedly. For more realistic branch prediction performance,
// we vary the input parameter according to a user-specified distribution.
// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
// central tendency of the measurement samples with the "half sample mode",
// which is more robust to outliers and skewed data than the mean or median.
// NOTE: for compatibility with multiple translation units compiled with
// distinct flags, avoid #including headers that define functions.
#include <stddef.h>
#include <stdint.h>
#include "y_absl/base/config.h"
namespace y_absl {
Y_ABSL_NAMESPACE_BEGIN
namespace random_internal_nanobenchmark {
// Input influencing the function being measured (e.g. number of bytes to copy).
using FuncInput = size_t;
// "Proof of work" returned by Func to ensure the compiler does not elide it.
using FuncOutput = uint64_t;
// Function to measure: either 1) a captureless lambda or function with two
// arguments or 2) a lambda with capture, in which case the first argument
// is reserved for use by MeasureClosure.
using Func = FuncOutput (*)(const void*, FuncInput);
// Internal parameters that determine precision/resolution/measuring time.
struct Params {
// For measuring timer overhead/resolution. Used in a nested loop =>
// quadratic time, acceptable because we know timer overhead is "low".
// constexpr because this is used to define array bounds.
static constexpr size_t kTimerSamples = 256;
// Best-case precision, expressed as a divisor of the timer resolution.
// Larger => more calls to Func and higher precision.
size_t precision_divisor = 1024;
// Ratio between full and subset input distribution sizes. Cannot be less
// than 2; larger values increase measurement time but more faithfully
// model the given input distribution.
size_t subset_ratio = 2;
// Together with the estimated Func duration, determines how many times to
// call Func before checking the sample variability. Larger values increase
// measurement time, memory/cache use and precision.
double seconds_per_eval = 4E-3;
// The minimum number of samples before estimating the central tendency.
size_t min_samples_per_eval = 7;
// The mode is better than median for estimating the central tendency of
// skewed/fat-tailed distributions, but it requires sufficient samples
// relative to the width of half-ranges.
size_t min_mode_samples = 64;
// Maximum permissible variability (= median absolute deviation / center).
double target_rel_mad = 0.002;
// Abort after this many evals without reaching target_rel_mad. This
// prevents infinite loops.
size_t max_evals = 9;
// Retry the measure loop up to this many times.
size_t max_measure_retries = 2;
// Whether to print additional statistics to stdout.
bool verbose = true;
};
// Measurement result for each unique input.
struct Result {
FuncInput input;
// Robust estimate (mode or median) of duration.
float ticks;
// Measure of variability (median absolute deviation relative to "ticks").
float variability;
};
// Ensures the thread is running on the specified cpu, and no others.
// Reduces noise due to desynchronized socket RDTSC and context switches.
// If "cpu" is negative, pin to the currently running core.
void PinThreadToCPU(const int cpu = -1);
// Returns tick rate, useful for converting measurements to seconds. Invariant
// means the tick counter frequency is independent of CPU throttling or sleep.
// This call may be expensive, callers should cache the result.
double InvariantTicksPerSecond();
// Precisely measures the number of ticks elapsed when calling "func" with the
// given inputs, shuffled to ensure realistic branch prediction hit rates.
//
// "func" returns a 'proof of work' to ensure its computations are not elided.
// "arg" is passed to Func, or reserved for internal use by MeasureClosure.
// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
// "func". The values should be chosen to maximize coverage of "func". This
// represents a distribution, so a value's frequency should reflect its
// probability in the real application. Order does not matter; for example, a
// uniform distribution over [0, 4) could be represented as {3,0,2,1}.
// Returns how many Result were written to "results": one per unique input, or
// zero if the measurement failed (an error message goes to stderr).
size_t Measure(const Func func, const void* arg, const FuncInput* inputs,
const size_t num_inputs, Result* results,
const Params& p = Params());
// Calls operator() of the given closure (lambda function).
template <class Closure>
static FuncOutput CallClosure(const void* f, const FuncInput input) {
return (*reinterpret_cast<const Closure*>(f))(input);
}
// Same as Measure, except "closure" is typically a lambda function of
// FuncInput -> FuncOutput with a capture list.
template <class Closure>
static inline size_t MeasureClosure(const Closure& closure,
const FuncInput* inputs,
const size_t num_inputs, Result* results,
const Params& p = Params()) {
return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
reinterpret_cast<const void*>(&closure), inputs, num_inputs,
results, p);
}
} // namespace random_internal_nanobenchmark
Y_ABSL_NAMESPACE_END
} // namespace y_absl
#endif // Y_ABSL_RANDOM_INTERNAL_NANOBENCHMARK_H_
|