#pragma once
#include <util/system/types.h>
#include <concepts>
#include <cstddef>
#include <type_traits>
namespace NYT::NRseq {
// The library is Linux-only: rseq is a Linux kernel feature and the build enforces OS_LINUX
// (see ya.make); off the rseq fast path the primitives fall back to plain atomics.
////////////////////////////////////////////////////////////////////////////////
//! Returns the number of shards a per-CPU array must provide -- one per CPU.
/*!
* The value equals nr_cpu_ids (highest possible CPU id + 1). It is taken from
* /sys/devices/system/cpu/possible, so it also covers offlined and hot-pluggable CPUs.
*
* Why not a plain CPU count: the fast path indexes shards by the raw rseq cpu_id, so a count
* such as _SC_NPROCESSORS_CONF would undersize the array on sparse topologies. Such a count
* is used only as the fallback when the bitmap is unreadable.
*
* The result is always >= 1 and is cached.
*/
int GetCpuCount();
//! Returns whether the per-CPU rseq fast path is safe to use in this process.
/*!
* Background: the fast path reads the rseq area at a thread-pointer offset that is cached at
* startup. This is sound only when __rseq_abi sits at a fixed offset from the thread pointer:
*   - a glibc-owned area, or
*   - the static TLS block (incl. tcmalloc).
* It is not sound when __rseq_abi lands in a dlopen'd module's dynamically allocated TLS.
*
* Returns false in that dynamic-TLS case, and also where there is no fast path at all, so
* that callers fall back to atomics.
*
* Cost: the answer is decided once on a spawned thread and then cached, i.e. one thread
* spawn at first use.
*/
bool IsPerCpuFastPathSafe();
//! Adds |delta| to the calling CPU's slot of a per-CPU array of shards, lock-free.
/*!
* Arguments:
*   |shards| -- an array of GetCpuCount() |TShard| slots (typically cache-line padded);
*   |field|  -- selects the |TValue| member of the shard to update;
*   |delta|  -- the amount to add to that member.
*
* Type requirements: |TValue| must be an 8-byte integral type (see the requires-clause). The
* stride between slots is sizeof(TShard), which must be a multiple of 8 (checked at compile
* time) so the field stays 8-byte aligned (for a tear-free RMW).
*
* Fast path (x86-64 Linux): a non-atomic read-modify-write committed by an rseq critical
* section -- no atomic, no lock. It is safe against preemption/migration (the kernel
* restarts it) and against other threads (one thread per CPU).
*
* Fallback (non-x86-64 Linux, or no kernel rseq): an atomic fetch_add.
*
* A process uses one path consistently, so the two never mix on a slot (except on exotic
* sparse topologies; see per_cpu.cpp).
*
* The reader counterpart is #LoadPerCpu.
*
* WARNING (fiber TLS): the fast path reads the thread pointer, so reach #AddPerCpu only via a
* non-inlinable, fiber-switch-free frame (a virtual call or YT_PREVENT_TLS_CACHING; see
* library/cpp/yt/misc/tls.h).
*/
template <class TShard, class TValue>
requires std::integral<TValue> && (sizeof(TValue) == 8)
void AddPerCpu(TShard* shards, TValue TShard::* field, TValue delta);
//! Stores |value| (8 or 16 bytes) into the calling CPU's slot, lock-free.
/*!
* Arguments: |shards| and |field| have the same meaning as in #AddPerCpu; |value| is what
* gets stored into the selected member. |TValue| must be an 8- or 16-byte trivially-copyable
* type (see the requires-clause).
*
* Fast path (x86-64 Linux): an rseq-committed store -- movq for 8 bytes, movdqu for 16.
* Otherwise: relaxed atomic store(s).
*
* Tearing:
*   - an 8-byte store is single-copy atomic and is never torn;
*   - a 16-byte store is not atomic on either path, so a reader on another CPU may see the
*     halves split -- fine for a last-writer-wins gauge.
*
* WARNING (fiber TLS): same contract as #AddPerCpu.
*/
template <class TShard, class TValue>
requires (sizeof(TValue) == 8 || sizeof(TValue) == 16) && std::is_trivially_copyable_v<TValue>
void StorePerCpu(TShard* shards, TValue TShard::* field, TValue value);
//! Relaxed atomic load of slot |index| -- the reader counterpart of #AddPerCpu.
/*!
* Arguments: |shards| and |field| have the same meaning as in #AddPerCpu; |index| picks the
* slot to read and is expected to lie in [0, GetCpuCount()). |TValue| must be an 8-byte
* integral type (see the requires-clause).
*
* Returns the value read from shards[index].*field.
*
* The load is not tied to the calling CPU (there is no rseq fast path here). To aggregate a
* counter, load every slot and sum the results.
*/
template <class TShard, class TValue>
requires std::integral<TValue> && (sizeof(TValue) == 8)
TValue LoadPerCpu(const TShard* shards, TValue TShard::* field, int index);
////////////////////////////////////////////////////////////////////////////////
} // namespace NYT::NRseq
#define PER_CPU_INL_H_
#include "per_cpu-inl.h"
#undef PER_CPU_INL_H_