diff options
| author | babenko <[email protected]> | 2026-06-19 14:27:43 +0300 |
|---|---|---|
| committer | babenko <[email protected]> | 2026-06-19 15:12:00 +0300 |
| commit | 89c0e29c8f9ba29ecdc736fefda87286482ac213 (patch) | |
| tree | 1adfbcd839240d8d0155771c6b775fa00a3e5f32 /library/cpp/yt/rseq/per_cpu.cpp | |
| parent | 824b32f6aab5c67b2d39288b1d229eb257f248f0 (diff) | |
Add lock-free per-CPU primitives to library/cpp/yt/rseq
Introduce AddPerCpu and StorePerCpu over an rseq-sharded per-CPU array.
On the x86-64 Linux fast path the update is committed by a hand-rolled
rseq critical section (non-atomic, migration-safe): addq for the 8-byte
accumulate, movq / movdqu for the 8- or 16-byte store. The kernel
restarts the sequence on preemption or migration, and only one thread
runs on a CPU at a time, so no atomic or lock is needed. Off the fast
path (other arches, no kernel rseq) the operation falls back to an
atomic on the slot indexed by sched_getcpu().
A naturally-aligned 8-byte store is single-copy atomic on x86-64, so it
is never observed torn; the 16-byte store may be, which is acceptable for
a last-writer-wins gauge.
commit_hash:6250f6e9e35cf3895ebafe0b534ec12cca50b03b
Diffstat (limited to 'library/cpp/yt/rseq/per_cpu.cpp')
| -rw-r--r-- | library/cpp/yt/rseq/per_cpu.cpp | 112 |
1 files changed, 112 insertions, 0 deletions
diff --git a/library/cpp/yt/rseq/per_cpu.cpp b/library/cpp/yt/rseq/per_cpu.cpp new file mode 100644 index 00000000000..90f1fa87fb3 --- /dev/null +++ b/library/cpp/yt/rseq/per_cpu.cpp @@ -0,0 +1,112 @@ +#include "per_cpu.h" + +#if defined(__linux__) +#include <sched.h> +#include <unistd.h> + +#include <cstdio> +#endif + +#include <mutex> + +namespace NYT::NRseq { + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +int ParsePossibleCpuCount(std::string_view list) +{ + // The list enumerates CPU id ranges (e.g. "0-3,8-11"); the highest id + 1 is nr_cpu_ids, + // the exclusive upper bound for the rseq cpu_id. This differs from _SC_NPROCESSORS_CONF + // (a popcount) on sparse topologies: "0-3,8-11" yields 12 here but a count of 8. + int maxId = -1; + for (size_t index = 0; index < list.size();) { + if (list[index] < '0' || list[index] > '9') { + ++index; + continue; + } + int value = 0; + while (index < list.size() && list[index] >= '0' && list[index] <= '9') { + value = value * 10 + (list[index] - '0'); + ++index; + } + if (value > maxId) { + maxId = value; + } + } + return maxId >= 0 ? maxId + 1 : -1; +} + +#if defined(__linux__) + +//! Reads /sys/devices/system/cpu/possible and returns nr_cpu_ids, or -1 if it cannot be read. +static int TryReadPossibleCpuCount() +{ + auto* file = std::fopen("/sys/devices/system/cpu/possible", "re"); + if (!file) { + return -1; + } + char buffer[256] = {}; + size_t size = std::fread(buffer, 1, sizeof(buffer) - 1, file); + std::fclose(file); + return ParsePossibleCpuCount(std::string_view(buffer, size)); +} + +#endif + +// Published by GetCpuCount(); see the declaration in per_cpu-inl.h. Defaults to 0 so the +// fast path's bounds check sends every update to the safe atomic fallback until the size is +// known. +constinit int CpuCount = 0; + +int GetFallbackCpuId() +{ +#if defined(__linux__) + int cpuId = ::sched_getcpu(); + if (cpuId < 0) { + return 0; + } + int cpuCount = GetCpuCount(); + // Defensive: keep the index in range even if a CPU came online beyond the configured + // count. On the fallback path slots are touched atomically, so a shared slot is safe. + return cpuId < cpuCount ? cpuId : cpuId % cpuCount; +#else + return 0; +#endif +} + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +int GetCpuCount() +{ + static std::once_flag OnceFlag; + std::call_once(OnceFlag, [] { + int cpuCount = 1; +#if defined(__linux__) + // The fast path indexes the slot array by the raw rseq cpu_id, so size to the highest + // CPU id the kernel can report plus one (nr_cpu_ids), not merely the number of CPUs; + // the possible-CPU bitmap gives this exact bound, and then every cpu_id is in range. + if (int possible = NDetail::TryReadPossibleCpuCount(); possible > 0) { + cpuCount = possible; + } else { + // Bitmap unavailable (e.g. /sys masked in a container): _SC_NPROCESSORS_CONF is a + // count, not a cpu_id bound, so on a sparse topology it may be smaller than some + // cpu_id. The fast path's bounds check then routes those CPUs to the clamped + // atomic fallback -- still memory-safe, though a clamped slot may mix atomic and + // non-atomic writes (at worst a lost counter update on such exotic setups). + int configured = static_cast<int>(::sysconf(_SC_NPROCESSORS_CONF)); + cpuCount = configured > 0 ? configured : 1; + } +#endif + // Publish for the fast-path bounds check before any update can index the array. + NDetail::CpuCount = cpuCount; + }); + return NDetail::CpuCount; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NRseq |
