summaryrefslogtreecommitdiffstats
path: root/library/cpp/yt/rseq/per_cpu.cpp
diff options
context:
space:
mode:
authorbabenko <[email protected]>2026-06-19 14:27:43 +0300
committerbabenko <[email protected]>2026-06-19 15:12:00 +0300
commit89c0e29c8f9ba29ecdc736fefda87286482ac213 (patch)
tree1adfbcd839240d8d0155771c6b775fa00a3e5f32 /library/cpp/yt/rseq/per_cpu.cpp
parent824b32f6aab5c67b2d39288b1d229eb257f248f0 (diff)
Add lock-free per-CPU primitives to library/cpp/yt/rseq
Introduce AddPerCpu and StorePerCpu over an rseq-sharded per-CPU array. On the x86-64 Linux fast path the update is committed by a hand-rolled rseq critical section (non-atomic, migration-safe): addq for the 8-byte accumulate, movq / movdqu for the 8- or 16-byte store. The kernel restarts the sequence on preemption or migration, and only one thread runs on a CPU at a time, so no atomic or lock is needed. Off the fast path (other arches, no kernel rseq) the operation falls back to an atomic on the slot indexed by sched_getcpu(). A naturally-aligned 8-byte store is single-copy atomic on x86-64, so it is never observed torn; the 16-byte store may be, which is acceptable for a last-writer-wins gauge. commit_hash:6250f6e9e35cf3895ebafe0b534ec12cca50b03b
Diffstat (limited to 'library/cpp/yt/rseq/per_cpu.cpp')
-rw-r--r--library/cpp/yt/rseq/per_cpu.cpp112
1 files changed, 112 insertions, 0 deletions
diff --git a/library/cpp/yt/rseq/per_cpu.cpp b/library/cpp/yt/rseq/per_cpu.cpp
new file mode 100644
index 00000000000..90f1fa87fb3
--- /dev/null
+++ b/library/cpp/yt/rseq/per_cpu.cpp
@@ -0,0 +1,112 @@
+#include "per_cpu.h"
+
+#if defined(__linux__)
+#include <sched.h>
+#include <unistd.h>
+
+#include <cstdio>
+#endif
+
+#include <limits>
+#include <mutex>
+
+namespace NYT::NRseq {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace NDetail {
+
+//! Parses a CPU list such as "0-3,8-11" and returns the highest CPU id plus one.
+/*!
+ *  The list enumerates CPU id ranges; the highest id + 1 is nr_cpu_ids, the exclusive upper
+ *  bound for the rseq cpu_id. This differs from _SC_NPROCESSORS_CONF (a popcount) on sparse
+ *  topologies: "0-3,8-11" yields 12 here but a count of 8.
+ *
+ *  Every maximal run of decimal digits is treated as one id; any other character (dash,
+ *  comma, whitespace, newline) is a separator. A run too large to fit is saturated rather
+ *  than overflowed, so the result never exceeds the maximum value of int.
+ *
+ *  \param list Text of the CPU list; may be empty or carry a trailing newline.
+ *  \returns The highest id found plus one, or -1 if #list contains no digits.
+ */
+int ParsePossibleCpuCount(std::string_view list)
+{
+    // Largest id we are willing to report; leaves room for the final "+ 1".
+    constexpr int MaxCpuId = std::numeric_limits<int>::max() - 1;
+
+    auto isDigit = [] (char ch) {
+        return ch >= '0' && ch <= '9';
+    };
+
+    int maxId = -1;
+    size_t index = 0;
+    while (index < list.size()) {
+        if (!isDigit(list[index])) {
+            ++index;
+            continue;
+        }
+        int value = 0;
+        for (; index < list.size() && isDigit(list[index]); ++index) {
+            int digit = list[index] - '0';
+            // Saturate instead of overflowing: value * 10 + digit <= MaxCpuId holds
+            // exactly when value <= (MaxCpuId - digit) / 10.
+            if (value > (MaxCpuId - digit) / 10) {
+                value = MaxCpuId;
+            } else {
+                value = value * 10 + digit;
+            }
+        }
+        if (value > maxId) {
+            maxId = value;
+        }
+    }
+    return maxId >= 0 ? maxId + 1 : -1;
+}
+
+#if defined(__linux__)
+
+//! Loads the kernel's possible-CPU list and converts it to nr_cpu_ids.
+/*!
+ *  \returns The value ParsePossibleCpuCount() derives from the contents of
+ *  /sys/devices/system/cpu/possible, or -1 if the file cannot be opened.
+ *
+ *  NOTE(review): at most 255 bytes are read; a longer list would be silently truncated
+ *  and could under-report the bound.
+ */
+static int TryReadPossibleCpuCount()
+{
+    constexpr size_t Capacity = 256;
+
+    // "e" is the glibc mode flag that opens the file with close-on-exec set.
+    std::FILE* source = std::fopen("/sys/devices/system/cpu/possible", "re");
+    if (source == nullptr) {
+        return -1;
+    }
+
+    // Zero-filled, and one byte is always left unused at the end.
+    char contents[Capacity] = {};
+    const size_t bytesRead = std::fread(contents, 1, Capacity - 1, source);
+    std::fclose(source);
+
+    // A failed or empty read leaves bytesRead == 0, which the parser maps to -1.
+    return ParsePossibleCpuCount(std::string_view(contents, bytesRead));
+}
+
+#endif
+
+// Size of the per-CPU slot array, published by GetCpuCount(); see the declaration in
+// per_cpu-inl.h. Starts at 0 so that, until the real size is known, the fast path's bounds
+// check rejects every cpu_id and sends the update to the safe atomic fallback.
+// NOTE(review): this is a plain int assigned once inside GetCpuCount()'s std::call_once;
+// a reader that does not go through GetCpuCount() is not synchronized with that write --
+// confirm the readers in per_cpu-inl.h tolerate this.
+constinit int CpuCount = 0;
+
+//! Picks the slot index used when an update takes the atomic fallback path.
+/*!
+ *  \returns On Linux, the id reported by sched_getcpu() folded into [0, GetCpuCount());
+ *  0 if sched_getcpu() fails or on any other platform.
+ */
+int GetFallbackCpuId()
+{
+#if defined(__linux__)
+    const int current = ::sched_getcpu();
+    if (current < 0) {
+        // The current CPU could not be determined; fall back to the first slot.
+        return 0;
+    }
+
+    const int limit = GetCpuCount();
+    if (current < limit) {
+        return current;
+    }
+
+    // Defensive: a CPU id at or beyond the configured count is wrapped back into range.
+    // Fallback-path slots are only touched atomically, so sharing a slot is safe.
+    return current % limit;
+#else
+    return 0;
+#endif
+}
+
+} // namespace NDetail
+
+////////////////////////////////////////////////////////////////////////////////
+
+//! Returns the number of per-CPU slots, computing and publishing it on first use.
+/*!
+ *  The value is determined exactly once (guarded by std::call_once) and stored in
+ *  NDetail::CpuCount; the result is always at least 1.
+ */
+int GetCpuCount()
+{
+    static std::once_flag InitFlag;
+    std::call_once(InitFlag, [] {
+        int detected = 1;
+#if defined(__linux__)
+        // Slots are indexed by the raw rseq cpu_id on the fast path, so the array must be
+        // sized to nr_cpu_ids (highest reportable CPU id plus one) rather than to the
+        // number of CPUs. The possible-CPU bitmap provides exactly that bound, which
+        // keeps every cpu_id in range.
+        const int possible = NDetail::TryReadPossibleCpuCount();
+        if (possible > 0) {
+            detected = possible;
+        } else {
+            // The bitmap could not be read (e.g. /sys masked in a container).
+            // _SC_NPROCESSORS_CONF is a count, not a cpu_id bound, so with a sparse
+            // topology some cpu_id may exceed it. The fast path's bounds check then sends
+            // those CPUs to the clamped atomic fallback: still memory-safe, although a
+            // clamped slot may see both atomic and non-atomic writes (worst case a lost
+            // counter update on such exotic setups).
+            const int configured = static_cast<int>(::sysconf(_SC_NPROCESSORS_CONF));
+            if (configured > 0) {
+                detected = configured;
+            }
+        }
+#endif
+        // Make the size visible to the fast-path bounds check before any update can
+        // index the array.
+        NDetail::CpuCount = detected;
+    });
+    return NDetail::CpuCount;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT::NRseq