author    | robot-contrib <robot-contrib@yandex-team.com> | 2024-12-02 20:23:41 +0300
committer | robot-contrib <robot-contrib@yandex-team.com> | 2024-12-02 20:47:52 +0300
commit    | 81518d12414d1a5f8f1e3d3e13d884306230609b (patch)
tree      | 3eea40f5490048f98fcca4f5b22e1597d2364f54
parent    | c6bd6398f1bec61405c83f91872481e3b5e33510 (diff)
download  | ydb-81518d12414d1a5f8f1e3d3e13d884306230609b.tar.gz
Update contrib/libs/cxxsupp/builtins to 19.1.3
commit_hash:4898490dcc35775adf8be6d67c2ca83001fb5311
34 files changed, 1167 insertions, 363 deletions
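[Editor's note] For orientation: the cpu_model changes in this diff feed AArch64 function multi-versioning (FMV). The compiler emits several variants of a function, and an ifunc resolver picks one at load time from the __aarch64_cpu_features bitmask that the files below initialize. A minimal consumer-side sketch, assuming a recent clang targeting AArch64 Linux — illustrative only, not part of this commit:

    // Illustrative only: the consumer side of the feature detection this
    // commit updates. Each target_version variant is registered with an
    // ifunc resolver that consults __aarch64_cpu_features at load time.
    __attribute__((target_version("sve2"))) int dot(void) { return 2; }
    __attribute__((target_version("default"))) int dot(void) { return 1; }

    int main(void) { return dot(); } // 2 on SVE2 hardware, otherwise 1
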
diff --git a/build/sysincl/darwin.yml b/build/sysincl/darwin.yml index 06eaa50480f..d583225dfad 100644 --- a/build/sysincl/darwin.yml +++ b/build/sysincl/darwin.yml @@ -37,6 +37,7 @@ - MacTypes.h - TargetConditionals.h - architecture/byte_order.h + - arm/cpu_capabilities_public.h - asl.h - copyfile.h - crt_externs.h diff --git a/contrib/libs/cxxsupp/builtins/.yandex_meta/build.ym b/contrib/libs/cxxsupp/builtins/.yandex_meta/build.ym index 20967507889..e454ac162ac 100644 --- a/contrib/libs/cxxsupp/builtins/.yandex_meta/build.ym +++ b/contrib/libs/cxxsupp/builtins/.yandex_meta/build.ym @@ -1,6 +1,6 @@ {% extends '//builtin/bag.ym' %} -{% block current_version %}18.1.8{% endblock %} +{% block current_version %}19.1.3{% endblock %} {% block current_url %} https://github.com/llvm/llvm-project/releases/download/llvmorg-{{self.version().strip()}}/compiler-rt-{{self.version().strip()}}.src.tar.xz diff --git a/contrib/libs/cxxsupp/builtins/.yandex_meta/devtools.licenses.report b/contrib/libs/cxxsupp/builtins/.yandex_meta/devtools.licenses.report index ce8fb1f4bb7..874c592edd9 100644 --- a/contrib/libs/cxxsupp/builtins/.yandex_meta/devtools.licenses.report +++ b/contrib/libs/cxxsupp/builtins/.yandex_meta/devtools.licenses.report @@ -103,7 +103,9 @@ BELONGS ya.make aarch64/chkstk.S [1:2] aarch64/fp_mode.c [3:4] aarch64/sme-abi-init.c [1:2] + aarch64/sme-abi-vg.c [1:2] aarch64/sme-abi.S [1:2] + aarch64/sme-libc-mem-routines.S [1:2] absvdi2.c [3:4] absvsi2.c [3:4] absvti2.c [3:4] @@ -237,7 +239,9 @@ BELONGS ya.make comparedf2.c [3:4] comparesf2.c [3:4] comparetf2.c [3:4] + cpu_model/AArch64CPUFeatures.inc [3:4] cpu_model/aarch64.c [3:4] + cpu_model/aarch64.h [3:4] cpu_model/cpu_model.h [3:4] cpu_model/x86.c [3:4] crtbegin.c [3:4] @@ -261,6 +265,7 @@ BELONGS ya.make emutls.c [3:4] enable_execute_stack.c [3:4] eprintf.c [3:4] + extendbfsf2.c [3:4] extenddftf2.c [3:4] extendhfsf2.c [3:4] extendsfdf2.c [3:4] @@ -486,7 +491,9 @@ BELONGS ya.make aarch64/chkstk.S [1:2] aarch64/fp_mode.c [3:4] aarch64/sme-abi-init.c [1:2] + aarch64/sme-abi-vg.c [1:2] aarch64/sme-abi.S [1:2] + aarch64/sme-libc-mem-routines.S [1:2] absvdi2.c [3:4] absvsi2.c [3:4] absvti2.c [3:4] @@ -620,7 +627,9 @@ BELONGS ya.make comparedf2.c [3:4] comparesf2.c [3:4] comparetf2.c [3:4] + cpu_model/AArch64CPUFeatures.inc [3:4] cpu_model/aarch64.c [3:4] + cpu_model/aarch64.h [3:4] cpu_model/cpu_model.h [3:4] cpu_model/x86.c [3:4] crtbegin.c [3:4] @@ -644,6 +653,7 @@ BELONGS ya.make emutls.c [3:4] enable_execute_stack.c [3:4] eprintf.c [3:4] + extendbfsf2.c [3:4] extenddftf2.c [3:4] extendhfsf2.c [3:4] extendsfdf2.c [3:4] @@ -927,7 +937,9 @@ BELONGS ya.make aarch64/chkstk.S [3:3] aarch64/fp_mode.c [5:5] aarch64/sme-abi-init.c [3:3] + aarch64/sme-abi-vg.c [3:3] aarch64/sme-abi.S [3:3] + aarch64/sme-libc-mem-routines.S [3:3] absvdi2.c [5:5] absvsi2.c [5:5] absvti2.c [5:5] @@ -1061,7 +1073,9 @@ BELONGS ya.make comparedf2.c [5:5] comparesf2.c [5:5] comparetf2.c [5:5] + cpu_model/AArch64CPUFeatures.inc [5:5] cpu_model/aarch64.c [5:5] + cpu_model/aarch64.h [5:5] cpu_model/cpu_model.h [5:5] cpu_model/x86.c [5:5] crtbegin.c [5:5] @@ -1085,6 +1099,7 @@ BELONGS ya.make emutls.c [5:5] enable_execute_stack.c [5:5] eprintf.c [5:5] + extendbfsf2.c [5:5] extenddftf2.c [5:5] extendhfsf2.c [5:5] extendsfdf2.c [5:5] @@ -1310,7 +1325,9 @@ BELONGS ya.make aarch64/chkstk.S [3:3] aarch64/fp_mode.c [5:5] aarch64/sme-abi-init.c [3:3] + aarch64/sme-abi-vg.c [3:3] aarch64/sme-abi.S [3:3] + aarch64/sme-libc-mem-routines.S [3:3] absvdi2.c [5:5] absvsi2.c [5:5] absvti2.c 
[5:5] @@ -1444,7 +1461,9 @@ BELONGS ya.make comparedf2.c [5:5] comparesf2.c [5:5] comparetf2.c [5:5] + cpu_model/AArch64CPUFeatures.inc [5:5] cpu_model/aarch64.c [5:5] + cpu_model/aarch64.h [5:5] cpu_model/cpu_model.h [5:5] cpu_model/x86.c [5:5] crtbegin.c [5:5] @@ -1468,6 +1487,7 @@ BELONGS ya.make emutls.c [5:5] enable_execute_stack.c [5:5] eprintf.c [5:5] + extendbfsf2.c [5:5] extenddftf2.c [5:5] extendhfsf2.c [5:5] extendsfdf2.c [5:5] diff --git a/contrib/libs/cxxsupp/builtins/CODE_OWNERS.TXT b/contrib/libs/cxxsupp/builtins/CODE_OWNERS.TXT index ad136edf967..bd51a1073cc 100644 --- a/contrib/libs/cxxsupp/builtins/CODE_OWNERS.TXT +++ b/contrib/libs/cxxsupp/builtins/CODE_OWNERS.TXT @@ -67,3 +67,11 @@ D: ThreadSanitizer N: Bill Wendling E: isanbard@gmail.com D: Profile runtime library + +N: Christopher Apple, David Trevelyan +E: cja-private@pm.me, realtime.sanitizer@gmail.com +D: Realtime Sanitizer (RTSan) + +N: Alexander Shaposhnikov +E: alexander.v.shaposhnikov@gmail.com +D: Numerical Sanitizer (NSAN) diff --git a/contrib/libs/cxxsupp/builtins/README.txt b/contrib/libs/cxxsupp/builtins/README.txt index 2d213d95f33..19f26c92a0f 100644 --- a/contrib/libs/cxxsupp/builtins/README.txt +++ b/contrib/libs/cxxsupp/builtins/README.txt @@ -272,6 +272,11 @@ switch32 switch8 switchu8 +// This function generates a custom trampoline function with the specific +// realFunc and localsPtr values. +void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated, + const void* realFunc, void* localsPtr); + // There is no C interface to the *_vfp_d8_d15_regs functions. There are // called in the prolog and epilog of Thumb1 functions. When the C++ ABI use // SJLJ for exceptions, each function with a catch clause or destructors needs diff --git a/contrib/libs/cxxsupp/builtins/aarch64/sme-abi-vg.c b/contrib/libs/cxxsupp/builtins/aarch64/sme-abi-vg.c new file mode 100644 index 00000000000..20061012e16 --- /dev/null +++ b/contrib/libs/cxxsupp/builtins/aarch64/sme-abi-vg.c @@ -0,0 +1,21 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "../cpu_model/aarch64.h" + +struct FEATURES { + unsigned long long features; +}; + +extern struct FEATURES __aarch64_cpu_features; + +#if __GNUC__ >= 9 +#pragma GCC diagnostic ignored "-Wprio-ctor-dtor" +#endif +__attribute__((constructor(90))) static void get_aarch64_cpu_features(void) { + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + __init_cpu_features(); +} diff --git a/contrib/libs/cxxsupp/builtins/aarch64/sme-abi.S b/contrib/libs/cxxsupp/builtins/aarch64/sme-abi.S index d470ecaf7aa..cd8153f6067 100644 --- a/contrib/libs/cxxsupp/builtins/aarch64/sme-abi.S +++ b/contrib/libs/cxxsupp/builtins/aarch64/sme-abi.S @@ -12,11 +12,15 @@ #if !defined(__APPLE__) #define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0) #define TPIDR2_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0) +#define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features) +#define CPU_FEATS_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_cpu_features) #else // MachO requires @page/@pageoff directives because the global is defined // in a different file. Otherwise this file may fail to build. 
#define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@page #define TPIDR2_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@pageoff +#define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features)@page +#define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff #endif .arch armv9-a+sme @@ -26,9 +30,10 @@ // abort(). Note that there is no need to preserve any state before the call, // because the function does not return. DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort) -.cfi_startproc - .variant_pcs SYMBOL_NAME(do_abort) - stp x29, x30, [sp, #-32]! + .cfi_startproc + .variant_pcs SYMBOL_NAME(do_abort) + BTI_C + stp x29, x30, [sp, #-32]! cntd x0 // Store VG to a stack location that we describe with .cfi_offset str x0, [sp, #16] @@ -36,22 +41,23 @@ DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort) .cfi_offset w30, -24 .cfi_offset w29, -32 .cfi_offset 46, -16 - bl __arm_sme_state - tbz x0, #0, 2f + bl __arm_sme_state + tbz x0, #0, 2f 1: - smstop sm + smstop sm 2: // We can't make this into a tail-call because the unwinder would // need to restore the value of VG. - bl SYMBOL_NAME(abort) -.cfi_endproc + bl SYMBOL_NAME(abort) + .cfi_endproc END_COMPILERRT_FUNCTION(do_abort) // __arm_sme_state fills the result registers based on a local // that is set as part of the compiler-rt startup code. // __aarch64_has_sme_and_tpidr2_el0 DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state) - .variant_pcs __arm_sme_state + .variant_pcs __arm_sme_state + BTI_C mov x0, xzr mov x1, xzr @@ -68,7 +74,8 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state) END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_state) DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore) - .variant_pcs __arm_tpidr2_restore + .variant_pcs __arm_tpidr2_restore + BTI_C // If TPIDR2_EL0 is nonnull, the subroutine aborts in some platform-specific // manner. mrs x14, TPIDR2_EL0 @@ -103,7 +110,8 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore) END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_restore) DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save) - .variant_pcs __arm_tpidr2_restore + .variant_pcs __arm_tpidr2_restore + BTI_C // If the current thread does not have access to TPIDR2_EL0, the subroutine // does nothing. adrp x14, TPIDR2_SYMBOL @@ -143,7 +151,8 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save) END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_save) DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable) - .variant_pcs __arm_tpidr2_restore + .variant_pcs __arm_tpidr2_restore + BTI_C // If the current thread does not have access to SME, the subroutine does // nothing. adrp x14, TPIDR2_SYMBOL @@ -174,3 +183,48 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable) 0: ret END_COMPILERRT_OUTLINE_FUNCTION(__arm_za_disable) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg) + .variant_pcs __arm_get_current_vg + BTI_C + + stp x29, x30, [sp, #-16]! 
+ .cfi_def_cfa_offset 16 + mov x29, sp + .cfi_def_cfa w29, 16 + .cfi_offset w30, -8 + .cfi_offset w29, -16 + adrp x17, CPU_FEATS_SYMBOL + ldr w17, [x17, CPU_FEATS_SYMBOL_OFFSET] + tbnz w17, #30, 0f + adrp x16, TPIDR2_SYMBOL + ldrb w16, [x16, TPIDR2_SYMBOL_OFFSET] + cbz w16, 1f +0: + mov x18, x1 + bl __arm_sme_state + mov x1, x18 + and x17, x17, #0x40000000 + bfxil x17, x0, #0, #1 + cbz x17, 1f + cntd x0 + .cfi_def_cfa wsp, 16 + ldp x29, x30, [sp], #16 + .cfi_def_cfa_offset 0 + .cfi_restore w30 + .cfi_restore w29 + ret +1: + mov x0, xzr + .cfi_def_cfa wsp, 16 + ldp x29, x30, [sp], #16 + .cfi_def_cfa_offset 0 + .cfi_restore w30 + .cfi_restore w29 + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_get_current_vg) + +NO_EXEC_STACK_DIRECTIVE + +// GNU property note for BTI and PAC +GNU_PROPERTY_BTI_PAC diff --git a/contrib/libs/cxxsupp/builtins/aarch64/sme-libc-mem-routines.S b/contrib/libs/cxxsupp/builtins/aarch64/sme-libc-mem-routines.S new file mode 100644 index 00000000000..0318d9a6f1e --- /dev/null +++ b/contrib/libs/cxxsupp/builtins/aarch64/sme-libc-mem-routines.S @@ -0,0 +1,352 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Routines taken from libc/AOR_v20.02/string/aarch64 + +#include "../assembly.h" + +#ifdef __aarch64__ + +#define L(l) .L ## l + +// +// __arm_sc_memcpy / __arm_sc_memmove +// + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend1 x4 +#define dstend1 x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_lw w10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend1 +#define tmp1 x14 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy) + add srcend1, src, count + add dstend1, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend1, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend1, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend1, -8] + str A_l, [dstin] + str A_h, [dstend1, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend1, -4] + str A_lw, [dstin] + str B_lw, [dstend1, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. 
*/ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend1, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend1, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend1, -32] + ldp D_l, D_h, [srcend1, -16] + cmp count, 64 + b.hi L(copy128) + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend1, -32] + stp D_l, D_h, [dstend1, -16] + ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend1, -64] + ldp H_l, H_h, [srcend1, -48] + stp G_l, G_h, [dstend1, -64] + stp H_l, H_h, [dstend1, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend1, -32] + stp D_l, D_h, [dstend1, -16] + ret + + .p2align 4 + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp D_l, D_h, [src] + and tmp1, dstin, 15 + bic dst, dstin, 15 + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_l, E_h, [srcend1, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend1, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend1, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend1, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend1, -64] + stp A_l, A_h, [dstend1, -48] + stp B_l, B_h, [dstend1, -32] + stp C_l, C_h, [dstend1, -16] + ret + + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp D_l, D_h, [srcend1, -16] + and tmp1, dstend1, 15 + sub srcend1, srcend1, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend1, -16] + stp D_l, D_h, [dstend1, -16] + ldp B_l, B_h, [srcend1, -32] + ldp C_l, C_h, [srcend1, -48] + ldp D_l, D_h, [srcend1, -64]! + sub dstend1, dstend1, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend1, -16] + ldp A_l, A_h, [srcend1, -16] + stp B_l, B_h, [dstend1, -32] + ldp B_l, B_h, [srcend1, -32] + stp C_l, C_h, [dstend1, -48] + ldp C_l, C_h, [srcend1, -48] + stp D_l, D_h, [dstend1, -64]! + ldp D_l, D_h, [srcend1, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. 
*/ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend1, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend1, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend1, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend1, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy) + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy) + + +// +// __arm_sc_memset +// + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend2 x4 +#define zva_val x5 + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset) +#ifdef __ARM_FEATURE_SVE + mov z0.b, valw +#else + bfi valw, valw, #8, #8 + bfi valw, valw, #16, #16 + bfi val, val, #32, #32 + fmov d0, val + fmov v0.d[1], val +#endif + add dstend2, dstin, count + + cmp count, 96 + b.hi L(set_long) + cmp count, 16 + b.hs L(set_medium) + mov val, v0.D[0] + + /* Set 0..15 bytes. */ + tbz count, 3, 1f + str val, [dstin] + str val, [dstend2, -8] + ret + nop +1: tbz count, 2, 2f + str valw, [dstin] + str valw, [dstend2, -4] + ret +2: cbz count, 3f + strb valw, [dstin] + tbz count, 1, 3f + strh valw, [dstend2, -2] +3: ret + + /* Set 17..96 bytes. */ +L(set_medium): + str q0, [dstin] + tbnz count, 6, L(set96) + str q0, [dstend2, -16] + tbz count, 5, 1f + str q0, [dstin, 16] + str q0, [dstend2, -32] +1: ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + str q0, [dstin, 16] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend2, -32] + ret + + .p2align 4 +L(set_long): + and valw, valw, 255 + bic dst, dstin, 15 + str q0, [dstin] + cmp count, 160 + ccmp valw, 0, 0, hs + b.ne L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + sub count, dstend2, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): + add dst, dst, 64 + dc zva, dst + subs count, count, 64 + b.hi L(zva_loop) + stp q0, q0, [dstend2, -64] + stp q0, q0, [dstend2, -32] + ret + +L(no_zva): + sub count, dstend2, dst /* Count is 16 too large. */ + sub dst, dst, 16 /* Dst is biased by -32. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +L(no_zva_loop): + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) + stp q0, q0, [dstend2, -64] + stp q0, q0, [dstend2, -32] + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset) + +#endif // __aarch64__ diff --git a/contrib/libs/cxxsupp/builtins/aarch64/sme-libc-routines.c b/contrib/libs/cxxsupp/builtins/aarch64/sme-libc-routines.c index cd73025a19c..315490e73ea 100644 --- a/contrib/libs/cxxsupp/builtins/aarch64/sme-libc-routines.c +++ b/contrib/libs/cxxsupp/builtins/aarch64/sme-libc-routines.c @@ -1,79 +1,4 @@ -#include <stdlib.h> - -// WARNING: When building the scalar versions of these functions you need to -// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang -// from recognising a loop idiom and planting calls to memcpy! 
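[Editor's note] An aside on the warning being deleted just below: LLVM's loop-idiom pass rewrites plain byte-copy loops into memcpy calls, so the scalar C routines could only be built safely with "-mllvm -disable-loop-idiom-all". A sketch of the kind of loop that trips the transform (illustrative, not code from this commit):

    // Illustrative: at -O2, clang's loop-idiom recognition may replace this
    // loop with a call to memcpy() -- fatal for a routine that is itself
    // supposed to be a streaming-safe memcpy replacement.
    #include <stddef.h>

    void *byte_copy(void *dest, const void *src, size_t n) {
      unsigned char *d = (unsigned char *)dest;
      const unsigned char *s = (const unsigned char *)src;
      for (size_t i = 0; i < n; ++i) // recognizable memcpy idiom
        d[i] = s[i];
      return dest;
    }

Replacing these C routines with the hand-written assembly in sme-libc-mem-routines.S above sidesteps the problem entirely.
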
- -static void *__arm_sc_memcpy_fwd(void *dest, const void *src, - size_t n) __arm_streaming_compatible { - unsigned char *destp = (unsigned char *)dest; - const unsigned char *srcp = (const unsigned char *)src; - for (size_t i = 0; i < n; ++i) - destp[i] = srcp[i]; - - return dest; -} - -// If dest and src overlap then behaviour is undefined, hence we can add the -// restrict keywords here. This also matches the definition of the libc memcpy -// according to the man page. -void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src, - size_t n) __arm_streaming_compatible { - return __arm_sc_memcpy_fwd(dest, src, n); -} - -void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible { - unsigned char *destp = (unsigned char *)dest; - unsigned char c8 = (unsigned char)c; - for (size_t i = 0; i < n; ++i) - destp[i] = c8; - - return dest; -} - -static void *__arm_sc_memcpy_rev(void *dest, const void *src, - size_t n) __arm_streaming_compatible { - unsigned char *destp = (unsigned char *)dest; - const unsigned char *srcp = (const unsigned char *)src; - // TODO: Improve performance by copying larger chunks in reverse, or by - // using SVE. - while (n > 0) { - --n; - destp[n] = srcp[n]; - } - return dest; -} - -// Semantically a memmove is equivalent to the following: -// 1. Copy the entire contents of src to a temporary array that does not -// overlap with src or dest. -// 2. Copy the contents of the temporary array into dest. -void *__arm_sc_memmove(void *dest, const void *src, - size_t n) __arm_streaming_compatible { - unsigned char *destp = (unsigned char *)dest; - const unsigned char *srcp = (const unsigned char *)src; - - // If src and dest don't overlap then just invoke memcpy - if ((srcp > (destp + n)) || (destp > (srcp + n))) - return __arm_sc_memcpy_fwd(dest, src, n); - - // Overlap case 1: - // src: Low | -> | High - // dest: Low | -> | High - // Here src is always ahead of dest at a higher addres. If we first read a - // chunk of data from src we can safely write the same chunk to dest without - // corrupting future reads of src. - if (srcp > destp) - return __arm_sc_memcpy_fwd(dest, src, n); - - // Overlap case 2: - // src: Low | -> | High - // dest: Low | -> | High - // While we're in the overlap region we're always corrupting future reads of - // src when writing to dest. An efficient way to do this is to copy the data - // in reverse by starting at the highest address. - return __arm_sc_memcpy_rev(dest, src, n); -} +#include <stddef.h> const void *__arm_sc_memchr(const void *src, int c, size_t n) __arm_streaming_compatible { diff --git a/contrib/libs/cxxsupp/builtins/atomic.c b/contrib/libs/cxxsupp/builtins/atomic.c index 852bb20f086..aded25d9baa 100644 --- a/contrib/libs/cxxsupp/builtins/atomic.c +++ b/contrib/libs/cxxsupp/builtins/atomic.c @@ -12,7 +12,7 @@ // // 1) This code must work with C programs that do not link to anything // (including pthreads) and so it should not depend on any pthread -// functions. +// functions. If the user wishes to opt into using pthreads, they may do so. // 2) Atomic operations, rather than explicit mutexes, are most commonly used // on code where contended operations are rate. // @@ -56,7 +56,17 @@ static const long SPINLOCK_MASK = SPINLOCK_COUNT - 1; // defined. Each platform should define the Lock type, and corresponding // lock() and unlock() functions. 
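[Editor's note] Context for the _LIBATOMIC_USE_PTHREAD branch added above: atomic.c falls back to a fixed table of locks for atomics wider than the hardware supports, picking a lock by hashing the object's address. A paraphrased sketch of that scheme under the new pthread configuration (names follow the file, but this is not verbatim compiler-rt code):

    #include <pthread.h>
    #include <stdint.h>

    // Paraphrased sketch, not verbatim: oversized atomic operations take
    // one of these locks, chosen by address, so unrelated objects rarely
    // contend on the same lock.
    #define SPINLOCK_COUNT (1 << 10)
    static const intptr_t SPINLOCK_MASK = SPINLOCK_COUNT - 1;

    typedef pthread_mutex_t Lock;      // as in the new pthread branch
    static Lock locks[SPINLOCK_COUNT]; // zero-initialized; assumes that is
                                       // a valid unlocked mutex here

    static Lock *lock_for_pointer(void *ptr) {
      intptr_t hash = (intptr_t)ptr;
      hash >>= 4; // low bits carry little entropy for aligned objects
      return locks + (hash & SPINLOCK_MASK);
    }

    static void lock(Lock *l) { pthread_mutex_lock(l); }
    static void unlock(Lock *l) { pthread_mutex_unlock(l); }
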
//////////////////////////////////////////////////////////////////////////////// -#if defined(__FreeBSD__) || defined(__DragonFly__) +#if defined(_LIBATOMIC_USE_PTHREAD) +#include <pthread.h> +typedef pthread_mutex_t Lock; +/// Unlock a lock. This is a release operation. +__inline static void unlock(Lock *l) { pthread_mutex_unlock(l); } +/// Locks a lock. +__inline static void lock(Lock *l) { pthread_mutex_lock(l); } +/// locks for atomic operations +static Lock locks[SPINLOCK_COUNT]; + +#elif defined(__FreeBSD__) || defined(__DragonFly__) #include <errno.h> // clang-format off #include <sys/types.h> diff --git a/contrib/libs/cxxsupp/builtins/cpu_model/AArch64CPUFeatures.inc b/contrib/libs/cxxsupp/builtins/cpu_model/AArch64CPUFeatures.inc new file mode 100644 index 00000000000..e78bb88cfed --- /dev/null +++ b/contrib/libs/cxxsupp/builtins/cpu_model/AArch64CPUFeatures.inc @@ -0,0 +1,91 @@ +//===- AArch64CPUFeatures.inc - AArch64 CPU Features enum -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the CPUFeatures enum for AArch64 to facilitate better +// testing of this code between LLVM and compiler-rt, primarily that the files +// are an exact match. +// +// This file has two identical copies. The primary copy lives in LLVM and +// the other one sits in compiler-rt/lib/builtins/cpu_model directory. To make +// changes in this file, first modify the primary copy and copy it over to +// compiler-rt. compiler-rt tests will fail if the two files are not synced up. +// +//===----------------------------------------------------------------------===// + +#ifndef AARCH64_CPU_FEATURS_INC_H +#define AARCH64_CPU_FEATURS_INC_H + +// Function Multi Versioning CPU features. 
+enum CPUFeatures { + FEAT_RNG, + FEAT_FLAGM, + FEAT_FLAGM2, + FEAT_FP16FML, + FEAT_DOTPROD, + FEAT_SM4, + FEAT_RDM, + FEAT_LSE, + FEAT_FP, + FEAT_SIMD, + FEAT_CRC, + FEAT_SHA1, + FEAT_SHA2, + FEAT_SHA3, + FEAT_AES, + FEAT_PMULL, + FEAT_FP16, + FEAT_DIT, + FEAT_DPB, + FEAT_DPB2, + FEAT_JSCVT, + FEAT_FCMA, + FEAT_RCPC, + FEAT_RCPC2, + FEAT_FRINTTS, + FEAT_DGH, + FEAT_I8MM, + FEAT_BF16, + FEAT_EBF16, + FEAT_RPRES, + FEAT_SVE, + FEAT_SVE_BF16, + FEAT_SVE_EBF16, + FEAT_SVE_I8MM, + FEAT_SVE_F32MM, + FEAT_SVE_F64MM, + FEAT_SVE2, + FEAT_SVE_AES, + FEAT_SVE_PMULL128, + FEAT_SVE_BITPERM, + FEAT_SVE_SHA3, + FEAT_SVE_SM4, + FEAT_SME, + FEAT_MEMTAG, + FEAT_MEMTAG2, + FEAT_MEMTAG3, + FEAT_SB, + FEAT_PREDRES, + FEAT_SSBS, + FEAT_SSBS2, + FEAT_BTI, + FEAT_LS64, + FEAT_LS64_V, + FEAT_LS64_ACCDATA, + FEAT_WFXT, + FEAT_SME_F64, + FEAT_SME_I64, + FEAT_SME2, + FEAT_RCPC3, + FEAT_MOPS, + FEAT_MAX, + FEAT_EXT = 62, // Reserved to indicate presence of additional features field + // in __aarch64_cpu_features + FEAT_INIT // Used as flag of features initialization completion +}; + +#endif diff --git a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64.c b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64.c index 17bddfca46f..b868caa991b 100644 --- a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64.c +++ b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64.c @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#include "cpu_model.h" +#include "aarch64.h" #if !defined(__aarch64__) #error This file is intended only for aarch64-based targets @@ -53,74 +53,6 @@ _Bool __aarch64_have_lse_atomics #endif #if !defined(DISABLE_AARCH64_FMV) -// CPUFeatures must correspond to the same AArch64 features in -// AArch64TargetParser.h -enum CPUFeatures { - FEAT_RNG, - FEAT_FLAGM, - FEAT_FLAGM2, - FEAT_FP16FML, - FEAT_DOTPROD, - FEAT_SM4, - FEAT_RDM, - FEAT_LSE, - FEAT_FP, - FEAT_SIMD, - FEAT_CRC, - FEAT_SHA1, - FEAT_SHA2, - FEAT_SHA3, - FEAT_AES, - FEAT_PMULL, - FEAT_FP16, - FEAT_DIT, - FEAT_DPB, - FEAT_DPB2, - FEAT_JSCVT, - FEAT_FCMA, - FEAT_RCPC, - FEAT_RCPC2, - FEAT_FRINTTS, - FEAT_DGH, - FEAT_I8MM, - FEAT_BF16, - FEAT_EBF16, - FEAT_RPRES, - FEAT_SVE, - FEAT_SVE_BF16, - FEAT_SVE_EBF16, - FEAT_SVE_I8MM, - FEAT_SVE_F32MM, - FEAT_SVE_F64MM, - FEAT_SVE2, - FEAT_SVE_AES, - FEAT_SVE_PMULL128, - FEAT_SVE_BITPERM, - FEAT_SVE_SHA3, - FEAT_SVE_SM4, - FEAT_SME, - FEAT_MEMTAG, - FEAT_MEMTAG2, - FEAT_MEMTAG3, - FEAT_SB, - FEAT_PREDRES, - FEAT_SSBS, - FEAT_SSBS2, - FEAT_BTI, - FEAT_LS64, - FEAT_LS64_V, - FEAT_LS64_ACCDATA, - FEAT_WFXT, - FEAT_SME_F64, - FEAT_SME_I64, - FEAT_SME2, - FEAT_RCPC3, - FEAT_MOPS, - FEAT_MAX, - FEAT_EXT = 62, // Reserved to indicate presence of additional features field - // in __aarch64_cpu_features - FEAT_INIT // Used as flag of features initialization completion -}; // Architecture features used // in Function Multi Versioning diff --git a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64.h b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64.h new file mode 100644 index 00000000000..f6cbf75d582 --- /dev/null +++ b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64.h @@ -0,0 +1,21 @@ +//===-- cpu_model/aarch64.h --------------------------------------------- -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "cpu_model.h" + +#if !defined(__aarch64__) +#error This file is intended only for aarch64-based targets +#endif + +#if !defined(DISABLE_AARCH64_FMV) + +#include "AArch64CPUFeatures.inc" + +void __init_cpu_features(void); + +#endif // !defined(DISABLE_AARCH64_FMV) diff --git a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/android.inc b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/android.inc index f711431489c..a9e3594e93c 100644 --- a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/android.inc +++ b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/android.inc @@ -1,6 +1,6 @@ void __init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg) { - if (__aarch64_cpu_features.features) + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) return; // ifunc resolvers don't have hwcaps in arguments on Android API lower @@ -17,7 +17,7 @@ void __init_cpu_features_resolver(unsigned long hwcap, void CONSTRUCTOR_ATTRIBUTE __init_cpu_features(void) { // CPU features already initialized. - if (__aarch64_cpu_features.features) + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) return; // Don't set any CPU features, diff --git a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/apple.inc b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/apple.inc index 0bb755f4b30..f0694900f23 100644 --- a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/apple.inc +++ b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/apple.inc @@ -1,8 +1,27 @@ #include <TargetConditionals.h> #if TARGET_OS_OSX || TARGET_OS_IPHONE -#include <dispatch/dispatch.h> #include <sys/sysctl.h> +#if __has_include(<arm/cpu_capabilities_public.h>) +#include <arm/cpu_capabilities_public.h> +#define HAS_CPU_CAPABILITIES_PUBLIC_H 1 + +// FB13964283 - A few of these didn't make it into the public SDK yet. +#ifndef CAP_BIT_FEAT_SME +#define CAP_BIT_FEAT_SME 40 +#endif +#ifndef CAP_BIT_FEAT_SME2 +#define CAP_BIT_FEAT_SME2 41 +#endif +#ifndef CAP_BIT_FEAT_SME_F64F64 +#define CAP_BIT_FEAT_SME_F64F64 42 +#endif +#ifndef CAP_BIT_FEAT_SME_I16I64 +#define CAP_BIT_FEAT_SME_I16I64 43 +#endif + +#endif + static bool isKnownAndSupported(const char *name) { int32_t val = 0; size_t size = sizeof(val); @@ -11,61 +30,130 @@ static bool isKnownAndSupported(const char *name) { return val; } +static uint64_t deriveImplicitFeatures(uint64_t features) { + // FEAT_SSBS2 implies FEAT_SSBS + if ((1ULL << FEAT_SSBS2) & features) + features |= (1ULL << FEAT_SSBS); + + // FEAT_FP is always enabled + features |= (1ULL << FEAT_FP); + + features |= (1ULL << FEAT_INIT); + + return features; +} + void __init_cpu_features_resolver(void) { // On Darwin platforms, this may be called concurrently by multiple threads // because the resolvers that use it are called lazily at runtime (unlike on // ELF platforms, where IFuncs are resolved serially at load time). This // function's effect on __aarch64_cpu_features must be idempotent. 
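[Editor's note] The idempotency contract spelled out above is the recurring theme of this update: every resolver now reads the features word with a relaxed atomic load before doing any work, and publishes only a fully computed bitmask with a single atomic store. A condensed sketch of the pattern (simplified names; not verbatim from any one .inc file):

    #include <stdint.h>

    // Simplified sketch of the init pattern applied across android.inc,
    // freebsd.inc, fuchsia.inc, mrs.inc, sysauxv.inc and apple.inc.
    static uint64_t cpu_features; // stands in for
                                  // __aarch64_cpu_features.features

    static uint64_t detect_features(void) {
      return 0; // placeholder for the platform-specific probing
    }

    void init_cpu_features_once(void) {
      // Nonzero means detection already completed: FEAT_INIT is always set
      // on success, so zero reliably encodes "not yet initialized".
      if (__atomic_load_n(&cpu_features, __ATOMIC_RELAXED))
        return;
      uint64_t feat = detect_features() | (1ULL << 63); // 63 == FEAT_INIT
                                                        // per the enum above
      // Publish only the finished value: racing callers either see 0 and
      // redo the idempotent detection, or see the complete bitmask.
      __atomic_store_n(&cpu_features, feat, __ATOMIC_RELAXED);
    }
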
- if (!__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) { - uint64_t features = 0; - - // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics - static const struct { - const char *sysctl_name; - enum CPUFeatures feature; - } feature_checks[] = { - {"hw.optional.arm.FEAT_FlagM", FEAT_FLAGM}, - {"hw.optional.arm.FEAT_FlagM2", FEAT_FLAGM2}, - {"hw.optional.arm.FEAT_FHM", FEAT_FP16FML}, - {"hw.optional.arm.FEAT_DotProd", FEAT_DOTPROD}, - {"hw.optional.arm.FEAT_RDM", FEAT_RDM}, - {"hw.optional.arm.FEAT_LSE", FEAT_LSE}, - {"hw.optional.floatingpoint", FEAT_FP}, - {"hw.optional.AdvSIMD", FEAT_SIMD}, - {"hw.optional.armv8_crc32", FEAT_CRC}, - {"hw.optional.arm.FEAT_SHA1", FEAT_SHA1}, - {"hw.optional.arm.FEAT_SHA256", FEAT_SHA2}, - {"hw.optional.arm.FEAT_SHA3", FEAT_SHA3}, - {"hw.optional.arm.FEAT_AES", FEAT_AES}, - {"hw.optional.arm.FEAT_PMULL", FEAT_PMULL}, - {"hw.optional.arm.FEAT_FP16", FEAT_FP16}, - {"hw.optional.arm.FEAT_DIT", FEAT_DIT}, - {"hw.optional.arm.FEAT_DPB", FEAT_DPB}, - {"hw.optional.arm.FEAT_DPB2", FEAT_DPB2}, - {"hw.optional.arm.FEAT_JSCVT", FEAT_JSCVT}, - {"hw.optional.arm.FEAT_FCMA", FEAT_FCMA}, - {"hw.optional.arm.FEAT_LRCPC", FEAT_RCPC}, - {"hw.optional.arm.FEAT_LRCPC2", FEAT_RCPC2}, - {"hw.optional.arm.FEAT_FRINTTS", FEAT_FRINTTS}, - {"hw.optional.arm.FEAT_I8MM", FEAT_I8MM}, - {"hw.optional.arm.FEAT_BF16", FEAT_BF16}, - {"hw.optional.arm.FEAT_SB", FEAT_SB}, - {"hw.optional.arm.FEAT_SPECRES", FEAT_PREDRES}, - {"hw.optional.arm.FEAT_SSBS", FEAT_SSBS2}, - {"hw.optional.arm.FEAT_BTI", FEAT_BTI}, - }; - - for (size_t I = 0, E = sizeof(feature_checks) / sizeof(feature_checks[0]); - I != E; ++I) - if (isKnownAndSupported(feature_checks[I].sysctl_name)) - features |= (1ULL << feature_checks[I].feature); - - features |= (1ULL << FEAT_INIT); + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + uint64_t features = 0; + +#ifdef HAS_CPU_CAPABILITIES_PUBLIC_H + uint8_t feats_bitvec[(CAP_BIT_NB + 7) / 8] = {0}; + size_t len = sizeof(feats_bitvec); + // When hw.optional.arm.feats is available (macOS 15.0+, iOS 18.0+), use the + // fast path to get all the feature bits, otherwise fall back to the slow + // ~20-something sysctls path. 
+ if (!sysctlbyname("hw.optional.arm.caps", &feats_bitvec, &len, 0, 0)) { + +#define CHECK_BIT(FROM, TO) \ + do { \ + if (feats_bitvec[FROM / 8] & (1u << ((FROM) & 7))) { \ + features |= (1ULL << TO); \ + } \ + } while (0) + + CHECK_BIT(CAP_BIT_FEAT_FlagM, FEAT_FLAGM); + CHECK_BIT(CAP_BIT_FEAT_FlagM2, FEAT_FLAGM2); + CHECK_BIT(CAP_BIT_FEAT_FHM, FEAT_FP16FML); + CHECK_BIT(CAP_BIT_FEAT_DotProd, FEAT_DOTPROD); + CHECK_BIT(CAP_BIT_FEAT_SHA3, FEAT_SHA3); + CHECK_BIT(CAP_BIT_FEAT_RDM, FEAT_RDM); + CHECK_BIT(CAP_BIT_FEAT_LSE, FEAT_LSE); + CHECK_BIT(CAP_BIT_FEAT_SHA256, FEAT_SHA2); + CHECK_BIT(CAP_BIT_FEAT_SHA1, FEAT_SHA1); + CHECK_BIT(CAP_BIT_FEAT_AES, FEAT_AES); + CHECK_BIT(CAP_BIT_FEAT_PMULL, FEAT_PMULL); + CHECK_BIT(CAP_BIT_FEAT_SPECRES, FEAT_PREDRES); + CHECK_BIT(CAP_BIT_FEAT_SB, FEAT_SB); + CHECK_BIT(CAP_BIT_FEAT_FRINTTS, FEAT_FRINTTS); + CHECK_BIT(CAP_BIT_FEAT_LRCPC, FEAT_RCPC); + CHECK_BIT(CAP_BIT_FEAT_LRCPC2, FEAT_RCPC2); + CHECK_BIT(CAP_BIT_FEAT_FCMA, FEAT_FCMA); + CHECK_BIT(CAP_BIT_FEAT_JSCVT, FEAT_JSCVT); + CHECK_BIT(CAP_BIT_FEAT_DPB, FEAT_DPB); + CHECK_BIT(CAP_BIT_FEAT_DPB2, FEAT_DPB2); + CHECK_BIT(CAP_BIT_FEAT_BF16, FEAT_BF16); + CHECK_BIT(CAP_BIT_FEAT_I8MM, FEAT_I8MM); + CHECK_BIT(CAP_BIT_FEAT_DIT, FEAT_DIT); + CHECK_BIT(CAP_BIT_FEAT_FP16, FEAT_FP16); + CHECK_BIT(CAP_BIT_FEAT_SSBS, FEAT_SSBS2); + CHECK_BIT(CAP_BIT_FEAT_BTI, FEAT_BTI); + CHECK_BIT(CAP_BIT_AdvSIMD, FEAT_SIMD); + CHECK_BIT(CAP_BIT_CRC32, FEAT_CRC); + CHECK_BIT(CAP_BIT_FEAT_SME, FEAT_SME); + CHECK_BIT(CAP_BIT_FEAT_SME2, FEAT_SME2); + CHECK_BIT(CAP_BIT_FEAT_SME_F64F64, FEAT_SME_F64); + CHECK_BIT(CAP_BIT_FEAT_SME_I16I64, FEAT_SME_I64); + + features = deriveImplicitFeatures(features); __atomic_store(&__aarch64_cpu_features.features, &features, __ATOMIC_RELAXED); + return; } +#endif + + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics + static const struct { + const char *sysctl_name; + enum CPUFeatures feature; + } feature_checks[] = { + {"hw.optional.arm.FEAT_FlagM", FEAT_FLAGM}, + {"hw.optional.arm.FEAT_FlagM2", FEAT_FLAGM2}, + {"hw.optional.arm.FEAT_FHM", FEAT_FP16FML}, + {"hw.optional.arm.FEAT_DotProd", FEAT_DOTPROD}, + {"hw.optional.arm.FEAT_RDM", FEAT_RDM}, + {"hw.optional.arm.FEAT_LSE", FEAT_LSE}, + {"hw.optional.AdvSIMD", FEAT_SIMD}, + {"hw.optional.armv8_crc32", FEAT_CRC}, + {"hw.optional.arm.FEAT_SHA1", FEAT_SHA1}, + {"hw.optional.arm.FEAT_SHA256", FEAT_SHA2}, + {"hw.optional.arm.FEAT_SHA3", FEAT_SHA3}, + {"hw.optional.arm.FEAT_AES", FEAT_AES}, + {"hw.optional.arm.FEAT_PMULL", FEAT_PMULL}, + {"hw.optional.arm.FEAT_FP16", FEAT_FP16}, + {"hw.optional.arm.FEAT_DIT", FEAT_DIT}, + {"hw.optional.arm.FEAT_DPB", FEAT_DPB}, + {"hw.optional.arm.FEAT_DPB2", FEAT_DPB2}, + {"hw.optional.arm.FEAT_JSCVT", FEAT_JSCVT}, + {"hw.optional.arm.FEAT_FCMA", FEAT_FCMA}, + {"hw.optional.arm.FEAT_LRCPC", FEAT_RCPC}, + {"hw.optional.arm.FEAT_LRCPC2", FEAT_RCPC2}, + {"hw.optional.arm.FEAT_FRINTTS", FEAT_FRINTTS}, + {"hw.optional.arm.FEAT_I8MM", FEAT_I8MM}, + {"hw.optional.arm.FEAT_BF16", FEAT_BF16}, + {"hw.optional.arm.FEAT_SB", FEAT_SB}, + {"hw.optional.arm.FEAT_SPECRES", FEAT_PREDRES}, + {"hw.optional.arm.FEAT_SSBS", FEAT_SSBS2}, + {"hw.optional.arm.FEAT_BTI", FEAT_BTI}, + }; + + for (size_t I = 0, E = sizeof(feature_checks) / sizeof(feature_checks[0]); + I != E; ++I) + if (isKnownAndSupported(feature_checks[I].sysctl_name)) + features |= (1ULL << feature_checks[I].feature); + + features = deriveImplicitFeatures(features); + + 
__atomic_store(&__aarch64_cpu_features.features, &features, + __ATOMIC_RELAXED); } #endif // TARGET_OS_OSX || TARGET_OS_IPHONE diff --git a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/freebsd.inc b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/freebsd.inc index 793adef44b9..aa975dc854f 100644 --- a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/freebsd.inc +++ b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/freebsd.inc @@ -1,6 +1,6 @@ void __init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg) { - if (__aarch64_cpu_features.features) + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) return; __init_cpu_features_constructor(hwcap, arg); @@ -10,7 +10,7 @@ void CONSTRUCTOR_ATTRIBUTE __init_cpu_features(void) { unsigned long hwcap = 0; unsigned long hwcap2 = 0; // CPU features already initialized. - if (__aarch64_cpu_features.features) + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) return; int res = 0; diff --git a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/fuchsia.inc b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/fuchsia.inc index 329b6b43a8a..fd0800dd11e 100644 --- a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/fuchsia.inc +++ b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/fuchsia.inc @@ -2,7 +2,7 @@ #include <zircon/syscalls.h> void __init_cpu_features_resolver() { - if (__aarch64_cpu_features.features) + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) return; // This ensures the vDSO is a direct link-time dependency of anything that @@ -13,8 +13,8 @@ void __init_cpu_features_resolver() { if (status != ZX_OK) return; -#define setCPUFeature(cpu_feature) \ - __aarch64_cpu_features.features |= 1ULL << cpu_feature + unsigned long long feat = 0; +#define setCPUFeature(cpu_feature) feat |= 1ULL << cpu_feature if (features & ZX_ARM64_FEATURE_ISA_FP) setCPUFeature(FEAT_FP); @@ -48,4 +48,6 @@ void __init_cpu_features_resolver() { setCPUFeature(FEAT_SVE); setCPUFeature(FEAT_INIT); + + __atomic_store_n(&__aarch64_cpu_features.features, feat, __ATOMIC_RELAXED); } diff --git a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/mrs.inc b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/mrs.inc index 32a21a2fba9..e4d5e7f2bd7 100644 --- a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/mrs.inc +++ b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/mrs.inc @@ -3,11 +3,10 @@ #define HAVE_SYS_AUXV_H #endif - - static void __init_cpu_features_constructor(unsigned long hwcap, const __ifunc_arg_t *arg) { -#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F + unsigned long long feat = 0; +#define setCPUFeature(F) feat |= 1ULL << F #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr)) #define extractBits(val, start, number) \ (val & ((1ULL << number) - 1ULL) << start) >> start @@ -20,26 +19,20 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_PMULL); if (hwcap & HWCAP_FLAGM) setCPUFeature(FEAT_FLAGM); - if (hwcap2 & HWCAP2_FLAGM2) { - setCPUFeature(FEAT_FLAGM); + if (hwcap2 & HWCAP2_FLAGM2) setCPUFeature(FEAT_FLAGM2); - } - if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4) + if (hwcap & HWCAP_SM4) setCPUFeature(FEAT_SM4); if (hwcap & HWCAP_ASIMDDP) setCPUFeature(FEAT_DOTPROD); if (hwcap & HWCAP_ASIMDFHM) setCPUFeature(FEAT_FP16FML); - if (hwcap & HWCAP_FPHP) { + if (hwcap & HWCAP_FPHP) setCPUFeature(FEAT_FP16); - setCPUFeature(FEAT_FP); - } if (hwcap & HWCAP_DIT) setCPUFeature(FEAT_DIT); if 
(hwcap & HWCAP_ASIMDRDM) setCPUFeature(FEAT_RDM); - if (hwcap & HWCAP_ILRCPC) - setCPUFeature(FEAT_RCPC2); if (hwcap & HWCAP_AES) setCPUFeature(FEAT_AES); if (hwcap & HWCAP_SHA1) @@ -52,23 +45,20 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_FCMA); if (hwcap & HWCAP_SB) setCPUFeature(FEAT_SB); - if (hwcap & HWCAP_SSBS) + if (hwcap & HWCAP_SSBS) { + setCPUFeature(FEAT_SSBS); setCPUFeature(FEAT_SSBS2); + } if (hwcap2 & HWCAP2_MTE) { setCPUFeature(FEAT_MEMTAG); setCPUFeature(FEAT_MEMTAG2); } - if (hwcap2 & HWCAP2_MTE3) { - setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); + if (hwcap2 & HWCAP2_MTE3) setCPUFeature(FEAT_MEMTAG3); - } if (hwcap2 & HWCAP2_SVEAES) setCPUFeature(FEAT_SVE_AES); - if (hwcap2 & HWCAP2_SVEPMULL) { - setCPUFeature(FEAT_SVE_AES); + if (hwcap2 & HWCAP2_SVEPMULL) setCPUFeature(FEAT_SVE_PMULL128); - } if (hwcap2 & HWCAP2_SVEBITPERM) setCPUFeature(FEAT_SVE_BITPERM); if (hwcap2 & HWCAP2_SVESHA3) @@ -105,6 +95,8 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_WFXT); if (hwcap2 & HWCAP2_SME) setCPUFeature(FEAT_SME); + if (hwcap2 & HWCAP2_SME2) + setCPUFeature(FEAT_SME2); if (hwcap2 & HWCAP2_SME_I16I64) setCPUFeature(FEAT_SME_I64); if (hwcap2 & HWCAP2_SME_F64F64) @@ -113,86 +105,45 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_MOPS); if (hwcap & HWCAP_CPUID) { unsigned long ftr; - getCPUFeature(ID_AA64PFR1_EL1, ftr); - // ID_AA64PFR1_EL1.MTE >= 0b0001 - if (extractBits(ftr, 8, 4) >= 0x1) - setCPUFeature(FEAT_MEMTAG); - // ID_AA64PFR1_EL1.SSBS == 0b0001 - if (extractBits(ftr, 4, 4) == 0x1) - setCPUFeature(FEAT_SSBS); - // ID_AA64PFR1_EL1.SME == 0b0010 - if (extractBits(ftr, 24, 4) == 0x2) - setCPUFeature(FEAT_SME2); - getCPUFeature(ID_AA64PFR0_EL1, ftr); - // ID_AA64PFR0_EL1.FP != 0b1111 - if (extractBits(ftr, 16, 4) != 0xF) { - setCPUFeature(FEAT_FP); - // ID_AA64PFR0_EL1.AdvSIMD has the same value as ID_AA64PFR0_EL1.FP - setCPUFeature(FEAT_SIMD); - } - // ID_AA64PFR0_EL1.SVE != 0b0000 - if (extractBits(ftr, 32, 4) != 0x0) { - // get ID_AA64ZFR0_EL1, that name supported - // if sve enabled only - getCPUFeature(S3_0_C0_C4_4, ftr); - // ID_AA64ZFR0_EL1.SVEver == 0b0000 - if (extractBits(ftr, 0, 4) == 0x0) - setCPUFeature(FEAT_SVE); - // ID_AA64ZFR0_EL1.SVEver == 0b0001 - if (extractBits(ftr, 0, 4) == 0x1) - setCPUFeature(FEAT_SVE2); - // ID_AA64ZFR0_EL1.BF16 != 0b0000 - if (extractBits(ftr, 20, 4) != 0x0) - setCPUFeature(FEAT_SVE_BF16); - } - getCPUFeature(ID_AA64ISAR0_EL1, ftr); - // ID_AA64ISAR0_EL1.SHA3 != 0b0000 - if (extractBits(ftr, 32, 4) != 0x0) - setCPUFeature(FEAT_SHA3); + getCPUFeature(ID_AA64ISAR1_EL1, ftr); - // ID_AA64ISAR1_EL1.DPB >= 0b0001 - if (extractBits(ftr, 0, 4) >= 0x1) - setCPUFeature(FEAT_DPB); - // ID_AA64ISAR1_EL1.LRCPC != 0b0000 - if (extractBits(ftr, 20, 4) != 0x0) - setCPUFeature(FEAT_RCPC); - // ID_AA64ISAR1_EL1.LRCPC == 0b0011 - if (extractBits(ftr, 20, 4) == 0x3) - setCPUFeature(FEAT_RCPC3); - // ID_AA64ISAR1_EL1.SPECRES == 0b0001 - if (extractBits(ftr, 40, 4) == 0x2) + /* ID_AA64ISAR1_EL1.SPECRES >= 0b0001 */ + if (extractBits(ftr, 40, 4) >= 0x1) setCPUFeature(FEAT_PREDRES); - // ID_AA64ISAR1_EL1.BF16 != 0b0000 - if (extractBits(ftr, 44, 4) != 0x0) - setCPUFeature(FEAT_BF16); - // ID_AA64ISAR1_EL1.LS64 >= 0b0001 + /* ID_AA64ISAR1_EL1.LS64 >= 0b0001 */ if (extractBits(ftr, 60, 4) >= 0x1) setCPUFeature(FEAT_LS64); - // ID_AA64ISAR1_EL1.LS64 >= 0b0010 + /* ID_AA64ISAR1_EL1.LS64 >= 0b0010 */ if (extractBits(ftr, 60, 4) >= 
0x2) setCPUFeature(FEAT_LS64_V); - // ID_AA64ISAR1_EL1.LS64 >= 0b0011 + /* ID_AA64ISAR1_EL1.LS64 >= 0b0011 */ if (extractBits(ftr, 60, 4) >= 0x3) setCPUFeature(FEAT_LS64_ACCDATA); - } else { - // Set some features in case of no CPUID support - if (hwcap & (HWCAP_FP | HWCAP_FPHP)) { - setCPUFeature(FEAT_FP); - // FP and AdvSIMD fields have the same value - setCPUFeature(FEAT_SIMD); - } - if (hwcap & HWCAP_DCPOP || hwcap2 & HWCAP2_DCPODP) - setCPUFeature(FEAT_DPB); - if (hwcap & HWCAP_LRCPC || hwcap & HWCAP_ILRCPC) - setCPUFeature(FEAT_RCPC); - if (hwcap2 & HWCAP2_BF16 || hwcap2 & HWCAP2_EBF16) - setCPUFeature(FEAT_BF16); - if (hwcap2 & HWCAP2_SVEBF16) - setCPUFeature(FEAT_SVE_BF16); - if (hwcap2 & HWCAP2_SVE2 && hwcap & HWCAP_SVE) - setCPUFeature(FEAT_SVE2); - if (hwcap & HWCAP_SHA3) - setCPUFeature(FEAT_SHA3); } + if (hwcap & HWCAP_FP) { + setCPUFeature(FEAT_FP); + // FP and AdvSIMD fields have the same value + setCPUFeature(FEAT_SIMD); + } + if (hwcap & HWCAP_DCPOP) + setCPUFeature(FEAT_DPB); + if (hwcap & HWCAP_LRCPC) + setCPUFeature(FEAT_RCPC); + if (hwcap & HWCAP_ILRCPC) + setCPUFeature(FEAT_RCPC2); + if (hwcap2 & HWCAP2_LRCPC3) + setCPUFeature(FEAT_RCPC3); + if (hwcap2 & HWCAP2_BF16) + setCPUFeature(FEAT_BF16); + if (hwcap2 & HWCAP2_SVEBF16) + setCPUFeature(FEAT_SVE_BF16); + if (hwcap & HWCAP_SVE) + setCPUFeature(FEAT_SVE); + if (hwcap2 & HWCAP2_SVE2) + setCPUFeature(FEAT_SVE2); + if (hwcap & HWCAP_SHA3) + setCPUFeature(FEAT_SHA3); setCPUFeature(FEAT_INIT); + + __atomic_store_n(&__aarch64_cpu_features.features, feat, __ATOMIC_RELAXED); } diff --git a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/sysauxv.inc b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/sysauxv.inc index fb5722c4306..486f77a1e4d 100644 --- a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/sysauxv.inc +++ b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/fmv/sysauxv.inc @@ -1,13 +1,13 @@ void __init_cpu_features_resolver(unsigned long hwcap, const __ifunc_arg_t *arg) { - if (__aarch64_cpu_features.features) + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) return; __init_cpu_features_constructor(hwcap, arg); } void CONSTRUCTOR_ATTRIBUTE __init_cpu_features(void) { // CPU features already initialized. 
- if (__aarch64_cpu_features.features) + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) return; unsigned long hwcap = getauxval(AT_HWCAP); diff --git a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/hwcap.inc b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/hwcap.inc index 7ddc125b26d..41aba82ef95 100644 --- a/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/hwcap.inc +++ b/contrib/libs/cxxsupp/builtins/cpu_model/aarch64/hwcap.inc @@ -178,6 +178,12 @@ #ifndef HWCAP2_SVE_EBF16 #define HWCAP2_SVE_EBF16 (1ULL << 33) #endif +#ifndef HWCAP2_SME2 +#define HWCAP2_SME2 (1UL << 37) +#endif #ifndef HWCAP2_MOPS #define HWCAP2_MOPS (1ULL << 43) #endif +#ifndef HWCAP2_LRCPC3 +#define HWCAP2_LRCPC3 (1UL << 46) +#endif diff --git a/contrib/libs/cxxsupp/builtins/cpu_model/x86.c b/contrib/libs/cxxsupp/builtins/cpu_model/x86.c index 0750e29f989..b1c4abd9d11 100644 --- a/contrib/libs/cxxsupp/builtins/cpu_model/x86.c +++ b/contrib/libs/cxxsupp/builtins/cpu_model/x86.c @@ -59,6 +59,7 @@ enum ProcessorTypes { INTEL_SIERRAFOREST, INTEL_GRANDRIDGE, INTEL_CLEARWATERFOREST, + AMDFAM1AH, CPU_TYPE_MAX }; @@ -97,6 +98,7 @@ enum ProcessorSubtypes { INTEL_COREI7_ARROWLAKE, INTEL_COREI7_ARROWLAKE_S, INTEL_COREI7_PANTHERLAKE, + AMDFAM1AH_ZNVER5, CPU_SUBTYPE_MAX }; @@ -139,20 +141,88 @@ enum ProcessorFeatures { FEATURE_AVX512BITALG, FEATURE_AVX512BF16, FEATURE_AVX512VP2INTERSECT, - - FEATURE_CMPXCHG16B = 46, - FEATURE_F16C = 49, + // FIXME: Below Features has some missings comparing to gcc, it's because gcc + // has some not one-to-one mapped in llvm. + // FEATURE_3DNOW, + // FEATURE_3DNOWP, + FEATURE_ADX = 40, + // FEATURE_ABM, + FEATURE_CLDEMOTE = 42, + FEATURE_CLFLUSHOPT, + FEATURE_CLWB, + FEATURE_CLZERO, + FEATURE_CMPXCHG16B, + // FIXME: Not adding FEATURE_CMPXCHG8B is a workaround to make 'generic' as + // a cpu string with no X86_FEATURE_COMPAT features, which is required in + // current implementantion of cpu_specific/cpu_dispatch FMV feature. 
+ // FEATURE_CMPXCHG8B, + FEATURE_ENQCMD = 48, + FEATURE_F16C, + FEATURE_FSGSBASE, + // FEATURE_FXSAVE, + // FEATURE_HLE, + // FEATURE_IBT, FEATURE_LAHF_LM = 54, FEATURE_LM, - FEATURE_WP, + FEATURE_LWP, FEATURE_LZCNT, FEATURE_MOVBE, - - FEATURE_AVX512FP16 = 94, + FEATURE_MOVDIR64B, + FEATURE_MOVDIRI, + FEATURE_MWAITX, + // FEATURE_OSXSAVE, + FEATURE_PCONFIG = 63, + FEATURE_PKU, + FEATURE_PREFETCHWT1, + FEATURE_PRFCHW, + FEATURE_PTWRITE, + FEATURE_RDPID, + FEATURE_RDRND, + FEATURE_RDSEED, + FEATURE_RTM, + FEATURE_SERIALIZE, + FEATURE_SGX, + FEATURE_SHA, + FEATURE_SHSTK, + FEATURE_TBM, + FEATURE_TSXLDTRK, + FEATURE_VAES, + FEATURE_WAITPKG, + FEATURE_WBNOINVD, + FEATURE_XSAVE, + FEATURE_XSAVEC, + FEATURE_XSAVEOPT, + FEATURE_XSAVES, + FEATURE_AMX_TILE, + FEATURE_AMX_INT8, + FEATURE_AMX_BF16, + FEATURE_UINTR, + FEATURE_HRESET, + FEATURE_KL, + // FEATURE_AESKLE, + FEATURE_WIDEKL = 92, + FEATURE_AVXVNNI, + FEATURE_AVX512FP16, FEATURE_X86_64_BASELINE, FEATURE_X86_64_V2, FEATURE_X86_64_V3, FEATURE_X86_64_V4, + FEATURE_AVXIFMA, + FEATURE_AVXVNNIINT8, + FEATURE_AVXNECONVERT, + FEATURE_CMPCCXADD, + FEATURE_AMX_FP16, + FEATURE_PREFETCHI, + FEATURE_RAOINT, + FEATURE_AMX_COMPLEX, + FEATURE_AVXVNNIINT16, + FEATURE_SM3, + FEATURE_SHA512, + FEATURE_SM4, + FEATURE_APXF, + FEATURE_USERMSR, + FEATURE_AVX10_1_256, + FEATURE_AVX10_1_512, CPU_FEATURE_MAX }; @@ -299,13 +369,13 @@ static void detectX86FamilyModel(unsigned EAX, unsigned *Family, } } +#define testFeature(F) (Features[F / 32] & (1 << (F % 32))) != 0 + static const char *getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, const unsigned *Features, unsigned *Type, unsigned *Subtype) { -#define testFeature(F) (Features[F / 32] & (1 << (F % 32))) != 0 - // We select CPU strings to match the code in Host.cpp, but we don't use them // in compiler-rt. const char *CPU = 0; @@ -594,14 +664,48 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family, const unsigned *Features, unsigned *Type, unsigned *Subtype) { - // We select CPU strings to match the code in Host.cpp, but we don't use them - // in compiler-rt. const char *CPU = 0; switch (Family) { + case 4: + CPU = "i486"; + break; + case 5: + CPU = "pentium"; + switch (Model) { + case 6: + case 7: + CPU = "k6"; + break; + case 8: + CPU = "k6-2"; + break; + case 9: + case 13: + CPU = "k6-3"; + break; + case 10: + CPU = "geode"; + break; + } + break; + case 6: + if (testFeature(FEATURE_SSE)) { + CPU = "athlon-xp"; + break; + } + CPU = "athlon"; + break; + case 15: + if (testFeature(FEATURE_SSE3)) { + CPU = "k8-sse3"; + break; + } + CPU = "k8"; + break; case 16: CPU = "amdfam10"; - *Type = AMDFAM10H; + *Type = AMDFAM10H; // "amdfam10" switch (Model) { case 2: *Subtype = AMDFAM10H_BARCELONA; @@ -677,7 +781,7 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family, case 25: CPU = "znver3"; *Type = AMDFAM19H; - if ((Model <= 0x0f) || (Model >= 0x20 && Model <= 0x2f) || + if (Model <= 0x0f || (Model >= 0x20 && Model <= 0x2f) || (Model >= 0x30 && Model <= 0x3f) || (Model >= 0x40 && Model <= 0x4f) || (Model >= 0x50 && Model <= 0x5f)) { // Family 19h Models 00h-0Fh (Genesis, Chagall) Zen 3 @@ -701,6 +805,24 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family, break; // "znver4" } break; // family 19h + case 26: + CPU = "znver5"; + *Type = AMDFAM1AH; + if (Model <= 0x77) { + // Models 00h-0Fh (Breithorn). + // Models 10h-1Fh (Breithorn-Dense). + // Models 20h-2Fh (Strix 1). + // Models 30h-37h (Strix 2). + // Models 38h-3Fh (Strix 3). 
+ // Models 40h-4Fh (Granite Ridge). + // Models 50h-5Fh (Weisshorn). + // Models 60h-6Fh (Krackan1). + // Models 70h-77h (Sarlak). + CPU = "znver5"; + *Subtype = AMDFAM1AH_ZNVER5; + break; // "znver5" + } + break; default: break; // Unknown AMD CPU. } @@ -708,6 +830,8 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family, return CPU; } +#undef testFeature + static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, unsigned *Features) { unsigned EAX = 0, EBX = 0; @@ -746,13 +870,15 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, setFeature(FEATURE_AES); if ((ECX >> 29) & 1) setFeature(FEATURE_F16C); + if ((ECX >> 30) & 1) + setFeature(FEATURE_RDRND); // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV // indicates that the AVX registers will be saved and restored on context // switch, then we have full AVX support. const unsigned AVXBits = (1 << 27) | (1 << 28); - bool HasAVX = ((ECX & AVXBits) == AVXBits) && !getX86XCR0(&EAX, &EDX) && - ((EAX & 0x6) == 0x6); + bool HasAVXSave = ((ECX & AVXBits) == AVXBits) && !getX86XCR0(&EAX, &EDX) && + ((EAX & 0x6) == 0x6); #if defined(__APPLE__) // Darwin lazily saves the AVX512 context on first use: trust that the OS will // save the AVX512 context if we use AVX512 instructions, even the bit is not @@ -760,45 +886,76 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, bool HasAVX512Save = true; #else // AVX512 requires additional context to be saved by the OS. - bool HasAVX512Save = HasAVX && ((EAX & 0xe0) == 0xe0); + bool HasAVX512Save = HasAVXSave && ((EAX & 0xe0) == 0xe0); #endif + // AMX requires additional context to be saved by the OS. + const unsigned AMXBits = (1 << 17) | (1 << 18); + bool HasXSave = ((ECX >> 27) & 1) && !getX86XCR0(&EAX, &EDX); + bool HasAMXSave = HasXSave && ((EAX & AMXBits) == AMXBits); - if (HasAVX) + if (HasAVXSave) setFeature(FEATURE_AVX); + if (((ECX >> 26) & 1) && HasAVXSave) + setFeature(FEATURE_XSAVE); + bool HasLeaf7 = MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX); + if (HasLeaf7 && ((EBX >> 0) & 1)) + setFeature(FEATURE_FSGSBASE); + if (HasLeaf7 && ((EBX >> 2) & 1)) + setFeature(FEATURE_SGX); if (HasLeaf7 && ((EBX >> 3) & 1)) setFeature(FEATURE_BMI); - if (HasLeaf7 && ((EBX >> 5) & 1) && HasAVX) + if (HasLeaf7 && ((EBX >> 5) & 1) && HasAVXSave) setFeature(FEATURE_AVX2); if (HasLeaf7 && ((EBX >> 8) & 1)) setFeature(FEATURE_BMI2); + if (HasLeaf7 && ((EBX >> 11) & 1)) + setFeature(FEATURE_RTM); if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512F); if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512DQ); + if (HasLeaf7 && ((EBX >> 18) & 1)) + setFeature(FEATURE_RDSEED); + if (HasLeaf7 && ((EBX >> 19) & 1)) + setFeature(FEATURE_ADX); if (HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512IFMA); + if (HasLeaf7 && ((EBX >> 24) & 1)) + setFeature(FEATURE_CLWB); if (HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512PF); if (HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512ER); if (HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512CD); + if (HasLeaf7 && ((EBX >> 29) & 1)) + setFeature(FEATURE_SHA); if (HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512BW); if (HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512VL); + if (HasLeaf7 && ((ECX >> 0) & 1)) + setFeature(FEATURE_PREFETCHWT1); if (HasLeaf7 
&& ((ECX >> 1) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512VBMI); + if (HasLeaf7 && ((ECX >> 4) & 1)) + setFeature(FEATURE_PKU); + if (HasLeaf7 && ((ECX >> 5) & 1)) + setFeature(FEATURE_WAITPKG); if (HasLeaf7 && ((ECX >> 6) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512VBMI2); + if (HasLeaf7 && ((ECX >> 7) & 1)) + setFeature(FEATURE_SHSTK); if (HasLeaf7 && ((ECX >> 8) & 1)) setFeature(FEATURE_GFNI); - if (HasLeaf7 && ((ECX >> 10) & 1) && HasAVX) + if (HasLeaf7 && ((ECX >> 9) & 1) && HasAVXSave) + setFeature(FEATURE_VAES); + if (HasLeaf7 && ((ECX >> 10) & 1) && HasAVXSave) setFeature(FEATURE_VPCLMULQDQ); if (HasLeaf7 && ((ECX >> 11) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512VNNI); @@ -806,23 +963,100 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, setFeature(FEATURE_AVX512BITALG); if (HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512VPOPCNTDQ); + if (HasLeaf7 && ((ECX >> 22) & 1)) + setFeature(FEATURE_RDPID); + if (HasLeaf7 && ((ECX >> 23) & 1)) + setFeature(FEATURE_KL); + if (HasLeaf7 && ((ECX >> 25) & 1)) + setFeature(FEATURE_CLDEMOTE); + if (HasLeaf7 && ((ECX >> 27) & 1)) + setFeature(FEATURE_MOVDIRI); + if (HasLeaf7 && ((ECX >> 28) & 1)) + setFeature(FEATURE_MOVDIR64B); + if (HasLeaf7 && ((ECX >> 29) & 1)) + setFeature(FEATURE_ENQCMD); if (HasLeaf7 && ((EDX >> 2) & 1) && HasAVX512Save) setFeature(FEATURE_AVX5124VNNIW); if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save) setFeature(FEATURE_AVX5124FMAPS); + if (HasLeaf7 && ((EDX >> 5) & 1)) + setFeature(FEATURE_UINTR); if (HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512VP2INTERSECT); + if (HasLeaf7 && ((EDX >> 14) & 1)) + setFeature(FEATURE_SERIALIZE); + if (HasLeaf7 && ((EDX >> 16) & 1)) + setFeature(FEATURE_TSXLDTRK); + if (HasLeaf7 && ((EDX >> 18) & 1)) + setFeature(FEATURE_PCONFIG); + if (HasLeaf7 && ((EDX >> 22) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_BF16); if (HasLeaf7 && ((EDX >> 23) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512FP16); + if (HasLeaf7 && ((EDX >> 24) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_TILE); + if (HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_INT8); // EAX from subleaf 0 is the maximum subleaf supported. Some CPUs don't // return all 0s for invalid subleaves so check the limit. 
@@ -806,23 +963,100 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
     setFeature(FEATURE_AVX512BITALG);
   if (HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save)
     setFeature(FEATURE_AVX512VPOPCNTDQ);
+  if (HasLeaf7 && ((ECX >> 22) & 1))
+    setFeature(FEATURE_RDPID);
+  if (HasLeaf7 && ((ECX >> 23) & 1))
+    setFeature(FEATURE_KL);
+  if (HasLeaf7 && ((ECX >> 25) & 1))
+    setFeature(FEATURE_CLDEMOTE);
+  if (HasLeaf7 && ((ECX >> 27) & 1))
+    setFeature(FEATURE_MOVDIRI);
+  if (HasLeaf7 && ((ECX >> 28) & 1))
+    setFeature(FEATURE_MOVDIR64B);
+  if (HasLeaf7 && ((ECX >> 29) & 1))
+    setFeature(FEATURE_ENQCMD);
   if (HasLeaf7 && ((EDX >> 2) & 1) && HasAVX512Save)
     setFeature(FEATURE_AVX5124VNNIW);
   if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save)
     setFeature(FEATURE_AVX5124FMAPS);
+  if (HasLeaf7 && ((EDX >> 5) & 1))
+    setFeature(FEATURE_UINTR);
   if (HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save)
     setFeature(FEATURE_AVX512VP2INTERSECT);
+  if (HasLeaf7 && ((EDX >> 14) & 1))
+    setFeature(FEATURE_SERIALIZE);
+  if (HasLeaf7 && ((EDX >> 16) & 1))
+    setFeature(FEATURE_TSXLDTRK);
+  if (HasLeaf7 && ((EDX >> 18) & 1))
+    setFeature(FEATURE_PCONFIG);
+  if (HasLeaf7 && ((EDX >> 22) & 1) && HasAMXSave)
+    setFeature(FEATURE_AMX_BF16);
   if (HasLeaf7 && ((EDX >> 23) & 1) && HasAVX512Save)
     setFeature(FEATURE_AVX512FP16);
+  if (HasLeaf7 && ((EDX >> 24) & 1) && HasAMXSave)
+    setFeature(FEATURE_AMX_TILE);
+  if (HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave)
+    setFeature(FEATURE_AMX_INT8);

   // EAX from subleaf 0 is the maximum subleaf supported. Some CPUs don't
   // return all 0s for invalid subleaves so check the limit.
   bool HasLeaf7Subleaf1 = HasLeaf7 && EAX >= 1 &&
       !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX);
+  if (HasLeaf7Subleaf1 && ((EAX >> 0) & 1))
+    setFeature(FEATURE_SHA512);
+  if (HasLeaf7Subleaf1 && ((EAX >> 1) & 1))
+    setFeature(FEATURE_SM3);
+  if (HasLeaf7Subleaf1 && ((EAX >> 2) & 1))
+    setFeature(FEATURE_SM4);
+  if (HasLeaf7Subleaf1 && ((EAX >> 3) & 1))
+    setFeature(FEATURE_RAOINT);
+  if (HasLeaf7Subleaf1 && ((EAX >> 4) & 1) && HasAVXSave)
+    setFeature(FEATURE_AVXVNNI);
   if (HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save)
     setFeature(FEATURE_AVX512BF16);
+  if (HasLeaf7Subleaf1 && ((EAX >> 7) & 1))
+    setFeature(FEATURE_CMPCCXADD);
+  if (HasLeaf7Subleaf1 && ((EAX >> 21) & 1) && HasAMXSave)
+    setFeature(FEATURE_AMX_FP16);
+  if (HasLeaf7Subleaf1 && ((EAX >> 22) & 1))
+    setFeature(FEATURE_HRESET);
+  if (HasLeaf7Subleaf1 && ((EAX >> 23) & 1) && HasAVXSave)
+    setFeature(FEATURE_AVXIFMA);
+
+  if (HasLeaf7Subleaf1 && ((EDX >> 4) & 1) && HasAVXSave)
+    setFeature(FEATURE_AVXVNNIINT8);
+  if (HasLeaf7Subleaf1 && ((EDX >> 5) & 1) && HasAVXSave)
+    setFeature(FEATURE_AVXNECONVERT);
+  if (HasLeaf7Subleaf1 && ((EDX >> 8) & 1) && HasAMXSave)
+    setFeature(FEATURE_AMX_COMPLEX);
+  if (HasLeaf7Subleaf1 && ((EDX >> 10) & 1) && HasAVXSave)
+    setFeature(FEATURE_AVXVNNIINT16);
+  if (HasLeaf7Subleaf1 && ((EDX >> 14) & 1))
+    setFeature(FEATURE_PREFETCHI);
+  if (HasLeaf7Subleaf1 && ((EDX >> 15) & 1))
+    setFeature(FEATURE_USERMSR);
+  if (HasLeaf7Subleaf1 && ((EDX >> 19) & 1))
+    setFeature(FEATURE_AVX10_1_256);
+  if (HasLeaf7Subleaf1 && ((EDX >> 21) & 1))
+    setFeature(FEATURE_APXF);
+
+  unsigned MaxLevel;
+  getX86CpuIDAndInfo(0, &MaxLevel, &EBX, &ECX, &EDX);
+  bool HasLeafD = MaxLevel >= 0xd &&
+                  !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
+  if (HasLeafD && ((EAX >> 0) & 1) && HasAVXSave)
+    setFeature(FEATURE_XSAVEOPT);
+  if (HasLeafD && ((EAX >> 1) & 1) && HasAVXSave)
+    setFeature(FEATURE_XSAVEC);
+  if (HasLeafD && ((EAX >> 3) & 1) && HasAVXSave)
+    setFeature(FEATURE_XSAVES);
+
+  bool HasLeaf24 =
+      MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX);
+  if (HasLeaf7Subleaf1 && ((EDX >> 19) & 1) && HasLeaf24 && ((EBX >> 18) & 1))
+    setFeature(FEATURE_AVX10_1_512);

   unsigned MaxExtLevel;
   getX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
@@ -836,14 +1070,40 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
     setFeature(FEATURE_LZCNT);
   if (((ECX >> 6) & 1))
     setFeature(FEATURE_SSE4_A);
+  if (((ECX >> 8) & 1))
+    setFeature(FEATURE_PRFCHW);
   if (((ECX >> 11) & 1))
     setFeature(FEATURE_XOP);
+  if (((ECX >> 15) & 1))
+    setFeature(FEATURE_LWP);
   if (((ECX >> 16) & 1))
     setFeature(FEATURE_FMA4);
+  if (((ECX >> 21) & 1))
+    setFeature(FEATURE_TBM);
+  if (((ECX >> 29) & 1))
+    setFeature(FEATURE_MWAITX);
+
   if (((EDX >> 29) & 1))
     setFeature(FEATURE_LM);
 }

+  bool HasExtLeaf8 = MaxExtLevel >= 0x80000008 &&
+                     !getX86CpuIDAndInfo(0x80000008, &EAX, &EBX, &ECX, &EDX);
+  if (HasExtLeaf8 && ((EBX >> 0) & 1))
+    setFeature(FEATURE_CLZERO);
+  if (HasExtLeaf8 && ((EBX >> 9) & 1))
+    setFeature(FEATURE_WBNOINVD);
+
+  bool HasLeaf14 = MaxLevel >= 0x14 &&
+                   !getX86CpuIDAndInfoEx(0x14, 0x0, &EAX, &EBX, &ECX, &EDX);
+  if (HasLeaf14 && ((EBX >> 4) & 1))
+    setFeature(FEATURE_PTWRITE);
+
+  bool HasLeaf19 =
+      MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX);
+  if (HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1))
+    setFeature(FEATURE_WIDEKL);
+
   if (hasFeature(FEATURE_LM) && hasFeature(FEATURE_SSE2)) {
     setFeature(FEATURE_X86_64_BASELINE);
     if (hasFeature(FEATURE_CMPXCHG16B) && hasFeature(FEATURE_POPCNT) &&
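The x86.c hunk ends here. Downstream, these tables are what GCC and Clang's __builtin_cpu_supports()/__builtin_cpu_is() consult when compiler-rt provides the runtime; a small consumer sketch (the "znver5" name assumes a compiler recent enough to know it):

```c
// Consumer-side view of the feature bits getAvailableFeatures() fills in.
#include <stdio.h>

int main(void) {
  __builtin_cpu_init(); // runs the CPUID probing once
  if (__builtin_cpu_supports("avx512f"))
    puts("AVX-512F usable (CPU bit set and OS saves the state)");
  if (__builtin_cpu_is("znver5"))
    puts("running on an AMD Zen 5 part");
  return 0;
}
```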
diff --git a/contrib/libs/cxxsupp/builtins/divtc3.c b/contrib/libs/cxxsupp/builtins/divtc3.c
index 099de5802da..c393de81533 100644
--- a/contrib/libs/cxxsupp/builtins/divtc3.c
+++ b/contrib/libs/cxxsupp/builtins/divtc3.c
@@ -13,7 +13,7 @@
 #define QUAD_PRECISION
 #include "fp_lib.h"

-#if defined(CRT_HAS_F128)
+#if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128)

 // Returns: the quotient of (a + ib) / (c + id)

diff --git a/contrib/libs/cxxsupp/builtins/extendbfsf2.c b/contrib/libs/cxxsupp/builtins/extendbfsf2.c
new file mode 100644
index 00000000000..e159d7997f6
--- /dev/null
+++ b/contrib/libs/cxxsupp/builtins/extendbfsf2.c
@@ -0,0 +1,13 @@
+//===-- lib/extendbfsf2.c - bfloat -> single conversion -----------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define SRC_BFLOAT16
+#define DST_SINGLE
+#include "fp_extend_impl.inc"
+
+COMPILER_RT_ABI float __extendbfsf2(src_t a) { return __extendXfYf2__(a); }
diff --git a/contrib/libs/cxxsupp/builtins/fp_add_impl.inc b/contrib/libs/cxxsupp/builtins/fp_add_impl.inc
index 7133358df9b..d20599921e7 100644
--- a/contrib/libs/cxxsupp/builtins/fp_add_impl.inc
+++ b/contrib/libs/cxxsupp/builtins/fp_add_impl.inc
@@ -91,7 +91,7 @@ static __inline fp_t __addXf3__(fp_t a, fp_t b) {

   // Shift the significand of b by the difference in exponents, with a sticky
   // bottom bit to get rounding correct.
-  const unsigned int align = aExponent - bExponent;
+  const unsigned int align = (unsigned int)(aExponent - bExponent);
   if (align) {
     if (align < typeWidth) {
       const bool sticky = (bSignificand << (typeWidth - align)) != 0;
diff --git a/contrib/libs/cxxsupp/builtins/fp_extend.h b/contrib/libs/cxxsupp/builtins/fp_extend.h
index 95ea2a7ac4b..22bf2b2514e 100644
--- a/contrib/libs/cxxsupp/builtins/fp_extend.h
+++ b/contrib/libs/cxxsupp/builtins/fp_extend.h
@@ -37,16 +37,7 @@ static const int srcSigFracBits = 52;
 // srcBits - srcSigFracBits - 1
 static const int srcExpBits = 11;

-static inline int src_rep_t_clz_impl(src_rep_t a) {
-#if defined __LP64__
-  return __builtin_clzl(a);
-#else
-  if (a & REP_C(0xffffffff00000000))
-    return clzsi(a >> 32);
-  else
-    return 32 + clzsi(a & REP_C(0xffffffff));
-#endif
-}
+static inline int src_rep_t_clz_impl(src_rep_t a) { return __builtin_clzll(a); }
 #define src_rep_t_clz src_rep_t_clz_impl

 #elif defined SRC_80
@@ -81,6 +72,21 @@ static inline int src_rep_t_clz_impl(src_rep_t a) {

 #define src_rep_t_clz src_rep_t_clz_impl

+#elif defined SRC_BFLOAT16
+#ifdef COMPILER_RT_HAS_BFLOAT16
+typedef __bf16 src_t;
+#else
+typedef uint16_t src_t;
+#endif
+typedef uint16_t src_rep_t;
+#define SRC_REP_C UINT16_C
+static const int srcBits = sizeof(src_t) * CHAR_BIT;
+static const int srcSigFracBits = 7;
+// -1 accounts for the sign bit.
+// srcBits - srcSigFracBits - 1
+static const int srcExpBits = 8;
+#define src_rep_t_clz __builtin_clz
+
 #else
 #error Source should be half, single, or double precision!
 #endif // end source precision
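The new __extendbfsf2 works because bfloat16 is exactly the top half of an IEEE-754 binary32 (1 sign bit, 8 exponent bits, 7 fraction bits), so widening is a lossless 16-bit shift. A stand-alone illustration of that layout, not the builtin itself:

```c
// bfloat16 payload -> float: place the 16 bits in the high half of a
// binary32 pattern and bit-cast.
#include <stdint.h>
#include <string.h>

static float bf16_bits_to_float(uint16_t rep16) {
  uint32_t rep32 = (uint32_t)rep16 << 16; // bf16 is the high half of a float32
  float f;
  memcpy(&f, &rep32, sizeof f); // bit-cast without aliasing violations
  return f;
}
```

Sign, infinities, and NaNs carry over unchanged, which is why this direction needs no special cases; only the narrowing direction has to round.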
diff --git a/contrib/libs/cxxsupp/builtins/fp_fixint_impl.inc b/contrib/libs/cxxsupp/builtins/fp_fixint_impl.inc
index 3556bad9990..2f2f77ce781 100644
--- a/contrib/libs/cxxsupp/builtins/fp_fixint_impl.inc
+++ b/contrib/libs/cxxsupp/builtins/fp_fixint_impl.inc
@@ -34,7 +34,7 @@ static __inline fixint_t __fixint(fp_t a) {
   // If 0 <= exponent < significandBits, right shift to get the result.
   // Otherwise, shift left.
   if (exponent < significandBits)
-    return sign * (significand >> (significandBits - exponent));
+    return (fixint_t)(sign * (significand >> (significandBits - exponent)));
   else
-    return sign * ((fixuint_t)significand << (exponent - significandBits));
+    return (fixint_t)(sign * ((fixuint_t)significand << (exponent - significandBits)));
 }
diff --git a/contrib/libs/cxxsupp/builtins/fp_lib.h b/contrib/libs/cxxsupp/builtins/fp_lib.h
index c4f0a5b9587..b2a89506135 100644
--- a/contrib/libs/cxxsupp/builtins/fp_lib.h
+++ b/contrib/libs/cxxsupp/builtins/fp_lib.h
@@ -43,8 +43,8 @@ static __inline int rep_clz(rep_t a) { return clzsi(a); }
 // 32x32 --> 64 bit multiply
 static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) {
   const uint64_t product = (uint64_t)a * b;
-  *hi = product >> 32;
-  *lo = product;
+  *hi = (rep_t)(product >> 32);
+  *lo = (rep_t)product;
 }

 COMPILER_RT_ABI fp_t __addsf3(fp_t a, fp_t b);
@@ -58,16 +58,7 @@ typedef double fp_t;
 #define REP_C UINT64_C
 #define significandBits 52

-static __inline int rep_clz(rep_t a) {
-#if defined __LP64__
-  return __builtin_clzl(a);
-#else
-  if (a & REP_C(0xffffffff00000000))
-    return clzsi(a >> 32);
-  else
-    return 32 + clzsi(a & REP_C(0xffffffff));
-#endif
-}
+static inline int rep_clz(rep_t a) { return __builtin_clzll(a); }

 #define loWord(a) (a & 0xffffffffU)
 #define hiWord(a) (a >> 32)
@@ -239,7 +230,7 @@ static __inline int normalize(rep_t *significand) {
   return 1 - shift;
 }

-static __inline void wideLeftShift(rep_t *hi, rep_t *lo, int count) {
+static __inline void wideLeftShift(rep_t *hi, rep_t *lo, unsigned int count) {
   *hi = *hi << count | *lo >> (typeWidth - count);
   *lo = *lo << count;
 }
diff --git a/contrib/libs/cxxsupp/builtins/int_types.h b/contrib/libs/cxxsupp/builtins/int_types.h
index ca97391fc28..48862f36421 100644
--- a/contrib/libs/cxxsupp/builtins/int_types.h
+++ b/contrib/libs/cxxsupp/builtins/int_types.h
@@ -107,8 +107,8 @@ typedef union {
 static __inline ti_int make_ti(di_int h, di_int l) {
   twords r;
-  r.s.high = h;
-  r.s.low = l;
+  r.s.high = (du_int)h;
+  r.s.low = (du_int)l;
   return r.all;
 }
diff --git a/contrib/libs/cxxsupp/builtins/multc3.c b/contrib/libs/cxxsupp/builtins/multc3.c
index 61a3f45e472..a89832f0e88 100644
--- a/contrib/libs/cxxsupp/builtins/multc3.c
+++ b/contrib/libs/cxxsupp/builtins/multc3.c
@@ -15,7 +15,7 @@
 #include "int_lib.h"
 #include "int_math.h"

-#if defined(CRT_HAS_F128)
+#if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128)

 // Returns: the product of a + ib and c + id

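The fp_lib.h change above only makes an existing narrowing explicit; the single-precision wideMultiply splits a 64-bit product into halves, as in this stand-alone sketch:

```c
// 32x32 -> 64 bit multiply, split into high and low words, with the
// narrowing casts spelled out as in the patched wideMultiply.
#include <stdint.h>

static void wide_multiply32(uint32_t a, uint32_t b,
                            uint32_t *hi, uint32_t *lo) {
  const uint64_t product = (uint64_t)a * b;
  *hi = (uint32_t)(product >> 32); // upper 32 bits of the product
  *lo = (uint32_t)product;         // lower 32 bits, truncation made explicit
}
```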
diff --git a/contrib/libs/cxxsupp/builtins/os_version_check.c b/contrib/libs/cxxsupp/builtins/os_version_check.c
index 182eabe7a6a..01fae834ab2 100644
--- a/contrib/libs/cxxsupp/builtins/os_version_check.c
+++ b/contrib/libs/cxxsupp/builtins/os_version_check.c
@@ -316,8 +316,8 @@ int32_t __isOSVersionAtLeast(int32_t Major, int32_t Minor, int32_t Subminor) {
   static pthread_once_t once = PTHREAD_ONCE_INIT;
   pthread_once(&once, readSystemProperties);

-  return SdkVersion >= Major ||
-         (IsPreRelease && Major == __ANDROID_API_FUTURE__);
+  // Allow all on pre-release. Note that we still rely on compile-time checks.
+  return SdkVersion >= Major || IsPreRelease;
 }

 #else
diff --git a/contrib/libs/cxxsupp/builtins/riscv/restore.S b/contrib/libs/cxxsupp/builtins/riscv/restore.S
index 6f43842c8ca..d87dfc1ac71 100644
--- a/contrib/libs/cxxsupp/builtins/riscv/restore.S
+++ b/contrib/libs/cxxsupp/builtins/riscv/restore.S
@@ -22,7 +22,7 @@

 #if __riscv_xlen == 32

-#ifndef __riscv_32e
+#ifndef __riscv_abi_rve

   .globl  __riscv_restore_12
   .type   __riscv_restore_12,@function
@@ -109,7 +109,7 @@ __riscv_restore_0:

 #elif __riscv_xlen == 64

-#ifndef __riscv_64e
+#ifndef __riscv_abi_rve

   .globl  __riscv_restore_12
   .type   __riscv_restore_12,@function
diff --git a/contrib/libs/cxxsupp/builtins/riscv/save.S b/contrib/libs/cxxsupp/builtins/riscv/save.S
index 3e044179ff7..6324e05e971 100644
--- a/contrib/libs/cxxsupp/builtins/riscv/save.S
+++ b/contrib/libs/cxxsupp/builtins/riscv/save.S
@@ -18,7 +18,7 @@

 #if __riscv_xlen == 32

-#ifndef __riscv_32e
+#ifndef __riscv_abi_rve

   .globl  __riscv_save_12
   .type   __riscv_save_12,@function
@@ -115,7 +115,7 @@ __riscv_save_0:

 #elif __riscv_xlen == 64

-#ifndef __riscv_64e
+#ifndef __riscv_abi_rve

   .globl  __riscv_save_12
   .type   __riscv_save_12,@function
diff --git a/contrib/libs/cxxsupp/builtins/trampoline_setup.c b/contrib/libs/cxxsupp/builtins/trampoline_setup.c
index 844eb279441..830e25e4c03 100644
--- a/contrib/libs/cxxsupp/builtins/trampoline_setup.c
+++ b/contrib/libs/cxxsupp/builtins/trampoline_setup.c
@@ -41,3 +41,45 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
   __clear_cache(trampOnStack, &trampOnStack[10]);
 }
 #endif // __powerpc__ && !defined(__powerpc64__)
+
+// The AArch64 compiler generates calls to __trampoline_setup() when creating
+// trampoline functions on the stack for use with nested functions.
+// This function creates a custom 36-byte trampoline function on the stack
+// which loads x18 with a pointer to the outer function's locals
+// and then jumps to the target nested function.
+// Note: x18 is a reserved platform register on Windows and macOS.
+
+#if defined(__aarch64__) && defined(__ELF__)
+COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
+                                        int trampSizeAllocated,
+                                        const void *realFunc, void *localsPtr) {
+  // This should never happen, but if the compiler did not allocate
+  // enough space on the stack for the trampoline, abort.
+  if (trampSizeAllocated < 36)
+    compilerrt_abort();
+
+  // Create the trampoline.
+  // Load realFunc into x17. mov/movk 16 bits at a time.
+  trampOnStack[0] =
+      0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11;
+  trampOnStack[1] =
+      0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11;
+  trampOnStack[2] =
+      0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11;
+  trampOnStack[3] =
+      0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11;
+  // Load localsPtr into x18.
+  trampOnStack[4] =
+      0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12;
+  trampOnStack[5] =
+      0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12;
+  trampOnStack[6] =
+      0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12;
+  trampOnStack[7] =
+      0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12;
+  trampOnStack[8] = 0xd61f0220; // br x17
+
+  // Clear the instruction cache.
+  __clear_cache(trampOnStack, &trampOnStack[9]);
+}
+#endif // defined(__aarch64__) && defined(__ELF__)
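The magic constants in the trampoline are MOVZ/MOVK encodings: the 16-bit immediate sits at bits [20:5] and the destination register number at bits [4:0], while the selector for which 16-bit slice to write is baked into each base opcode (0xd2800000 is MOVZ; 0xf2a00000, 0xf2c00000, 0xf2e00000 are MOVK with LSL #16, #32, #48). A sketch of the shared pattern as a hypothetical helper, not present in the source:

```c
// Compose one MOVZ/MOVK word: base opcode, 16-bit slice of value, register.
#include <stdint.h>

static uint32_t encode_mov16(uint32_t base_opcode, uint64_t value,
                             unsigned shift, unsigned reg) {
  return base_opcode | ((uint32_t)((value >> shift) & 0xffffu) << 5) | reg;
}

// encode_mov16(0xd2800000u, (uint64_t)realFunc, 0, 17) reproduces
// trampOnStack[0] above; 17 and 18 are the x17/x18 register numbers
// (the 0x11 and 0x12 in the original constants).
```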
diff --git a/contrib/libs/cxxsupp/builtins/ya.make b/contrib/libs/cxxsupp/builtins/ya.make
index 5f9c60552aa..ae250c5db79 100644
--- a/contrib/libs/cxxsupp/builtins/ya.make
+++ b/contrib/libs/cxxsupp/builtins/ya.make
@@ -12,9 +12,9 @@ LICENSE(

 LICENSE_TEXTS(.yandex_meta/licenses.list.txt)

-VERSION(18.1.8)
+VERSION(19.1.3)

-ORIGINAL_SOURCE(https://github.com/llvm/llvm-project/releases/download/llvmorg-18.1.8/compiler-rt-18.1.8.src.tar.xz)
+ORIGINAL_SOURCE(https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.3/compiler-rt-19.1.3.src.tar.xz)

 NO_COMPILER_WARNINGS()

@@ -65,7 +65,9 @@ IF (ARCH_AARCH64)
         aarch64/chkstk.S
         aarch64/fp_mode.c
         aarch64/sme-abi-init.c
+        aarch64/sme-abi-vg.c
         aarch64/sme-abi.S
+        aarch64/sme-libc-mem-routines.S
         absvdi2.c
         absvsi2.c
         absvti2.c
@@ -117,6 +119,7 @@ IF (ARCH_AARCH64)
         emutls.c
         enable_execute_stack.c
         eprintf.c
+        extendbfsf2.c
         extenddftf2.c
         extendhfsf2.c
         extendhftf2.c
@@ -284,6 +287,7 @@ ELSEIF (ARCH_X86_64)
         emutls.c
         enable_execute_stack.c
         eprintf.c
+        extendbfsf2.c
         extenddftf2.c
         extendhfsf2.c
         extendhftf2.c
@@ -467,6 +471,7 @@ ELSE()
         emutls.c
         enable_execute_stack.c
         eprintf.c
+        extendbfsf2.c
         extenddftf2.c
         extendhfsf2.c
         extendhftf2.c