aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorthegeorg <thegeorg@yandex-team.com>2023-08-22 18:56:30 +0300
committerthegeorg <thegeorg@yandex-team.com>2023-08-22 19:13:38 +0300
commit769d14120ef8e30363c7dd6870ce1b82552587c3 (patch)
treec407d1d3f152b9f6eb13f50abc3f5b06db82f9b3
parent494eee7cbbaf3e7d71a133c80c96aec26e518c2a (diff)
downloadydb-769d14120ef8e30363c7dd6870ce1b82552587c3.tar.gz
Extract asmlib manipulations into separate block
-rw-r--r--contrib/libs/CMakeLists.darwin-x86_64.txt2
-rw-r--r--contrib/libs/CMakeLists.linux-aarch64.txt1
-rw-r--r--contrib/libs/CMakeLists.linux-x86_64.txt1
-rw-r--r--contrib/libs/CMakeLists.windows-x86_64.txt1
-rw-r--r--contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt13
-rw-r--r--contrib/libs/asmglibc/CMakeLists.txt11
-rw-r--r--contrib/libs/asmglibc/memchr.S330
-rw-r--r--contrib/libs/asmglibc/sysdep.h12
-rw-r--r--contrib/libs/asmglibc/ya.make17
-rw-r--r--contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt192
-rw-r--r--contrib/libs/asmlib/CMakeLists.linux-aarch64.txt16
-rw-r--r--contrib/libs/asmlib/CMakeLists.linux-x86_64.txt216
-rw-r--r--contrib/libs/asmlib/CMakeLists.txt17
-rw-r--r--contrib/libs/asmlib/CMakeLists.windows-x86_64.txt213
-rw-r--r--contrib/libs/asmlib/cachesize64.asm335
-rw-r--r--contrib/libs/asmlib/cpuid64.asm55
-rw-r--r--contrib/libs/asmlib/cputype64.asm127
-rw-r--r--contrib/libs/asmlib/debugbreak64.asm33
-rw-r--r--contrib/libs/asmlib/defs.asm22
-rw-r--r--contrib/libs/asmlib/dispatchpatch64.asm303
-rw-r--r--contrib/libs/asmlib/divfixedi64.asm173
-rw-r--r--contrib/libs/asmlib/divfixedv64.asm498
-rw-r--r--contrib/libs/asmlib/dummy.c0
-rw-r--r--contrib/libs/asmlib/instrset64.asm184
-rw-r--r--contrib/libs/asmlib/memcmp64.asm295
-rw-r--r--contrib/libs/asmlib/memcpy64.asm1332
-rw-r--r--contrib/libs/asmlib/memmove64.asm1090
-rw-r--r--contrib/libs/asmlib/memset64.asm372
-rw-r--r--contrib/libs/asmlib/mersenne64.asm616
-rw-r--r--contrib/libs/asmlib/mother64.asm242
-rw-r--r--contrib/libs/asmlib/physseed64.asm396
-rw-r--r--contrib/libs/asmlib/popcount64.asm112
-rw-r--r--contrib/libs/asmlib/procname64.asm145
-rw-r--r--contrib/libs/asmlib/randomah.asi290
-rw-r--r--contrib/libs/asmlib/rdtsc64.asm53
-rw-r--r--contrib/libs/asmlib/round64.asm40
-rw-r--r--contrib/libs/asmlib/sfmt64.asm889
-rw-r--r--contrib/libs/asmlib/strcat64.asm70
-rw-r--r--contrib/libs/asmlib/strcpy64.asm66
-rw-r--r--contrib/libs/asmlib/stricmp64.asm86
-rw-r--r--contrib/libs/asmlib/strlen64.asm86
-rw-r--r--contrib/libs/asmlib/substring64.asm75
-rw-r--r--contrib/libs/asmlib/unalignedisfaster64.asm188
-rw-r--r--contrib/libs/asmlib/ya.make110
-rw-r--r--ydb/apps/ydb/CMakeLists.darwin-x86_64.txt1
-rw-r--r--ydb/apps/ydb/CMakeLists.linux-aarch64.txt1
-rw-r--r--ydb/apps/ydb/CMakeLists.linux-x86_64.txt1
-rw-r--r--ydb/apps/ydb/CMakeLists.windows-x86_64.txt1
-rw-r--r--ydb/apps/ydb/ya.make12
49 files changed, 9 insertions, 9332 deletions
diff --git a/contrib/libs/CMakeLists.darwin-x86_64.txt b/contrib/libs/CMakeLists.darwin-x86_64.txt
index fa9b8be410..d96017ec15 100644
--- a/contrib/libs/CMakeLists.darwin-x86_64.txt
+++ b/contrib/libs/CMakeLists.darwin-x86_64.txt
@@ -8,8 +8,6 @@
add_subdirectory(antlr3_cpp_runtime)
add_subdirectory(apache)
-add_subdirectory(asmglibc)
-add_subdirectory(asmlib)
add_subdirectory(aws-sdk-cpp)
add_subdirectory(base64)
add_subdirectory(brotli)
diff --git a/contrib/libs/CMakeLists.linux-aarch64.txt b/contrib/libs/CMakeLists.linux-aarch64.txt
index 358f97add0..ced3728c58 100644
--- a/contrib/libs/CMakeLists.linux-aarch64.txt
+++ b/contrib/libs/CMakeLists.linux-aarch64.txt
@@ -8,7 +8,6 @@
add_subdirectory(antlr3_cpp_runtime)
add_subdirectory(apache)
-add_subdirectory(asmlib)
add_subdirectory(aws-sdk-cpp)
add_subdirectory(base64)
add_subdirectory(brotli)
diff --git a/contrib/libs/CMakeLists.linux-x86_64.txt b/contrib/libs/CMakeLists.linux-x86_64.txt
index ac47fb1732..a1dec91afd 100644
--- a/contrib/libs/CMakeLists.linux-x86_64.txt
+++ b/contrib/libs/CMakeLists.linux-x86_64.txt
@@ -8,7 +8,6 @@
add_subdirectory(antlr3_cpp_runtime)
add_subdirectory(apache)
-add_subdirectory(asmlib)
add_subdirectory(aws-sdk-cpp)
add_subdirectory(base64)
add_subdirectory(brotli)
diff --git a/contrib/libs/CMakeLists.windows-x86_64.txt b/contrib/libs/CMakeLists.windows-x86_64.txt
index 99a7d95650..0c3e7223c9 100644
--- a/contrib/libs/CMakeLists.windows-x86_64.txt
+++ b/contrib/libs/CMakeLists.windows-x86_64.txt
@@ -8,7 +8,6 @@
add_subdirectory(antlr3_cpp_runtime)
add_subdirectory(apache)
-add_subdirectory(asmlib)
add_subdirectory(aws-sdk-cpp)
add_subdirectory(base64)
add_subdirectory(brotli)
diff --git a/contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt b/contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt
deleted file mode 100644
index e2b4e37fbb..0000000000
--- a/contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-
-add_library(contrib-libs-asmglibc)
-target_sources(contrib-libs-asmglibc PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmglibc/memchr.S
-)
diff --git a/contrib/libs/asmglibc/CMakeLists.txt b/contrib/libs/asmglibc/CMakeLists.txt
deleted file mode 100644
index 661b6431cc..0000000000
--- a/contrib/libs/asmglibc/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
- include(CMakeLists.darwin-x86_64.txt)
-endif()
diff --git a/contrib/libs/asmglibc/memchr.S b/contrib/libs/asmglibc/memchr.S
deleted file mode 100644
index b0a51115c4..0000000000
--- a/contrib/libs/asmglibc/memchr.S
+++ /dev/null
@@ -1,330 +0,0 @@
-/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include "sysdep.h"
-
-#ifdef USE_AS_WMEMCHR
-# define MEMCHR wmemchr
-# define PCMPEQ pcmpeqd
-#else
-# define MEMCHR memchr
-# define PCMPEQ pcmpeqb
-#endif
-
-/* fast SSE2 version with using pmaxub and 64 byte loop */
-
- .text
-ENTRY(MEMCHR)
- movd %esi, %xmm1
- mov %edi, %ecx
-
-#ifdef USE_AS_WMEMCHR
- test %rdx, %rdx
- jz L(return_null)
- shl $2, %rdx
-#else
- punpcklbw %xmm1, %xmm1
- test %rdx, %rdx
- jz L(return_null)
- punpcklbw %xmm1, %xmm1
-#endif
-
- and $63, %ecx
- pshufd $0, %xmm1, %xmm1
-
- cmp $48, %ecx
- ja L(crosscache)
-
- movdqu (%rdi), %xmm0
- PCMPEQ %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
-
- jnz L(matches_1)
- sub $16, %rdx
- jbe L(return_null)
- add $16, %rdi
- and $15, %ecx
- and $-16, %rdi
- add %rcx, %rdx
- sub $64, %rdx
- jbe L(exit_loop)
- jmp L(loop_prolog)
-
- .p2align 4
-L(crosscache):
- and $15, %ecx
- and $-16, %rdi
- movdqa (%rdi), %xmm0
-
- PCMPEQ %xmm1, %xmm0
-/* Check if there is a match. */
- pmovmskb %xmm0, %eax
-/* Remove the leading bytes. */
- sar %cl, %eax
- test %eax, %eax
- je L(unaligned_no_match)
-/* Check which byte is a match. */
- bsf %eax, %eax
-
- sub %rax, %rdx
- jbe L(return_null)
- add %rdi, %rax
- add %rcx, %rax
- ret
-
- .p2align 4
-L(unaligned_no_match):
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
- "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
- possible addition overflow. */
- neg %rcx
- add $16, %rcx
- sub %rcx, %rdx
- jbe L(return_null)
- add $16, %rdi
- sub $64, %rdx
- jbe L(exit_loop)
-
- .p2align 4
-L(loop_prolog):
- movdqa (%rdi), %xmm0
- PCMPEQ %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches)
-
- movdqa 16(%rdi), %xmm2
- PCMPEQ %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa 32(%rdi), %xmm3
- PCMPEQ %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 48(%rdi), %xmm4
- PCMPEQ %xmm1, %xmm4
- add $64, %rdi
- pmovmskb %xmm4, %eax
- test %eax, %eax
- jnz L(matches0)
-
- test $0x3f, %rdi
- jz L(align64_loop)
-
- sub $64, %rdx
- jbe L(exit_loop)
-
- movdqa (%rdi), %xmm0
- PCMPEQ %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches)
-
- movdqa 16(%rdi), %xmm2
- PCMPEQ %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa 32(%rdi), %xmm3
- PCMPEQ %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 48(%rdi), %xmm3
- PCMPEQ %xmm1, %xmm3
- pmovmskb %xmm3, %eax
-
- add $64, %rdi
- test %eax, %eax
- jnz L(matches0)
-
- mov %rdi, %rcx
- and $-64, %rdi
- and $63, %ecx
- add %rcx, %rdx
-
- .p2align 4
-L(align64_loop):
- sub $64, %rdx
- jbe L(exit_loop)
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm2
- movdqa 32(%rdi), %xmm3
- movdqa 48(%rdi), %xmm4
-
- PCMPEQ %xmm1, %xmm0
- PCMPEQ %xmm1, %xmm2
- PCMPEQ %xmm1, %xmm3
- PCMPEQ %xmm1, %xmm4
-
- pmaxub %xmm0, %xmm3
- pmaxub %xmm2, %xmm4
- pmaxub %xmm3, %xmm4
- pmovmskb %xmm4, %eax
-
- add $64, %rdi
-
- test %eax, %eax
- jz L(align64_loop)
-
- sub $64, %rdi
-
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches)
-
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa 32(%rdi), %xmm3
- PCMPEQ %xmm1, %xmm3
-
- PCMPEQ 48(%rdi), %xmm1
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32)
-
- pmovmskb %xmm1, %eax
- bsf %eax, %eax
- lea 48(%rdi, %rax), %rax
- ret
-
- .p2align 4
-L(exit_loop):
- add $32, %edx
- jle L(exit_loop_32)
-
- movdqa (%rdi), %xmm0
- PCMPEQ %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches)
-
- movdqa 16(%rdi), %xmm2
- PCMPEQ %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa 32(%rdi), %xmm3
- PCMPEQ %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32_1)
- sub $16, %edx
- jle L(return_null)
-
- PCMPEQ 48(%rdi), %xmm1
- pmovmskb %xmm1, %eax
- test %eax, %eax
- jnz L(matches48_1)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(exit_loop_32):
- add $32, %edx
- movdqa (%rdi), %xmm0
- PCMPEQ %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches_1)
- sub $16, %edx
- jbe L(return_null)
-
- PCMPEQ 16(%rdi), %xmm1
- pmovmskb %xmm1, %eax
- test %eax, %eax
- jnz L(matches16_1)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(matches0):
- bsf %eax, %eax
- lea -16(%rax, %rdi), %rax
- ret
-
- .p2align 4
-L(matches):
- bsf %eax, %eax
- add %rdi, %rax
- ret
-
- .p2align 4
-L(matches16):
- bsf %eax, %eax
- lea 16(%rax, %rdi), %rax
- ret
-
- .p2align 4
-L(matches32):
- bsf %eax, %eax
- lea 32(%rax, %rdi), %rax
- ret
-
- .p2align 4
-L(matches_1):
- bsf %eax, %eax
- sub %rax, %rdx
- jbe L(return_null)
- add %rdi, %rax
- ret
-
- .p2align 4
-L(matches16_1):
- bsf %eax, %eax
- sub %rax, %rdx
- jbe L(return_null)
- lea 16(%rdi, %rax), %rax
- ret
-
- .p2align 4
-L(matches32_1):
- bsf %eax, %eax
- sub %rax, %rdx
- jbe L(return_null)
- lea 32(%rdi, %rax), %rax
- ret
-
- .p2align 4
-L(matches48_1):
- bsf %eax, %eax
- sub %rax, %rdx
- jbe L(return_null)
- lea 48(%rdi, %rax), %rax
- ret
-
- .p2align 4
-L(return_null):
- xor %eax, %eax
- ret
-END(MEMCHR)
-
-#ifndef USE_AS_WMEMCHR
-strong_alias (memchr, __memchr)
-libc_hidden_builtin_def(memchr)
-#endif \ No newline at end of file
diff --git a/contrib/libs/asmglibc/sysdep.h b/contrib/libs/asmglibc/sysdep.h
deleted file mode 100644
index 1cfb71673e..0000000000
--- a/contrib/libs/asmglibc/sysdep.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#if defined(__APPLE__)
- #define ENTRY(X) .globl _## X; .align 1<<3; _ ## X:
- #define END(X)
- #define L(X) L ## X
-#else
- #define ENTRY(X) .globl X; .type X,@function; .align 1<<4; X: .cfi_startproc;
- #define END(X) .cfi_endproc; .size X,.-X;
- #define L(X) .L ## X
-#endif
-
-#define libc_hidden_builtin_def(X)
-#define strong_alias(X, Y)
diff --git a/contrib/libs/asmglibc/ya.make b/contrib/libs/asmglibc/ya.make
deleted file mode 100644
index c64ea8388a..0000000000
--- a/contrib/libs/asmglibc/ya.make
+++ /dev/null
@@ -1,17 +0,0 @@
-LIBRARY()
-
-LICENSE(LGPL-2.1-or-later)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
-VERSION(2.27)
-
-ORIGINAL_SOURCE(http://ftp.gnu.org/gnu/glibc/)
-
-NO_PLATFORM()
-
-SRCS(
- memchr.S
-)
-
-END()
diff --git a/contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt b/contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt
deleted file mode 100644
index 56e892f3a2..0000000000
--- a/contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt
+++ /dev/null
@@ -1,192 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-
-add_library(contrib-libs-asmlib)
-target_link_libraries(contrib-libs-asmlib PUBLIC
- contrib-libs-asmglibc
-)
-target_sources(contrib-libs-asmlib PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/debugbreak64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cachesize64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedi64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/rdtsc64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcat64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/unalignedisfaster64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcpy64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/substring64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strlen64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cputype64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcmp64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memmove64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/stricmp64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedv64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/physseed64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cpuid64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/round64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcpy64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/popcount64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dispatchpatch64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/procname64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memset64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
diff --git a/contrib/libs/asmlib/CMakeLists.linux-aarch64.txt b/contrib/libs/asmlib/CMakeLists.linux-aarch64.txt
deleted file mode 100644
index d29b43c90a..0000000000
--- a/contrib/libs/asmlib/CMakeLists.linux-aarch64.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-
-add_library(contrib-libs-asmlib)
-target_link_libraries(contrib-libs-asmlib PUBLIC
- contrib-libs-linux-headers
-)
-target_sources(contrib-libs-asmlib PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c
-)
diff --git a/contrib/libs/asmlib/CMakeLists.linux-x86_64.txt b/contrib/libs/asmlib/CMakeLists.linux-x86_64.txt
deleted file mode 100644
index e4b9975e9f..0000000000
--- a/contrib/libs/asmlib/CMakeLists.linux-x86_64.txt
+++ /dev/null
@@ -1,216 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-
-add_library(contrib-libs-asmlib)
-target_link_libraries(contrib-libs-asmlib PUBLIC
- contrib-libs-linux-headers
-)
-target_sources(contrib-libs-asmlib PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/sfmt64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mother64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mersenne64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/debugbreak64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cachesize64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedi64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/rdtsc64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcat64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/unalignedisfaster64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcpy64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/substring64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strlen64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cputype64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcmp64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memmove64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/stricmp64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedv64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/physseed64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cpuid64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/round64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcpy64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/popcount64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dispatchpatch64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/procname64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memset64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
diff --git a/contrib/libs/asmlib/CMakeLists.txt b/contrib/libs/asmlib/CMakeLists.txt
deleted file mode 100644
index f8b31df0c1..0000000000
--- a/contrib/libs/asmlib/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
- include(CMakeLists.linux-aarch64.txt)
-elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
- include(CMakeLists.darwin-x86_64.txt)
-elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
- include(CMakeLists.windows-x86_64.txt)
-elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
- include(CMakeLists.linux-x86_64.txt)
-endif()
diff --git a/contrib/libs/asmlib/CMakeLists.windows-x86_64.txt b/contrib/libs/asmlib/CMakeLists.windows-x86_64.txt
deleted file mode 100644
index 6e1a2adde6..0000000000
--- a/contrib/libs/asmlib/CMakeLists.windows-x86_64.txt
+++ /dev/null
@@ -1,213 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-
-add_library(contrib-libs-asmlib)
-target_sources(contrib-libs-asmlib PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/sfmt64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mother64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mersenne64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/debugbreak64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cachesize64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedi64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/rdtsc64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcat64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/unalignedisfaster64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcpy64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/substring64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strlen64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cputype64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcmp64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memmove64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/stricmp64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedv64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/physseed64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cpuid64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/round64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcpy64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/popcount64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dispatchpatch64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/procname64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
- PRIVATE
- ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memset64.asm
- -I
- ${CMAKE_BINARY_DIR}
- -I
- ${CMAKE_SOURCE_DIR}
-)
diff --git a/contrib/libs/asmlib/cachesize64.asm b/contrib/libs/asmlib/cachesize64.asm
deleted file mode 100644
index c0bce8cf74..0000000000
--- a/contrib/libs/asmlib/cachesize64.asm
+++ /dev/null
@@ -1,335 +0,0 @@
-%include "defs.asm"
-
-;************************* cachesize64.asm *************************************
-; Author: Agner Fog
-; Date created: 2011-07-11
-; Last modified: 2013-08-14
-; Description:
-; Determines the size of the data caches
-;
-; extern "C" size_t DataCacheSize(int level);
-; Input:
-; level: n = 1 - 4: level n data cache
-; 0 = largest level data cache
-; Return value: size in bytes of data cache
-;
-; The latest version of this file is available at:
-; www.agner.org/optimize/asmexamples.zip
-; Copyright (c) 2011-2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global DataCacheSize: function
-
-; Imported from cputype64.asm
-extern CpuType ; near. Determine CPU vendor
-
-struc data_layout ; field offsets relative to dataref
-ok: resd 2 ; first dword is the "already determined" flag
-level1: resq 1 ; offset 8, so the size for level n is at offset n*8
-level2: resq 1
-level3: resq 1
-level4: resq 1
-descriptortable: resd 60 ; NOTE(review): descriptortable_ holds 61 records; only this offset appears to be used
-endstruc
-
-struc descriptor_record ; record for table of cache descriptors
-d_key: resb 1 ; key from cpuid instruction
-d_level: resb 1 ; cache level
-d_sizem: resb 1 ; size multiplier
-d_2pow: resb 1 ; power of 2. size = d_sizem << d_2pow
-endstruc
-
-SECTION .data
-
-dataref: ; reference point
-ok_: DD 0, 0 ; 1 when values are determined
-level1_: DQ 0 ; level 1 data cache size
-level2_: DQ 0 ; level 2 data cache size
-level3_: DQ 0 ; level 3 data cache size
-level4_: DQ 0 ; level 4 data cache size
-numlevels equ 4 ; max level
-
-; From "Intel Processor Identification and the CPUID Instruction", Application note 485
-descriptortable_: ; table of Intel cache descriptors: key, level, size multiplier, power of 2
-db 0Ah, 1, 1, 13 ; 8 kb L1 data cache
-db 0Ch, 1, 1, 14 ; 16 kb L1 data cache
-db 0Dh, 1, 1, 14 ; 16 kb L1 data cache
-db 21h, 2, 1, 18 ; 256 kb L2 data cache
-db 22h, 3, 1, 19 ; 512 kb L3 data cache
-db 23h, 3, 1, 20 ; 1 Mb L3 data cache
-db 25h, 3, 1, 21 ; 2 Mb L3 data cache
-db 29h, 3, 1, 22 ; 4 Mb L3 data cache
-db 2Ch, 1, 1, 15 ; 32 kb L1 data cache
-db 39h, 2, 1, 17 ; 128 kb L2 data cache
-db 3Ah, 2, 3, 16 ; 192 kb L2 data cache
-db 3Bh, 2, 1, 17 ; 128 kb L2 data cache
-db 3Ch, 2, 1, 18 ; 256 kb L2 data cache
-db 3Dh, 2, 3, 17 ; 384 kb L2 data cache
-db 3Eh, 2, 1, 19 ; 512 kb L2 data cache
-db 41h, 2, 1, 17 ; 128 kb L2 data cache
-db 42h, 2, 1, 18 ; 256 kb L2 data cache
-db 43h, 2, 1, 19 ; 512 kb L2 data cache
-db 44h, 2, 1, 20 ; 1 Mb L2 data cache
-db 45h, 2, 1, 21 ; 2 Mb L2 data cache
-db 46h, 3, 1, 22 ; 4 Mb L3 data cache
-db 47h, 3, 1, 23 ; 8 Mb L3 data cache
-db 48h, 2, 3, 20 ; 3 Mb L2 data cache
-db 49h, 2, 1, 22 ; 4 Mb L2 or 3 data cache
-db 4Ah, 3, 3, 21 ; 6 Mb L3 data cache
-db 4Bh, 3, 1, 23 ; 8 Mb L3 data cache
-db 4Ch, 3, 3, 22 ; 12 Mb L3 data cache
-db 4Dh, 3, 1, 24 ; 16 Mb L3 data cache
-db 4Eh, 2, 3, 21 ; 6 Mb L2 data cache
-db 60h, 1, 1, 14 ; 16 kb L1 data cache
-db 66h, 1, 1, 13 ; 8 kb L1 data cache
-db 67h, 1, 1, 14 ; 16 kb L1 data cache
-db 68h, 1, 1, 15 ; 32 kb L1 data cache
-db 78h, 2, 1, 20 ; 1 Mb L2 data cache
-db 79h, 2, 1, 17 ; 128 kb L2 data cache
-db 7Ah, 2, 1, 18 ; 256 kb L2 data cache
-db 7Bh, 2, 1, 19 ; 512 kb L2 data cache
-db 7Ch, 2, 1, 20 ; 1 Mb L2 data cache
-db 7Dh, 2, 1, 21 ; 2 Mb L2 data cache
-db 7Fh, 2, 1, 19 ; 512 kb L2 data cache
-db 82h, 2, 1, 18 ; 256 kb L2 data cache
-db 83h, 2, 1, 19 ; 512 kb L2 data cache
-db 84h, 2, 1, 20 ; 1 Mb L2 data cache
-db 85h, 2, 1, 21 ; 2 Mb L2 data cache
-db 86h, 2, 1, 19 ; 512 kb L2 data cache
-db 87h, 2, 1, 20 ; 1 Mb L2 data cache
-db 0D0h, 3, 1, 19 ; 512 kb L3 data cache
-db 0D1h, 3, 1, 20 ; 1 Mb L3 data cache
-db 0D2h, 3, 1, 21 ; 2 Mb L3 data cache
-db 0D6h, 3, 1, 20 ; 1 Mb L3 data cache
-db 0D7h, 3, 1, 21 ; 2 Mb L3 data cache
-db 0D8h, 3, 1, 22 ; 4 Mb L3 data cache
-db 0DCh, 3, 3, 19 ; 1.5 Mb L3 data cache
-db 0DDh, 3, 3, 20 ; 3 Mb L3 data cache
-db 0DEh, 3, 3, 21 ; 6 Mb L3 data cache
-db 0E2h, 3, 1, 21 ; 2 Mb L3 data cache
-db 0E3h, 3, 1, 22 ; 4 Mb L3 data cache
-db 0E4h, 3, 1, 23 ; 8 Mb L3 data cache
-db 0EAh, 3, 3, 22 ; 12 Mb L3 data cache
-db 0EBh, 3, 9, 21 ; 18 Mb L3 data cache
-db 0ECh, 3, 3, 23 ; 24 Mb L3 data cache
-descriptortablelength equ ($ - descriptortable_) / descriptor_record_size
-
-
-SECTION .text
-
-; extern "C" size_t DataCacheSize(int level);
-
-; Function entry:
-DataCacheSize:
- push rbx ; rbx is overwritten by cpuid in the helper routines
- push r14 ; r14d keeps the level argument across the calls below
-%ifdef WINDOWS
- push rsi
- push rdi
- mov r14d, ecx ; level
-%else ; UNIX
- mov r14d, edi ; level
-%endif
- ; check if called before
- lea r9, [dataref]
- cmp dword [r9+ok], 1 ; ok
- je D800
-
- ; find cpu vendor
- push 0 ; stack slot that receives the vendor id from CpuType
-%ifdef WINDOWS
- mov rcx, rsp
- xor edx, edx
- xor r8d, r8d
-%else ; UNIX
- mov rdi, rsp
- xor esi, esi
- xor edx, edx
-%endif
- call CpuType
- lea r9, [dataref] ; reload pointer after the call
- pop rax ; eax = vendor
- dec eax
- jz Intel
- dec eax
- jz AMD
- dec eax
- jz VIA
- ; unknown vendor, try all methods
- call IntelNewMethod
- jnc D800 ; not carry = success
- call AMDMethod
- jnc D800 ; not carry = success
- call IntelOldMethod
- jmp D800 ; return whether success or not
-
-Intel: call IntelNewMethod
- jnc D800 ; not carry = success
- call IntelOldMethod
- jmp D800 ; return whether success or not
-
-AMD: ; AMD and VIA use same method
-VIA: call AMDMethod
-
-D800: ; cache data known, get desired return value
- xor eax, eax
- cmp r14d, numlevels
- ja D900 ; level out of range: return 0
- cmp r14d, 0
- je D820
- ; level = 1 .. numlevels
- mov rax, [r9 + r14*8] ; size of selected cache (level n is stored at offset n*8)
- jmp D850
-D820: ; level = 0. Get size of largest level cache
- mov rax, [r9 + level3] ; level3
- test rax, rax
- jnz D850
- mov rax, [r9 + level2] ; level2
- test rax, rax
- jnz D850
- mov eax, [r9 + level1] ; level1
-D850: mov dword [r9 + ok], 1 ; remember called, whether success or not
-D900:
-%ifdef WINDOWS
- pop rdi
- pop rsi
-%endif
- pop r14
- pop rbx
- ret
-
-
-; Determine cache sizes by CPUID function 4
-; input: r9 = pointer to dataref (esi is used as the loop counter)
-; output: values returned in dataref + level1 .. level4
-; carry flag = 0 on success
-IntelNewMethod:
- xor eax, eax
- cpuid ; get number of CPUID functions
- cmp eax, 4
- jb I900 ; fail
- xor esi, esi ; loop counter
-I100: mov eax, 4
- mov ecx, esi
- cpuid ; get cache parameters
- mov edx, eax
- and edx, 11111b ; cache type
- jz I500 ; no more caches
- cmp edx, 2
- je I200 ; code cache, ignore
- inc ecx ; sets
- mov edx, ebx
- shr edx, 22
- inc edx ; ways
- imul ecx, edx
- mov edx, ebx
- shr edx, 12
- and edx, 1111111111b
- inc edx ; partitions
- imul ecx, edx
- and ebx, 111111111111b
- inc ebx ; line size
- imul rcx, rbx ; calculated cache size (64 bit)
- shr eax, 5
- and eax, 111b ; cache level
- cmp eax, numlevels
- jna I180
- mov eax, numlevels ; limit higher levels
-I180: mov [r9+rax*8], rcx ; store size of data cache level eax
-I200: inc esi
- cmp esi, 100h ; avoid infinite loop
- jb I100 ; next cache
-I500: ; loop finished
- ; check if OK
- mov eax, [r9+level1] ; level1
- cmp eax, 1024 ; carry is set when the level 1 size is below 1024 bytes
-I900: ret ; carry flag set if fail
-
-; Determine cache sizes by CPUID function 2
-; input: r9 = pointer to dataref
-; output: values returned in dataref + level1, level2, level3
-; carry flag = 0 on success
-IntelOldMethod:
- xor eax, eax
- cpuid ; get number of CPUID functions
- cmp eax, 2
- jb J900 ; fail
- mov eax, 2
- xor ecx, ecx
- cpuid ; get 16 descriptor bytes in eax, ebx, ecx, edx
- mov al, 0 ; al does not contain a descriptor
- sub rsp, 16
- mov [rsp], eax ; save all descriptors
- mov [rsp+4], ebx
- mov [rsp+8], ecx
- mov [rsp+12], edx
- mov edx, 15 ; loop counter
- ; loop to read 16 descriptor bytes
-J100: mov al, byte [rsp+rdx]
- ; find in table
- mov ebx, descriptortablelength-1 ; loop counter
- ; loop to search in descriptortable
-J200: cmp al, [r9 + descriptortable + rbx*4 + d_key]
- jne J300
- ; descriptor found
- movzx eax, byte [r9 + descriptortable + rbx*4 + d_sizem] ; NOTE(review): overwrites al, the key being searched
- mov cl, [r9 + descriptortable + rbx*4 + d_2pow]
- shl eax, cl ; compute size
- movzx ecx, byte [r9 + descriptortable + rbx*4 + d_level]
- ; check that level = 1-3
- cmp ecx, 3
- ja J300
- mov [r9+rcx*8], rax ; store size eax of data cache level ecx
-J300: dec ebx
- jns J200 ; inner loop
- dec edx
- jns J100 ; outer loop
- add rsp, 16 ; remove from stack
- ; check if OK
- mov eax, [r9 + level1]
- cmp eax, 1024 ; carry is set when the level 1 size is below 1024 bytes
-J900: ret ; carry flag set if fail
-
-
-; Determine cache sizes by CPUID function 80000005H - 80000006H
-; input: r9 = pointer to dataref
-; output: values returned in dataref + level1, level2, level3
-; carry flag = 0 on success
-AMDMethod:
- mov eax, 80000000H
- cpuid ; get highest supported extended CPUID function
- cmp eax, 80000006H ; returned value includes the 80000000H base, so compare with the full function number
- jb K900 ; fail
- mov eax, 80000005H
- cpuid ; get L1 cache size
- shr ecx, 24 ; L1 data cache size in kbytes
- shl ecx, 10 ; L1 data cache size in bytes
- mov [r9 + level1], ecx ; store L1 data cache size
- mov eax, 80000006H
- cpuid ; get L2 and L3 cache sizes
- shr ecx, 16 ; L2 data cache size in kbytes
- shl ecx, 10 ; L2 data cache size in bytes
- mov [r9 + level2], ecx ; store L2 data cache size
- mov ecx, edx
- shr ecx, 18 ; L3 data cache size / 512 kbytes
- shl rcx, 19 ; L3 data cache size in bytes
-%if 0 ; AMD manual is unclear:
- ; do we have to increase the value if the number of ways is not a power of 2?
- shr edx, 12
- and edx, 1111b ; L3 associativity
- cmp edx, 3
- jb K100
- test edx, 1
- jz K100
- ; number of ways is not a power of 2, multiply by 1.5 ?
- mov rax, rcx
- shr rax, 1
- add rcx, rax
-%endif
-K100: mov [r9 + level3], rcx ; store L3 data cache size
- ; check if OK
- mov eax, [r9 + level1]
- cmp eax, 1024 ; carry is set when the level 1 size is below 1024 bytes
-K900: ret ; carry flag set if fail
diff --git a/contrib/libs/asmlib/cpuid64.asm b/contrib/libs/asmlib/cpuid64.asm
deleted file mode 100644
index 95f1b5a22d..0000000000
--- a/contrib/libs/asmlib/cpuid64.asm
+++ /dev/null
@@ -1,55 +0,0 @@
-%include "defs.asm"
-
-;************************* cpuid64.asm *********************************
-; Author: Agner Fog
-; Date created: 2008-12-14
-; Last modified: 2011-07-01
-; Source URL: www.agner.org/optimize
-; Project: asmlib.zip
-; Description:
-; This function calls the CPUID instruction.
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global cpuid_ex: function
-
-SECTION .text align=16
-
-; ********** cpuid_ex function **********
-; C++ prototype:
-; extern "C" void cpuid_ex (int abcd[4], int a, int c);
-; Input: a = eax, c = ecx
-; Output: abcd[0] = eax, abcd[1] = ebx, abcd[2] = ecx, abcd[3] = edx
-
-
-cpuid_ex:
-
-%IFDEF WINDOWS
-; parameters: rcx = abcd, edx = a, r8d = c
- push rbx ; cpuid overwrites ebx
- xchg rcx, r8 ; r8 = abcd, ecx = c
- mov eax, edx
- cpuid ; input eax, ecx. output eax, ebx, ecx, edx
- mov [r8], eax
- mov [r8+4], ebx
- mov [r8+8], ecx
- mov [r8+12], edx
- pop rbx
-%ENDIF
-%IFDEF UNIX
-; parameters: rdi = abcd, esi = a, edx = c
- push rbx ; cpuid overwrites ebx
- mov eax, esi
- mov ecx, edx
- cpuid ; input eax, ecx. output eax, ebx, ecx, edx
- mov [rdi], eax
- mov [rdi+4], ebx
- mov [rdi+8], ecx
- mov [rdi+12], edx
- pop rbx
-%ENDIF
- ret
-;cpuid_ex END
diff --git a/contrib/libs/asmlib/cputype64.asm b/contrib/libs/asmlib/cputype64.asm
deleted file mode 100644
index 633ebee86a..0000000000
--- a/contrib/libs/asmlib/cputype64.asm
+++ /dev/null
@@ -1,127 +0,0 @@
-%include "defs.asm"
-
-;************************* cputype64.asm **********************************
-; Author: Agner Fog
-; Date created: 2011-07-09
-; Last modified: 2011-07-09
-; Source URL: www.agner.org/optimize
-; Project: asmlib.zip
-; Language: assembly, NASM/YASM syntax, 64 bit
-;
-; C++ prototype:
-; extern "C" void CpuType(int * vendor, int * family, int * model);
-;
-; Description:
-; This function finds the vendor, family and model number of the CPU
-; and returns the values through the pointers. If a pointer is zero
-; then the value is not returned.
-;
-; Vendor:
-; 0 = unknown
-; 1 = Intel
-; 2 = AMD
-; 3 = VIA/Centaur
-; 4 = Cyrix
-; 5 = NexGen
-;
-; Family: This is the sum of the family and extended family fields of the cpuid
-; Model: This is the model + (extended model << 4)
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-;
-; C++ prototype:
-; extern "C" void CpuType(int * vendor, int * family, int * model);
-
-global CpuType: function
-
-
-SECTION .text
-
-CpuType:
- push rbx ; cpuid overwrites ebx
-%ifdef UNIX
- mov r8, rdx ; model pointer: third parameter arrives in rdx
-%endif
-%ifdef WINDOWS
- push rsi
- push rdi
- mov rdi, rcx ; vendor pointer
- mov rsi, rdx ; family pointer
-%endif
-
-; parameters
-; vendor rdi
-; family rsi
-; model r8
-
- xor r9d, r9d ; vendor
- xor r10d, r10d ; family
- xor r11d, r11d ; model
-
- xor eax, eax
- cpuid ; get vendor
- ; ecx = last 4 characters of vendor string
- ; ebx = first 4 characters of vendor string
- cmp ecx, 'ntel' ; 'GenuineIntel'
- je C110
- cmp ecx, 'cAMD' ; 'AuthenticAMD'
- je C120
- cmp ebx, 'Cent' ; 'CentaurHauls'
- je C130
- cmp ebx, 'VIA ' ; 'VIA VIA VIA '
- je C130
- cmp ebx, 'Cyri' ; 'CyrixInstead'
- je C140
- cmp ebx, 'NexG' ; 'NexGenDriven'
- je C150
- jmp C200 ; other
-C110: or r9d, 1
- jmp C200
-C120: or r9d, 2
- jmp C200
-C130: or r9d, 3
- jmp C200
-C140: or r9d, 4
- jmp C200
-C150: or r9d, 5
- ;jmp C200
-C200:
-
- ; Get family and model
- mov eax, 1
- cpuid
- mov ebx, eax
- mov r10d, eax
- shr ebx, 8
- and ebx, 0FH ; Family
- shr r10d, 20
- and r10d, 0FFH ; Extended family
- add r10d, ebx ; Family + extended family
-
- mov r11d, eax
- shr r11d, 4
- and r11d, 0FH ; Model
- shr eax, 12 ; bring bits 16-19 down to bits 4-7
- and eax, 0F0H ; Extended model
- or r11d, eax ; extended model | Model
-
-C300: ; return r9d = vendor, r10d = family, r11d = model
- test rdi, rdi ; skip each store when its pointer is zero
- jz C310
- mov [rdi], r9d
-C310: test rsi, rsi
- jz C320
- mov [rsi], r10d
-C320: test r8, r8
- jz C330
- mov [r8], r11d
-C330: xor eax, eax
- ; return
-%ifdef WINDOWS
- pop rdi
- pop rsi
-%endif
- pop rbx
- ret
-;CpuType ENDP
diff --git a/contrib/libs/asmlib/debugbreak64.asm b/contrib/libs/asmlib/debugbreak64.asm
deleted file mode 100644
index ed2971cd24..0000000000
--- a/contrib/libs/asmlib/debugbreak64.asm
+++ /dev/null
@@ -1,33 +0,0 @@
-%include "defs.asm"
-
-;************************* debugbreak64.asm **********************************
-; Author: Agner Fog
-; Date created: 2011-07-09
-; Last modified: 2011-07-09
-; Source URL: www.agner.org/optimize
-; Project: asmlib.zip
-; Language: assembly, NASM/YASM syntax, 64 bit
-;
-; C++ prototype:
-; extern "C" void A_DebugBreak(void);
-;
-; Description:
-; Makes a debug breakpoint. Works only when running under a debugger
-;
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-;
-; C++ prototype:
-; extern "C" void A_DebugBreak(void);
-
-global A_DebugBreak: function
-
-
-SECTION .text
-
-A_DebugBreak:
- int3 ; breakpoint instruction
- nop
- ret
-;A_DebugBreak ENDP
diff --git a/contrib/libs/asmlib/defs.asm b/contrib/libs/asmlib/defs.asm
deleted file mode 100644
index db313e6cf1..0000000000
--- a/contrib/libs/asmlib/defs.asm
+++ /dev/null
@@ -1,22 +0,0 @@
-%ifdef UNIX
- %ifdef DARWIN
- %define EXP(x) _ %+ x
- %else ; UNIX but not DARWIN: name is used unchanged
- %define EXP(x) x
- %endif
-%else ; UNIX not defined: WINDOWS is defined here
- %define EXP(x) _ %+ x
- %define WINDOWS
-%endif
-
-%define ALLOW_OVERRIDE 1
-
-%ifdef WINDOWS
- %define WEAK_SYM(x) global x
-%else ; not WINDOWS
- %ifdef DARWIN
- %define WEAK_SYM(x) global x
- %else ; neither WINDOWS nor DARWIN: emit a weak symbol
- %define WEAK_SYM(x) weak x
- %endif
-%endif
diff --git a/contrib/libs/asmlib/dispatchpatch64.asm b/contrib/libs/asmlib/dispatchpatch64.asm
deleted file mode 100644
index 205fac543d..0000000000
--- a/contrib/libs/asmlib/dispatchpatch64.asm
+++ /dev/null
@@ -1,303 +0,0 @@
-%include "defs.asm"
-
-;*********************** dispatchpatch64.asm ********************************
-; Author: Agner Fog
-; Date created: 2007-07-20
-; Last modified: 2013-08-21
-; Source URL: www.agner.org/optimize
-; Project: asmlib.zip
-; Language: assembly, NASM/YASM syntax, 64 bit
-;
-; C++ prototype:
-; extern "C" int __intel_cpu_indicator = 0;
-; extern "C" void __intel_cpu_indicator_init()
-;
-; Description:
-; Example of how to replace Intel CPU dispatcher in order to improve
-; compatibility of Intel function libraries with non-Intel processors.
-; Only works with static link libraries (*.lib, *.a), not dynamic libraries
-; (*.dll, *.so). Linking in this as an object file will override the functions
-; with the same name in the library.
-;
-; Copyright (c) 2007-2013 GNU LGPL License v. 3.0 www.gnu.org/licenses/lgpl.html
-;******************************************************************************
-
-; extern InstructionSet: function
-%include "instrset64.asm" ; include code for InstructionSet function
-
-; InstructionSet function return value:
-; 4 or above = SSE2 supported
-; 5 or above = SSE3 supported
-; 6 or above = Supplementary SSE3
-; 8 or above = SSE4.1 supported
-; 9 or above = POPCNT supported
-; 10 or above = SSE4.2 supported
-; 11 or above = AVX supported by processor and operating system
-; 12 or above = PCLMUL and AES supported
-; 13 or above = AVX2 supported
-; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
-; 15 or above = HLE + RTM supported
-
-
-global __intel_cpu_indicator
-global __intel_cpu_indicator_init
-
-
-SECTION .data
-intel_cpu_indicator@: ; local name
-__intel_cpu_indicator: dd 0
-
-; table of indicator values
-itable DD 1 ; 0: generic version, 80386 instruction set
- DD 8, 8 ; 1, 2: MMX
- DD 0x80 ; 3: SSE
- DD 0x200 ; 4: SSE2
- DD 0x800 ; 5: SSE3
- DD 0x1000, 0x1000 ; 6, 7: SSSE3
- DD 0x2000, 0x2000 ; 8, 9: SSE4.1
- DD 0x8000, 0x8000 ; 10, 11: SSE4.2 and popcnt
- DD 0x20000, 0x20000 ; 12, 13: AVX, pclmul, aes
- DD 0x400000 ; 14: AVX2, F16C, BMI1, BMI2, LZCNT, FMA3
- DD 0x800000 ; 15: HLE, RTM
-itablelen equ ($ - itable) / 4 ; length of table
-
-SECTION .text
-
-__intel_cpu_indicator_init: ; stores itable[InstructionSet()] in __intel_cpu_indicator
- push rax ; registers must be pushed
- push rcx
- push rdx
- push r8
- push r9
- push r10
- push r11
- push rsi
- push rdi
- call InstructionSet
- cmp eax, itablelen
- jb L100
- mov eax, itablelen - 1 ; limit to table length
-L100: lea rdx, [rel itable]
- mov eax, [rdx + 4*rax] ; look up indicator value
- mov [rel intel_cpu_indicator@], eax ; store in __intel_cpu_indicator
- pop rdi
- pop rsi
- pop r11
- pop r10
- pop r9
- pop r8
- pop rdx
- pop rcx
- pop rax
- ret
-
-;__intel_cpu_indicator_init ENDP
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Dispatcher for Math Kernel Library (MKL),
-; version 10.2 and higher
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-WEAK_SYM(mkl_serv_cpu_detect)
-
-SECTION .data
-; table of indicator values
-; Note: the table is different in 32 bit and 64 bit mode
-
-mkltab DD 0, 0, 0, 0 ; 0-3: generic version, 80386 instruction set
- DD 0 ; 4: SSE2
- DD 1 ; 5: SSE3
- DD 2, 2, 2, 2 ; 6-9: SSSE3
- DD 3 ; 10: SSE4.2
- DD 4, 4, 4 ; 11-13: AVX
- DD 5 ; 14: AVX2, FMA3, BMI1, BMI2, LZCNT, PCLMUL
-mkltablen equ ($ - mkltab) / 4 ; length of table
-
-SECTION .text
-
-mkl_serv_cpu_detect: ; returns mkltab[InstructionSet()] in eax
- push rcx ; Perhaps not needed
- push rdx
- push r8
- push r9
-%ifdef WINDOWS
- push rsi
- push rdi
-%endif
- call InstructionSet
- cmp eax, mkltablen
- jb M100
- mov eax, mkltablen - 1 ; limit to table length
-M100:
- lea rdx, [rel mkltab]
- mov eax, [rdx + 4*rax] ; look up in table
-%ifdef WINDOWS
- pop rdi
- pop rsi
-%endif
- pop r9
- pop r8
- pop rdx
- pop rcx
- ret
-; end mkl_serv_cpu_detect
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Dispatcher for Vector Math Library (VML)
-; version 10.0 and higher
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-WEAK_SYM(mkl_vml_serv_cpu_detect)
-
-SECTION .data
-; table of indicator values
-; Note: the table is different in 32 bit and 64 bit mode
-
-vmltab DD 0, 0, 0, 0 ; 0-3: generic version, 80386 instruction set
- DD 1, 1 ; 4-5: SSE2
- DD 2, 2 ; 6-7: SSSE3
- DD 3, 3 ; 8-9: SSE4.1
- DD 4 ; 10: SSE4.2
- DD 5, 5, 5 ; 11-13: AVX
-; DD 6 ??
-vmltablen equ ($ - vmltab) / 4 ; length of table
-
-SECTION .text
-
-mkl_vml_serv_cpu_detect: ; returns vmltab[InstructionSet()] in eax
- push rcx ; Perhaps not needed
- push rdx
- push r8
- push r9
-%ifdef WINDOWS
- push rsi
- push rdi
-%endif
- call InstructionSet
- cmp eax, vmltablen
- jb V100
- mov eax, vmltablen - 1 ; limit to table length
-V100:
- lea rdx, [rel vmltab]
- mov eax, [rdx + 4*rax] ; look up in table
-%ifdef WINDOWS
- pop rdi
- pop rsi
-%endif
- pop r9
- pop r8
- pop rdx
- pop rcx
- ret
-; end mkl_vml_serv_cpu_detect
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Dispatcher for __intel_cpu_feature_indicator
-; version 13 and higher
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-global __intel_cpu_features_init
-global __intel_cpu_feature_indicator
-global __intel_cpu_fms_indicator
-global __intel_cpu_features_init_x
-global __intel_cpu_feature_indicator_x
-global __intel_cpu_fms_indicator_x
-
-SECTION .data
-; table of indicator values
-
-intel_cpu_feature_indicator@:
-__intel_cpu_feature_indicator:
-__intel_cpu_feature_indicator_x DD 0, 0
-intel_cpu_fms_indicator@:
-__intel_cpu_fms_indicator:
-__intel_cpu_fms_indicator_x: DD 0, 0
-
-
-feattab DD 1 ; 0 default
- DD 0BH ; 1 MMX
- DD 0FH ; 2 conditional move and FCOMI supported
- DD 3FH ; 3 SSE
- DD 7FH ; 4 SSE2
- DD 0FFH ; 5 SSE3
- DD 1FFH, 1FFH ; 6, 7 Supplementary SSE3
- DD 3FFH ; 8 SSE4.1
- DD 0BFFH ; 9 POPCNT
- DD 0FFFH ; 10 SSE4.2
- DD 10FFFH ; 11 AVX
- DD 16FFFH ; 12 PCLMUL and AES
- DD 816FFFH ; 13 AVX2
- DD 9DEFFFH ; 14 FMA3, F16C, BMI1, BMI2, LZCNT
- DD 0FDEFFFH ; 15 HLE, RTM
-
-feattablen equ ($ - feattab) / 4 ; length of table
-
-SECTION .text
-
-__intel_cpu_features_init: ; fills the feature and family/model/stepping indicators
-__intel_cpu_features_init_x:
- push rbx
- push rcx ; Perhaps not needed
- push rdx
- push r8
- push r9
-%ifdef WINDOWS
- push rsi
- push rdi
-%endif
- call InstructionSet
- cmp eax, feattablen
- jb F100
- mov eax, feattablen - 1 ; limit to table length (was vmltablen - 1, which picked entry 13 instead of the last one)
-F100:
- lea rdx, [rel feattab]
- mov ebx, [rdx + 4*rax] ; look up in table
- push rbx ; keep the table value: cpuid overwrites ebx
- mov eax, 1
- cpuid
- pop rbx
- bt ecx, 22 ; MOVBE
- jnc F200
- or ebx, 1000H
-F200: mov [intel_cpu_feature_indicator@], rbx
-
- ; get family and model
- mov edx, eax
- and eax, 0FH ; stepping bit 0-3
- mov ecx, edx
- shr ecx, 4
- and ecx, 0FH ; model
- mov ebx, edx
- shr ebx, 12
- and ebx, 0F0H ; x model
- or ecx, ebx ; full model
- mov ah, cl ; model bit 8 - 15
- mov ecx, edx
- shr ecx, 8
- and ecx, 0FH ; family
- mov ebx, edx
- shr ebx, 20
- and ebx, 0FFH ; x family
- add ecx, ebx ; full family
- shl ecx, 16
- or eax, ecx ; full family bit 16 - 23
- mov [intel_cpu_fms_indicator@], eax
-
-%ifdef WINDOWS
- pop rdi
- pop rsi
-%endif
- pop r9
- pop r8
- pop rdx
- pop rcx
- pop rbx
- ret
-; end __intel_cpu_features_init
-
-
-
-
diff --git a/contrib/libs/asmlib/divfixedi64.asm b/contrib/libs/asmlib/divfixedi64.asm
deleted file mode 100644
index bf8ab137a9..0000000000
--- a/contrib/libs/asmlib/divfixedi64.asm
+++ /dev/null
@@ -1,173 +0,0 @@
-%include "defs.asm"
-
-;************************* divfixedi64.asm *********************************
-; Author: Agner Fog
-; Date created: 2011-07-22
-; Last modified: 2011-07-22
-;
-; Function prototypes:
-; void setdivisori32(int buffer[2], int d);
-; int dividefixedi32(const int buffer[2], int x);
-; void setdivisoru32(uint32_t buffer[2], uint32_t d);
-; uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
-;
-; Description:
-; Functions for fast repeated integer division by the same divisor, signed
-; and unsigned 32-bit integer versions. The divisor must be positive.
-;
-; The setdivisor functions calculate the reciprocal divisor and shift counts,
-; the dividefixed functions do the division by multiplication and shift.
-;
-; The methods used are described by:
-; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
-; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
-; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
-;
-; Mathematical formula, unsigned division:
-; x = dividend
-; d = divisor
-; n = integer size, bits
-; L = ceil(log2(d))
-; m = 1 + 2^n * (2^L-d) / d [2^L should overflow to 0 if L = n]
-; sh1 = min(L,1)
-; sh2 = max(L-1,0)
-; t = m*x >> n [high part of unsigned multiplication]
-; x/d = (((x-t) >> sh1) + t) >> sh2
-;
-; Mathematical formula, signed division:
-; x = dividend
-; d = abs(divisor)
-; n = integer size, bits
-; L = ceil(log2(d))
-; L = max(L,1)
-; m = 1 + 2^(n+L-1)/d - 2^n [division should overflow to 0 if d = 1]
-; sh1 = L-1
-; q = x + (m*x >> n) [high part of signed multiplication]
-; q = (q >> sh1) - (x<0 ? -1 : 0)
-; if (divisor < 0) q = -q [negative divisor not supported in present implementation]
-; x/d = q
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-%IFDEF WINDOWS
-%define par1 rcx ; function parameter 1
-%define par2 edx ; function parameter 2
-%define buf r9 ; copy of function parameter 1: buffer
-%define rx r8 ; 64-bit name of the register that holds d or x
-%define rxd r8d ; d or x
-%ELSE ; UNIX
-%define par1 rdi ; function parameter 1
-%define par2 esi ; function parameter 2
-%define buf rdi ; function parameter 1: buffer
-%define rx rsi ; 64-bit name of the register that holds d or x
-%define rxd esi ; d or x
-%ENDIF
-
-
-section .text
-
-; extern "C" void setdivisori32(int buffer[2], int d);
-; 32 bit signed
-
-global setdivisori32: function
-setdivisori32:
-%IFDEF WINDOWS
- mov rxd, edx ; d
- mov buf, rcx ; buffer
-%ENDIF
- dec rxd ; rxd = r8d or esi
- mov ecx, -1 ; value for bsr if rxd = 0 (assuming bsr leaves dest unchanged if src = 0, this works on both Intel, AMD and VIA processors)
- bsr ecx, rxd ; floor(log2(d-1))
- inc rxd ; restore d; sign flag is set when d is negative
- js H120 ; d < 0. Generate error
- inc ecx ; L = ceil(log2(d))
- sub ecx, 1 ; shift count = L - 1
- adc ecx, 0 ; avoid negative shift count
- xor eax, eax
- mov edx, 1
- cmp rxd, edx
- je H110 ; avoid overflow when d = 1
- shl edx, cl ; edx:eax = 2^(32+L-1)
- div rxd ; d = 0 raises a divide error here
-H110: inc eax
- mov [buf], eax ; multiplier
- mov [buf+4], ecx ; shift count
- ret
-
-H120: ; d < 0 not supported. Generate error
- mov edx, 1
- div edx ; will overflow
- ud2
-
-
-; extern "C" int dividefixedi32(int buffer[2], int x);
-global dividefixedi32: function
-dividefixedi32:
-%IFDEF WINDOWS
- mov eax, edx
- mov rxd, edx ; x
- mov buf, rcx ; buffer
-%ELSE
- mov eax, esi ; x
-%ENDIF
- imul dword [buf] ; m; edx = high part of m*x
- lea eax, [rdx+rx] ; x + high part; rx = r8 or rsi
- mov ecx, [buf+4] ; shift count
- sar eax, cl
- sar rxd, 31 ; sign(x)
- sub eax, rxd ; adds 1 when x is negative
- ret
-
-
-;extern "C" void setdivisoru32(uint32_t buffer[2], uint32_t d);
-; 32 bit unsigned
-
-global setdivisoru32: function
-setdivisoru32:
-%IFDEF WINDOWS
- mov rxd, edx ; d
- mov buf, rcx ; buffer
-%ENDIF
- dec rxd ; rxd = r8d or esi
- mov ecx, -1 ; value for bsr if rxd = 0
- bsr ecx, rxd ; floor(log2(d-1))
- inc rxd
- inc ecx ; L = ceil(log2(d))
- mov edx, 1
- shl rdx, cl ; 2^L (64 bit shift because cl may be 32)
- sub edx, rxd ; 2^L - d
- xor eax, eax
- div rxd ; (2^32 * (2^L - d)) / d
- inc eax
- mov [buf], eax ; multiplier
- sub ecx, 1 ; L - 1; carry set when L = 0
- setae dl
- movzx edx, dl ; shift1
- seta al
- neg al
- and al,cl
- movzx eax, al ; shift 2
- shl eax, 8
- or eax, edx ; shift 1 in bits 0-7, shift 2 in bits 8-15
- mov [buf+4], eax ; shift 1 and shift 2
- ret
-
-;extern "C" uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
-global dividefixedu32: function ; unsigned
-dividefixedu32:
-%IFDEF WINDOWS
- mov eax, edx
- mov rxd, edx ; x
- mov buf, rcx ; buffer
-%ELSE
- mov eax, esi ; x
-%ENDIF
- mul dword [buf] ; m; edx = t = high part of m*x
- sub rxd, edx ; x-t
- mov ecx, [buf+4] ; shift 1 and shift 2
- shr rxd, cl ; (x-t) >> shift 1
- lea eax, [rx+rdx] ; ((x-t) >> shift 1) + t
- shr ecx, 8 ; cl = shift 2
- shr eax, cl
- ret
diff --git a/contrib/libs/asmlib/divfixedv64.asm b/contrib/libs/asmlib/divfixedv64.asm
deleted file mode 100644
index a4f0e177ec..0000000000
--- a/contrib/libs/asmlib/divfixedv64.asm
+++ /dev/null
@@ -1,498 +0,0 @@
-%include "defs.asm"
-
-;************************* divfixedv64.asm *********************************
-; Author: Agner Fog
-; Date created: 2011-07-25
-; Last modified: 2012-03-10
-;
-; Function prototypes:
-; void setdivisorV8i16(__m128i buf[2], int16_t d);
-; void setdivisorV8u16(__m128i buf[2], uint16_t d);
-; void setdivisorV4i32(__m128i buf[2], int32_t d);
-; void setdivisorV4u32(__m128i buf[2], uint32_t d);
-;
-; __m128i dividefixedV8i16(const __m128i buf[2], __m128i x);
-; __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
-; __m128i dividefixedV4i32(const __m128i buf[2], __m128i x);
-; __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
-;
-; Alternative versions for VectorClass.h:
-; (These versions pack all parameters into a single register)
-; __m128i setdivisor8s(int16_t d);
-; __m128i setdivisor8us(uint16_t d);
-; __m128i setdivisor4i(int32_t d);
-; __m128i setdivisor4ui(uint32_t d);
-;
-; Description:
-; Functions for integer vector division by the same divisor, signed
-; and unsigned 16-bit and 32-bit integer versions.
-;
-; The setdivisor functions calculate the reciprocal divisor and shift counts,
-; the dividefixed functions do the division by multiplication and shift of the
-; vector elements of packed 16-bit or 32-bit signed or unsigned integers.
-;
-; The divisor must be positive. A zero divisor generates a divide by zero error.
-; A negative divisor generates a division overflow error. To divide by a negative
-; divisor, change the sign of the divisor and the result.
-;
-; The methods used are described in this article:
-; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
-; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
-; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
-;
-; Mathematical formula, unsigned division:
-; x = dividend
-; d = divisor
-; n = integer size, bits
-; L = ceil(log2(d))
-; m = 1 + 2^n * (2^L-d) / d [2^L should overflow to 0 if L = n]
-; sh1 = min(L,1)
-; sh2 = max(L-1,0)
-; t = m*x >> n [high part of unsigned multiplication]
-; x/d = (((x-t) >> sh1) + t) >> sh2
-;
-; Mathematical formula, signed division:
-; x = dividend
-; d = abs(divisor)
-; n = integer size, bits
-; L = ceil(log2(d))
-; L = max(L,1)
-; m = 1 + 2^(n+L-1)/d - 2^n [division should overflow to 0 if d = 1]
-; sh1 = L-1
-; q = x + (m*x >> n) [high part of signed multiplication]
-; q = (q >> sh1) - (x<0 ? -1 : 0)
-; if (divisor < 0) q = -q [negative divisor not supported in present implementation]
-; x/d = q
-;
-; Copyright (c) 2011 - 2012 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-default rel
-
-%IFDEF WINDOWS
-%define par1 rcx ; function parameter 1
-%define par1d ecx
-%define par1w cx
-%define par2 rdx ; function parameter 2
-%define par2d edx
-%define par2w dx
-%define buf r8 ; pointer to buffer
-%ENDIF
-%IFDEF UNIX
-%define par1 rdi ; function parameter 1
-%define par1d edi
-%define par1w di
-%define par2 rsi ; function parameter 2
-%define par2d esi
-%define par2w si
-%define buf rdi ; pointer to buffer
-%ENDIF
-
-
-; Imported from instrset64.asm:
-extern InstructionSet ; Instruction set for CPU dispatcher
-
-section .text align = 16
-
-;******************************************************************************
-; 16 bit signed integers
-;******************************************************************************
-
-; extern "C" __m128i setdivisor8s(int16_t d);
-; vector of 8 x 16 bit signed integers
-; Returns xmm0 = multiplier in the 4 low words, shift count in the upper half; xmm1 = shift count
-global setdivisor8s: function
-setdivisor8s:
- push rbx
- movsx ebx, par1w ; d
- dec ebx ; d - 1
- mov ecx, -1 ; value for bsr if ebx = 0 (relies on bsr leaving ecx unchanged for a zero source)
- bsr ecx, ebx ; floor(log2(d-1))
- inc ebx ; restore d; sign flag is set if d < 0
- js H120 ; Generate error if d < 0. (error for d=0 will come in the div instruction)
- inc ecx ; L = ceil(log2(d))
- sub ecx, 1 ; shift count = L - 1
- adc ecx, 0 ; avoid negative shift count
- xor eax, eax ; low word of the dividend = 0
- mov edx, 1
- cmp ebx, edx
- je H110 ; avoid division overflow when d = 1
- shl edx, cl ; dx = 2^(L-1), so dx:ax = 2^(16+L-1)
- div bx ; 2^(16+L-1)/d
-H110: inc eax ; multiplier = 1 + quotient (1 when d = 1)
- movd xmm0, eax ; multiplier
- pshuflw xmm0, xmm0, 0 ; broadcast into lower 4 words
- movd xmm1, ecx ; shift count
- punpcklqdq xmm0, xmm1 ; insert shift count into upper half
- pop rbx
- ret
-H120: ; d < 0 not supported. Generate error
- mov edx, 1
- div edx ; will overflow: the quotient of edx:eax / 1 does not fit in 32 bits
- ud2 ; invalid opcode, in case execution continues past the div
-; setdivisor8s end
-
-
-; extern "C" void setdivisorV8i16(__m128i buf[2], int16_t d);
-; vector of 8 x 16 bit signed integers
-; Stores the multiplier vector in buf[0] and the shift count in buf[1]
-global setdivisorV8i16: function
-setdivisorV8i16:
- push par1 ; buf
- mov par1d, par2d ; d
- call setdivisor8s ; xmm0 = multiplier (low 4 words), xmm1 = shift count
- pop rax ; buf
- punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
- movdqa [rax], xmm0 ; multiplier (movdqa requires buf to be 16-byte aligned)
- movdqa [rax+16], xmm1 ; shift count is still in xmm1
- ret
-; setdivisorV8i16 end
-
-
-; extern "C" __m128i dividefixedV8i16(const __m128i buf[2], __m128i x);
-global dividefixedV8i16: function
-
-dividefixedV8i16:
-; buf = par1
-; x = xmm0 (UNIX) or [par2] (Windows)
-%IFDEF WINDOWS
- movdqa xmm0, [par2] ; x
-%ENDIF
- movdqa xmm1, xmm0 ; x
- pmulhw xmm0, [par1] ; multiply high signed words
- paddw xmm0, xmm1 ; x + high(m*x)
- movd xmm2, [par1+16] ; shift count
- psraw xmm0, xmm2 ; shift right arithmetic
- psraw xmm1, 15 ; sign of x
- psubw xmm0, xmm1 ; subtract the sign, i.e. add 1 where x < 0
- ret
-;dividefixedV8i16 end
-
-
-
-;******************************************************************************
-; 16 bit unsigned integers
-;******************************************************************************
-
-; extern "C" __m128i setdivisor8us(uint16_t d);
-; vector of 8 x 16 bit unsigned integers
-; Returns xmm0 = multiplier in the 4 low words, shift 1 and shift 2 as dwords above; xmm1 = shift 1, shift 2
-align 16
-global setdivisor8us: function
-setdivisor8us:
- push rbx
- movzx ebx, par1w ; d
- dec ebx ; d - 1
- mov ecx, -1 ; value for bsr if ebx = 0 (relies on bsr leaving ecx unchanged for a zero source)
- bsr ecx, ebx ; floor(log2(d-1))
- inc ebx ; restore d
- inc ecx ; L = ceil(log2(d))
- mov edx, 1
- shl edx, cl ; 2^L [32-bit shift to allow overflow]
- sub edx, ebx ; 2^L - d
- xor eax, eax ; dx:ax = 2^16 * (2^L - d)
- div bx ; ax = 2^16 * (2^L - d) / d; faults if d = 0
- inc eax ; multiplier = 1 + quotient
- movd xmm0, eax
- pshuflw xmm0, xmm0, 0 ; broadcast into lower 4 words
- sub ecx, 1 ; L - 1; carry is set only when L = 0
- setae dl ; 1 if L >= 1, else 0
- movzx edx, dl ; shift 1
- seta al ; 1 if L > 1, else 0
- neg al ; 0 or 0FFH mask
- and al,cl ; L - 1 if L > 1, else 0
- movzx eax, al ; shift 2
- movd xmm1, edx ; shift 1
- movd xmm2, eax ; shift 2
- punpckldq xmm1, xmm2 ; combine into two dwords
- punpcklqdq xmm0, xmm1 ; multipliers, shift1, shift2
- pop rbx
- ret
-; setdivisor8us end
-
-
-;extern "C" void setdivisorV8u16(__m128i buf[2], uint16_t d);
-; 8 x 16 bit unsigned
-; Stores the multiplier vector in buf[0] and the two shift counts in buf[1]
-global setdivisorV8u16: function
-setdivisorV8u16:
- push par1 ; buf
- mov par1d, par2d ; d
- call setdivisor8us ; xmm0 = multiplier (low 4 words), xmm1 = shift 1 and shift 2
- pop rax ; buf
- punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
- movdqa [rax], xmm0 ; multiplier (movdqa requires buf to be 16-byte aligned)
- movdqa [rax+16], xmm1 ; shift counts are still in xmm1
- ret
-; setdivisorV8u16 end
-
-
-;extern "C" __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
-global dividefixedV8u16: function
-
-align 16
-dividefixedV8u16:
-; buf = par1
-; x = xmm0 (UNIX) or [par2] (Windows)
-%IFDEF WINDOWS
- movdqa xmm0, [par2] ; x
-%ENDIF
- movdqa xmm1, xmm0 ; x
- pmulhuw xmm0, [par1] ; multiply high unsigned words
- psubw xmm1, xmm0 ; x - t
- movd xmm2, [par1+16] ; shift1
- psrlw xmm1, xmm2 ; (x - t) >> shift1
- paddw xmm0, xmm1 ; ((x - t) >> shift1) + t
- movd xmm2, [par1+20] ; shift2
- psrlw xmm0, xmm2 ; final shift by shift2
- ret
-;dividefixedV8u16 end
-
-
-
-;******************************************************************************
-; 32 bit signed integers
-;******************************************************************************
-
-; extern "C" __m128i setdivisor4i(int32_t d);
-; vector of 4 x 32 bit signed integers
-; Returns xmm0 = multiplier in the 2 low dwords, shift count in the upper half; xmm1 = shift count
-align 16
-global setdivisor4i: function
-setdivisor4i:
- push rbx
- mov ebx, par1d ; d
- dec ebx ; d - 1
- mov ecx, -1 ; value for bsr if ebx = 0 (relies on bsr leaving ecx unchanged for a zero source)
- bsr ecx, ebx ; floor(log2(d-1))
- inc ebx ; restore d; sign flag is set if d < 0
- js K120 ; Generate error if d < 0. (error for d=0 will come in the div instruction)
- inc ecx ; L = ceil(log2(d))
- sub ecx, 1 ; shift count = L - 1
- adc ecx, 0 ; avoid negative shift count
- xor eax, eax ; low dword of the dividend = 0
- mov edx, 1
- cmp ebx, edx
- je K110 ; avoid division overflow when d = 1
- shl edx, cl ; edx = 2^(L-1), so edx:eax = 2^(32+L-1)
- div ebx ; 2^(32+L-1)/d
-K110: inc eax ; multiplier = 1 + quotient (1 when d = 1)
- movd xmm0, eax ; multiplier
- pshufd xmm0, xmm0, 0 ; broadcast into 4 dwords
- movd xmm1, ecx ; shift count
- punpcklqdq xmm0, xmm1 ; insert shift count into upper half
- pop rbx
- ret
-
-K120: ; d < 0 not supported. Generate error
- mov edx, 1
- div edx ; will overflow: the quotient of edx:eax / 1 does not fit in 32 bits
- ud2 ; invalid opcode, in case execution continues past the div
-; setdivisor4i end
-
-
-; extern "C" void setdivisorV4i32(__m128i buf[2], int32_t d);
-; vector of 4 x 32 bit signed integers
-; Stores the multiplier vector in buf[0] and the shift count in buf[1]
-global setdivisorV4i32: function
-setdivisorV4i32:
- push par1 ; buf
- mov par1d, par2d ; d
- call setdivisor4i ; xmm0 = multiplier (low 2 dwords), xmm1 = shift count
- pop rax ; buf
- punpcklqdq xmm0, xmm0 ; copy multiplier into upper 2 dwords
- movdqa [rax], xmm0 ; multiplier (movdqa requires buf to be 16-byte aligned)
- movdqa [rax+16], xmm1 ; shift count is still in xmm1
- ret
-; setdivisorV4i32 end
-
-
-; extern "C" __m128i dividefixedV4i32(const __m128i buf[2], __m128i x);
-global dividefixedV4i32: function
-
-; Direct entries to CPU-specific versions
-global dividefixedV4i32SSE2: function
-global dividefixedV4i32SSE41: function
-
-align 8
-dividefixedV4i32: ; function dispatching
- jmp near [dividefixedV4i32Dispatch] ; Go to appropriate version, depending on instruction set (the pointer starts at dividefixedV4i32CPUDispatch)
-
-align 16
-dividefixedV4i32SSE41:
-; buf = par1
-; x = xmm0 (UNIX) or [par2] (Windows)
-%IFDEF WINDOWS
- movdqa xmm0,[par2] ; x
-%ENDIF
- movdqa xmm1, xmm0 ; x
- movdqa xmm2, xmm0 ; x
- movdqa xmm3, [par1] ; multiplier
- pmuldq xmm0, xmm3 ; 32 x 32 -> 64 bit signed multiplication of x[0] and x[2]
- psrlq xmm0, 32 ; high dword of result 0 and 2
- psrlq xmm1, 32 ; get x[1] and x[3] into position for multiplication
- pmuldq xmm1, xmm3 ; 32 x 32 -> 64 bit signed multiplication of x[1] and x[3]
- pcmpeqd xmm3, xmm3 ; all bits set
- psllq xmm3, 32 ; generate mask of dword 1 and 3
- pand xmm1, xmm3 ; high dword of result 1 and 3
- por xmm0, xmm1 ; combine all four results into one vector
- paddd xmm0, xmm2 ; x + high(m*x)
- movd xmm3, [par1+16] ; shift count
- psrad xmm0, xmm3 ; shift right arithmetic
- psrad xmm2, 31 ; sign of x
- psubd xmm0, xmm2 ; subtract the sign, i.e. add 1 where x < 0
- ret
-;dividefixedV4i32SSE41 end
-
-dividefixedV4i32SSE2:
-; I have tried to change sign and use pmuludq, but get rounding error (gives 9/10 = 1).
-; This solution, with 4 separate multiplications, is probably faster anyway despite store forwarding stall
- push rbp
- mov rbp, rsp ; remember rsp so it can be restored after the alignment below
-%IFDEF WINDOWS
- movdqa xmm0,[par2] ; x
- mov buf, par1
-%ENDIF
- sub rsp, 16 ; allocate stack space
- and rsp, -16 ; stack should be aligned already. align anyway to be safe
- movdqa [rsp], xmm0 ; store x
- movdqa xmm2, xmm0 ; x
- mov ecx, [buf] ; multiplier
- ; do four signed high multiplications
- mov eax, [rsp] ; x[0]
- imul ecx ; edx = high dword of x[0]*m
- mov [rsp], edx
- mov eax, [rsp+4] ; x[1]
- imul ecx
- mov [rsp+4], edx
- mov eax, [rsp+8] ; x[2]
- imul ecx
- mov [rsp+8], edx
- mov eax, [rsp+12] ; x[3]
- imul ecx
- mov [rsp+12], edx
- movdqa xmm0, [rsp] ; x*m vector
- paddd xmm0, xmm2 ; x + high(m*x)
- movd xmm3, [buf+16] ; shift count
- psrad xmm0, xmm3 ; shift right arithmetic
- psrad xmm2, 31 ; sign of x
- psubd xmm0, xmm2 ; subtract the sign, i.e. add 1 where x < 0
- mov rsp, rbp ; release the temporary stack space
- pop rbp
- ret
-;dividefixedV4i32SSE2 end
-
-
-; ********************************************************************************
-; CPU dispatching for dividefixedV4i32. This is executed only once
-; ********************************************************************************
-
-dividefixedV4i32CPUDispatch:
- ; get supported instruction set
- push par1 ; save the integer argument registers across the call
- push par2
- call InstructionSet
- pop par2
- pop par1
- ; Point to generic version
- lea r8, [dividefixedV4i32SSE2]
- cmp eax, 8 ; check if PMULDQ supported
- jb Q100
- ; SSE4.1 supported
- ; Point to SSE4.1 version of dividefixedV4i32
- lea r8, [dividefixedV4i32SSE41]
-Q100: mov [dividefixedV4i32Dispatch], r8 ; later calls jump straight to the selected version
- ; Continue in appropriate version
- jmp r8
-
-SECTION .data
-
-; Pointer to appropriate versions. Initially point to dispatcher
-dividefixedV4i32Dispatch Dq dividefixedV4i32CPUDispatch
-
-section .text
-
-
-;******************************************************************************
-; 32 bit unsigned integers
-;******************************************************************************
-
-; extern "C" __m128i setdivisor4ui(uint32_t d);
-; vector of 4 x 32 bit unsigned integers
-; Returns xmm0 = multiplier in the 2 low dwords, shift 1 and shift 2 above; xmm1 = shift 1, shift 2
-align 16
-global setdivisor4ui: function
-setdivisor4ui:
- push rbx
- mov ebx, par1d ; d
- dec ebx ; d - 1
- mov ecx, -1 ; value for bsr if ebx = 0 (relies on bsr leaving ecx unchanged for a zero source)
- bsr ecx, ebx ; floor(log2(d-1))
- inc ebx ; restore d
- inc ecx ; L = ceil(log2(d))
- mov edx, 1
- shl rdx, cl ; 2^L [64 bit shift to allow overflow]
- sub edx, ebx ; 2^L - d (2^L has wrapped to 0 in edx when L = 32)
- xor eax, eax ; edx:eax = 2^32 * (2^L - d)
- div ebx ; eax = 2^32 * (2^L - d) / d; faults if d = 0
- inc eax ; multiplier = 1 + quotient
- movd xmm0, eax
- pshufd xmm0, xmm0, 0 ; broadcast into 4 dwords
- sub ecx, 1 ; L - 1; carry is set only when L = 0
- setae dl ; 1 if L >= 1, else 0
- movzx edx, dl ; shift1
- seta al ; 1 if L > 1, else 0
- neg al ; 0 or 0FFH mask
- and al,cl ; L - 1 if L > 1, else 0
- movzx eax, al ; shift 2
- movd xmm1, edx ; shift 1
- movd xmm2, eax ; shift 2
- punpckldq xmm1, xmm2 ; combine into two dwords
- punpcklqdq xmm0, xmm1 ; multipliers, shift1, shift2
- pop rbx
- ret
-; setdivisor4ui end
-
-;extern "C" void setdivisorV4u32(__m128i buf[2], uint32_t d);
-; 4 x 32 bit unsigned. Stores the multiplier vector in buf[0] and the two shift counts in buf[1]
-
-global setdivisorV4u32: function
-setdivisorV4u32:
- push par1 ; buf
- mov par1d, par2d ; d
- call setdivisor4ui ; xmm0 = multiplier (low 2 dwords), xmm1 = shift 1 and shift 2
- pop rax ; buf
- punpcklqdq xmm0, xmm0 ; copy multiplier into upper 2 dwords
- movdqa [rax], xmm0 ; multiplier (movdqa requires buf to be 16-byte aligned)
- movdqa [rax+16], xmm1 ; shift counts are still in xmm1
- ret
-; setdivisorV4u32 end
-
-;extern "C" __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
-global dividefixedV4u32: function
-
-align 16
-dividefixedV4u32:
-; buf = par1
-; x = xmm0 (UNIX) or [par2] (Windows)
-%IFDEF WINDOWS
- movdqa xmm0,[par2] ; x
-%ENDIF
- movdqa xmm1, xmm0 ; x
- movdqa xmm2, xmm0 ; x
- movdqa xmm3, [par1] ; multiplier
- pmuludq xmm0, xmm3 ; 32 x 32 -> 64 bit unsigned multiplication of x[0] and x[2]
- psrlq xmm0, 32 ; high dword of result 0 and 2
- psrlq xmm1, 32 ; get x[1] and x[3] into position for multiplication
- pmuludq xmm1, xmm3 ; 32 x 32 -> 64 bit unsigned multiplication of x[1] and x[3]
- pcmpeqd xmm3, xmm3 ; all bits set
- psllq xmm3, 32 ; generate mask of dword 1 and 3
- pand xmm1, xmm3 ; high dword of result 1 and 3
- por xmm0, xmm1 ; combine all four results into one vector
- psubd xmm2, xmm0 ; x - t
- movd xmm3, [par1+16] ; shift1
- psrld xmm2, xmm3 ; (x - t) >> shift1
- paddd xmm0, xmm2 ; ((x - t) >> shift1) + t
- movd xmm3, [par1+20] ; shift2
- psrld xmm0, xmm3 ; final shift by shift2
- ret
-;dividefixedV4u32 end
diff --git a/contrib/libs/asmlib/dummy.c b/contrib/libs/asmlib/dummy.c
deleted file mode 100644
index e69de29bb2..0000000000
--- a/contrib/libs/asmlib/dummy.c
+++ /dev/null
diff --git a/contrib/libs/asmlib/instrset64.asm b/contrib/libs/asmlib/instrset64.asm
deleted file mode 100644
index c8cdd34a19..0000000000
--- a/contrib/libs/asmlib/instrset64.asm
+++ /dev/null
@@ -1,184 +0,0 @@
-%include "defs.asm"
-
-;************************* instrset64.asm **********************************
-; Author: Agner Fog
-; Date created: 2003-12-12
-; Last modified: 2013-09-11
-; Source URL: www.agner.org/optimize
-; Project: asmlib.zip
-; Language: assembly, NASM/YASM syntax, 64 bit
-;
-; C++ prototype:
-; extern "C" int InstructionSet (void);
-;
-; Description:
-; This function returns an integer indicating which instruction set is
-; supported by the microprocessor and operating system. A program can
-; call this function to determine if a particular set of instructions can
-; be used.
-;
-; The method used here for detecting whether XMM instructions are enabled by
-; the operating system is different from the method recommended by Intel.
-; The method used here has the advantage that it is independent of the
-; ability of the operating system to catch invalid opcode exceptions. The
-; method used here has been thoroughly tested on many different versions of
-; Intel and AMD microprocessors, and is believed to work reliably. For further
-; discussion of this method, see my manual "Optimizing subroutines in assembly
-; language" (www.agner.org/optimize/).
-;
-; Copyright (c) 2003-2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-;
-; ********** InstructionSet function **********
-; C++ prototype:
-; extern "C" int InstructionSet (void);
-;
-; return value:
-; 0 = 80386 instruction set only
-; 1 or above = MMX instructions supported
-; 2 or above = conditional move and FCOMI supported
-; 3 or above = SSE (XMM) supported by processor and operating system
-; 4 or above = SSE2 supported
-; 5 or above = SSE3 supported
-; 6 or above = Supplementary SSE3
-; 8 or above = SSE4.1 supported
-; 9 or above = POPCNT supported
-; 10 or above = SSE4.2 supported
-; 11 or above = AVX supported by processor and operating system
-; 12 or above = PCLMUL and AES supported
-; 13 or above = AVX2 supported
-; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
-; 15 or above = HLE + RTM supported (not detected by the code below, which returns at most 14)
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-global InstructionSet: function
-global IInstrSet
-
-
-SECTION .data
-align 16
-
-IInstrSet@: ; local name to avoid problems in shared objects
-IInstrSet: dd -1 ; this global variable is valid after first call
-
-
-SECTION .text align=16
-
-; ********** InstructionSet function **********
-; C++ prototype:
-; extern "C" int InstructionSet (void);
-
-; return value:
-; 4 or above = SSE2 supported
-; 5 or above = SSE3 supported
-; 6 or above = Supplementary SSE3 supported
-; 8 or above = SSE4.1 supported
-; 9 or above = POPCNT supported
-; 10 or above = SSE4.2 supported
-; 11 or above = AVX supported by processor and operating system
-; 12 or above = PCLMUL and AES supported
-
-
-InstructionSet:
- ; Check if this function has been called before
- mov eax, [IInstrSet@]
- test eax, eax
- js FirstTime ; Negative means first time
- ; Early return. Has been called before
- ret ; Return value is in eax
-
-FirstTime:
- push rbx ; cpuid overwrites ebx
-
- mov eax, 1
- cpuid ; get features into edx and ecx
-
- mov eax, 4 ; at least SSE2 supported in 64 bit mode
- test ecx, 1 ; SSE3 support by microprocessor
- jz ISEND
- inc eax ; 5
-
- bt ecx, 9 ; Suppl-SSE3 support by microprocessor
- jnc ISEND
- inc eax ; 6
-
- bt ecx, 19 ; SSE4.1 support by microprocessor
- jnc ISEND
- mov al, 8 ; 8 (level 7 is skipped)
-
- bt ecx, 23 ; POPCNT support by microprocessor
- jnc ISEND
- inc eax ; 9
-
- bt ecx, 20 ; SSE4.2 support by microprocessor
- jnc ISEND
- inc eax ; 10
-
- ; check OS support for YMM registers (AVX)
- bt ecx, 27 ; OSXSAVE: XGETBV supported
- jnc ISEND
- push rax ; XGETBV overwrites eax and edx
- push rcx
- push rdx
- xor ecx, ecx ; select extended control register 0
- db 0FH, 01H, 0D0H ; XGETBV
- and eax, 6 ; keep bits 1 and 2 (XMM and YMM state)
- cmp eax, 6 ; AVX support by OS
- pop rdx ; pop does not modify flags, so the cmp result survives
- pop rcx
- pop rax
- jne ISEND
-
- bt ecx, 28 ; AVX support by microprocessor
- jnc ISEND
- inc eax ; 11
-
- bt ecx, 1 ; PCLMUL support
- jnc ISEND
- bt ecx, 25 ; AES support
- jnc ISEND
- inc eax ; 12
-
- push rax
- push rcx
- mov eax, 7
- xor ecx, ecx ; subleaf 0
- cpuid ; check for AVX2
- bt ebx, 5 ; AVX2 bit of leaf 7
- pop rcx ; restore the leaf 1 feature flags in ecx; ebx keeps the leaf 7 flags
- pop rax
- jnc ISEND
- inc eax ; 13
-
-; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
- bt ecx, 12 ; FMA3
- jnc ISEND
- bt ecx, 29 ; F16C
- jnc ISEND
- bt ebx, 3 ; BMI1
- jnc ISEND
- bt ebx, 8 ; BMI2
- jnc ISEND
-
- push rax
- push rbx
- push rcx
- mov eax, 80000001H ; extended feature leaf
- cpuid
- bt ecx, 5 ; LZCNT
- pop rcx
- pop rbx
- pop rax
- jnc ISEND
-
- inc eax ; 14
-
-ISEND: mov [IInstrSet@], eax ; save value in global variable
-
- pop rbx
- ret ; return value is in eax
-
-;InstructionSet ENDP
diff --git a/contrib/libs/asmlib/memcmp64.asm b/contrib/libs/asmlib/memcmp64.asm
deleted file mode 100644
index b8a8ab5fbc..0000000000
--- a/contrib/libs/asmlib/memcmp64.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-%include "defs.asm"
-
-;************************* memcmp64.asm *************************************
-; Author: Agner Fog
-; Date created: 2013-10-03
-; Last modified: 2013-10-03
-; Description:
-; Faster version of the standard memcmp function:
-;
-; int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
-;
-; Compares two memory blocks of size count.
-; The return value is zero if the two memory blocks ptr1 and ptr2 are equal
-; The return value is positive if the first differing byte of ptr1 is bigger
-; than ptr2 when compared as unsigned bytes.
-; The return value is negative if the first differing byte of ptr1 is smaller
-; than ptr2 when compared as unsigned bytes.
-;
-; Overriding standard function memcmp:
-; The alias ?OVR_memcmp is changed to _memcmp in the object file if
-; it is desired to override the standard library function memcmp.
-;
-; Optimization:
-; Uses XMM registers if SSE2 is available, uses YMM registers if AVX2.
-;
-; The latest version of this file is available at:
-; www.agner.org/optimize/asmexamples.zip
-; Copyright (c) 2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-global A_memcmp: function ; Function memcmp
-global EXP(memcmp): function ; ?OVR_ removed if standard function memcmp overridden
-; Direct entries to CPU-specific versions
-global memcmpSSE2: function ; SSE2 version
-global memcmpAVX2: function ; AVX2 version
-
-; Imported from instrset64.asm
-extern InstructionSet ; Instruction set for CPU dispatcher
-
-default rel
-
-; define registers used for parameters
-%IFDEF WINDOWS
-%define par1 rcx ; function parameter 1
-%define par2 rdx ; function parameter 2
-%define par3 r8 ; function parameter 3
-%define par4 r9 ; scratch register
-%define par4d r9d ; scratch register
-%ENDIF
-%IFDEF UNIX
-%define par1 rdi ; function parameter 1
-%define par2 rsi ; function parameter 2
-%define par3 rdx ; function parameter 3
-%define par4 rcx ; scratch register
-%define par4d ecx ; scratch register
-%ENDIF
-
-
-
-SECTION .text align=16
-
-; extern "C" int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
-; Function entry (both names share the same dispatch jump):
-A_memcmp:
-EXP(memcmp):
- jmp qword [memcmpDispatch] ; Go to appropriate version, depending on instruction set
-
-
-align 16
-memcmpAVX2: ; AVX2 version. Use ymm register
-memcmpAVX2@: ; internal reference
-
- add par1, par3 ; use negative index from end of memory block
- add par2, par3
- neg par3 ; par3 = -count; zero flag set if count = 0
- jz A900
- mov par4d, 0FFFFH ; mask for inverting a 16-bit byte mask
- cmp par3, -32
- ja A100 ; unsigned compare: taken when fewer than 32 bytes remain
-
-A000: ; loop comparing 32 bytes
- vmovdqu ymm1, [par1+par3]
- vpcmpeqb ymm0, ymm1, [par2+par3] ; compare 32 bytes
- vpmovmskb eax, ymm0 ; get byte mask
- xor eax, -1 ; not eax would not set flags
- jnz A700 ; difference found
- add par3, 32
- jz A900 ; finished, equal
- cmp par3, -32
- jna A000 ; next 32 bytes
- vzeroupper ; end ymm state
-
-A100: ; less than 32 bytes left
- cmp par3, -16
- ja A200
- movdqu xmm1, [par1+par3]
- movdqu xmm2, [par2+par3]
- pcmpeqb xmm1, xmm2 ; compare 16 bytes
- pmovmskb eax, xmm1 ; get byte mask
- xor eax, par4d ; invert lower 16 bits
- jnz A701 ; difference found
- add par3, 16
- jz A901 ; finished, equal
-
-A200: ; less than 16 bytes left
- cmp par3, -8
- ja A300
- ; compare 8 bytes
- movq xmm1, [par1+par3] ; upper 8 bytes of both registers are zero and compare equal
- movq xmm2, [par2+par3]
- pcmpeqb xmm1, xmm2 ; compare 8 bytes
- pmovmskb eax, xmm1 ; get byte mask
- xor eax, par4d ; invert lower 16 bits
- jnz A701 ; difference found
- add par3, 8
- jz A901
-
-A300: ; less than 8 bytes left
- cmp par3, -4
- ja A400
- ; compare 4 bytes
- movd xmm1, [par1+par3] ; upper 12 bytes of both registers are zero and compare equal
- movd xmm2, [par2+par3]
- pcmpeqb xmm1, xmm2 ; compare 4 bytes
- pmovmskb eax, xmm1 ; get byte mask
- xor eax, par4d ; not ax
- jnz A701 ; difference found
- add par3, 4
- jz A901
-
-A400: ; less than 4 bytes left
- cmp par3, -2
- ja A500
- movzx eax, word [par1+par3]
- movzx par4d, word [par2+par3]
- sub eax, par4d ; nonzero if either of the two bytes differs
- jnz A800 ; difference in byte 0 or 1
- add par3, 2
- jz A901
-
-A500: ; less than 2 bytes left
- test par3, par3
- jz A901 ; no bytes left
-
-A600: ; one byte left
- movzx eax, byte [par1+par3]
- movzx par4d, byte [par2+par3]
- sub eax, par4d ; return result
- ret
-
-A700: ; difference found. find position
- vzeroupper
-A701:
- bsf eax, eax ; index of the first differing byte
- add par3, rax
- movzx eax, byte [par1+par3]
- movzx par4d, byte [par2+par3]
- sub eax, par4d ; return result
- ret
-
-A800: ; difference in byte 0 or 1
- neg al ; carry set if the low bytes differ
- sbb par3, -1 ; add 1 to par3 if al == 0
- movzx eax, byte [par1+par3]
- movzx par4d, byte [par2+par3]
- sub eax, par4d ; return result
- ret
-
-A900: ; equal
- vzeroupper
-A901: xor eax, eax ; return 0
- ret
-
-
-memcmpSSE2: ; SSE2 version. Use xmm register
-memcmpSSE2@: ; internal reference
-
- add par1, par3 ; use negative index from end of memory block
- add par2, par3
- neg par3 ; par3 = -count; zero flag set if count = 0
- jz S900
- mov par4d, 0FFFFH ; mask for inverting a 16-bit byte mask
- cmp par3, -16
- ja S200 ; unsigned compare: taken when fewer than 16 bytes remain
-
-S100: ; loop comparing 16 bytes
- movdqu xmm1, [par1+par3]
- movdqu xmm2, [par2+par3]
- pcmpeqb xmm1, xmm2 ; compare 16 bytes
- pmovmskb eax, xmm1 ; get byte mask
- xor eax, par4d ; not ax
- jnz S700 ; difference found
- add par3, 16
- jz S900 ; finished, equal
- cmp par3, -16
- jna S100 ; next 16 bytes
-
-S200: ; less than 16 bytes left
- cmp par3, -8
- ja S300
- ; compare 8 bytes
- movq xmm1, [par1+par3] ; upper 8 bytes of both registers are zero and compare equal
- movq xmm2, [par2+par3]
- pcmpeqb xmm1, xmm2 ; compare 8 bytes
- pmovmskb eax, xmm1 ; get byte mask
- xor eax, par4d ; not ax
- jnz S700 ; difference found
- add par3, 8
- jz S900
-
-S300: ; less than 8 bytes left
- cmp par3, -4
- ja S400
- ; compare 4 bytes
- movd xmm1, [par1+par3] ; upper 12 bytes of both registers are zero and compare equal
- movd xmm2, [par2+par3]
- pcmpeqb xmm1, xmm2 ; compare 4 bytes
- pmovmskb eax, xmm1 ; get byte mask
- xor eax, par4d ; not ax
- jnz S700 ; difference found
- add par3, 4
- jz S900
-
-S400: ; less than 4 bytes left
- cmp par3, -2
- ja S500
- movzx eax, word [par1+par3]
- movzx par4d, word [par2+par3]
- sub eax, par4d ; nonzero if either of the two bytes differs
- jnz S800 ; difference in byte 0 or 1
- add par3, 2
- jz S900
-
-S500: ; less than 2 bytes left
- test par3, par3
- jz S900 ; no bytes left
-
- ; one byte left
- movzx eax, byte [par1+par3]
- movzx par4d, byte [par2+par3]
- sub eax, par4d ; return result
- ret
-
-S700: ; difference found. find position
- bsf eax, eax ; index of the first differing byte
- add par3, rax
- movzx eax, byte [par1+par3]
- movzx par4d, byte [par2+par3]
- sub eax, par4d ; return result
- ret
-
-S800: ; difference in byte 0 or 1
- neg al ; carry set if the low bytes differ
- sbb par3, -1 ; add 1 to par3 if al == 0
-S820: movzx eax, byte [par1+par3] ; (label S820 is not referenced in this file)
- movzx par4d, byte [par2+par3]
- sub eax, par4d ; return result
- ret
-
-S900: ; equal
- xor eax, eax ; return 0
- ret
-
-
-; CPU dispatching for memcmp. This is executed only once
-memcmpCPUDispatch:
- push par1 ; save the three memcmp arguments across the call
- push par2
- push par3
- call InstructionSet ; get supported instruction set
- ; SSE2 always supported
- lea par4, [memcmpSSE2@]
- cmp eax, 13 ; check AVX2
- jb Q100
- ; AVX2 supported
- lea par4, [memcmpAVX2@]
-Q100: ; save pointer
- mov qword [memcmpDispatch], par4 ; later calls jump straight to the selected version
-; Continue in appropriate version of memcmp
- pop par3 ; restore the arguments
- pop par2
- pop par1
- jmp par4
-
-
-SECTION .data
-align 16
-
-
-; Pointer to appropriate version.
-; This initially points to memcmpCPUDispatch. memcmpCPUDispatch will
-; change this to the appropriate version of memcmp, so that
-; memcmpCPUDispatch is only executed once:
-memcmpDispatch DQ memcmpCPUDispatch
-
diff --git a/contrib/libs/asmlib/memcpy64.asm b/contrib/libs/asmlib/memcpy64.asm
deleted file mode 100644
index d590990b99..0000000000
--- a/contrib/libs/asmlib/memcpy64.asm
+++ /dev/null
@@ -1,1332 +0,0 @@
-%include "defs.asm"
-
-;************************* memcpy64.asm ************************************
-; Author: Agner Fog
-; Date created: 2008-07-19
-; Last modified: 2016-11-12 (patched version with AVX512 support removed)
-;
-; Description:
-; Faster version of the standard memcpy function:
-; void * A_memcpy(void *dest, const void *src, size_t count);
-; Copies 'count' bytes from 'src' to 'dest'
-;
-; Overriding standard function memcpy:
-; The alias ?OVR_memcpy is changed to _memcpy in the object file if
-; it is desired to override the standard library function memcpy.
-;
-; The function uses non-temporal writes to bypass the cache when the size is
-; bigger than half the size of the largest_level cache. This limit can be
-; read with GetMemcpyCacheLimit and changed with SetMemcpyCacheLimit
-; C++ prototypes:
-; extern "C" size_t GetMemcpyCacheLimit(); // in memcpy64.asm
-; extern "C" void SetMemcpyCacheLimit(); // in memmove64.asm
-; extern "C" void SetMemcpyCacheLimit1(); // used internally
-;
-; Position-independent code is generated if POSITIONINDEPENDENT is defined.
-;
-; CPU dispatching included for SSE2, Suppl-SSE3 and AVX instruction sets.
-;
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global A_memcpy: function ; Function A_memcpy
-global EXP(memcpy): function ; ?OVR removed if standard function memcpy overridden
-global memcpySSE2: function ; Version for processors with only SSE2
-global memcpySSSE3: function ; Version for processors with SSSE3
-global memcpyU: function ; Version for processors with fast unaligned read
-global memcpyU256: function ; Version for processors with fast 256-bit read/write
-
-global GetMemcpyCacheLimit: function ; Get the size limit for bypassing cache when copying with memcpy and memmove
-global SetMemcpyCacheLimit1: function ; Set the size limit for bypassing cache when copying with memcpy
-
-
-; Imported from instrset64.asm
-extern InstructionSet ; Instruction set for CPU dispatcher
-
-; Imported from unalignedisfaster64.asm:
-extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
-extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
-
-; Imported from cachesize64.asm:
-extern DataCacheSize ; Gets size of data cache
-
-
-; Prolog for every memcpy version: leaves rdi = dest, rsi = src, rcx = count, r9 = copy of dest
-%MACRO PROLOGM 0
-%IFDEF WINDOWS
- push rsi ; saved here, restored by RETURNM
- push rdi ; saved here, restored by RETURNM
- mov rdi, rcx ; dest
- mov r9, rcx ; dest, kept for the return value (see RETURNM)
- mov rsi, rdx ; src
- mov rcx, r8 ; count
-%ELSE ; Unix
- mov rcx, rdx ; count
- mov r9, rdi ; dest, kept for the return value (see RETURNM)
-%ENDIF
-%ENDM
-
-; Return sequence matching PROLOGM: restore saved registers and return dest
-%MACRO RETURNM 0
-%IFDEF WINDOWS
- pop rdi ; undo the pushes made by PROLOGM, in reverse order
- pop rsi
-%ENDIF
- mov rax, r9 ; Return value = dest
- ret
-%ENDM
-
-
-SECTION .text align=16
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Common entry for dispatch
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
-; Function entry (both names share the same indirect jump):
-A_memcpy:
-EXP(memcpy):
- jmp qword [memcpyDispatch] ; memcpyCPUDispatch at first, then the version it selected
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; AVX Version for processors with fast unaligned read and fast 32 bytes write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memcpyU256: ; global label
-memcpyU256@: ; local label
- PROLOGM
- cmp rcx, 40H
- jb A1000 ; Use simpler code if count < 64
-
- ; count >= 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 1FH
- jz B3100 ; Skip if dest aligned by 32
-
- ; edx = size of first partial block, 1 - 31 bytes
- test dl, 3
- jz B3030
- test dl, 1
- jz B3020
- ; move 1 byte
- movzx eax, byte [rsi]
- mov [rdi], al
- inc rsi
- inc rdi
-B3020: test dl, 2
- jz B3030
- ; move 2 bytes
- movzx eax, word [rsi]
- mov [rdi], ax
- add rsi, 2
- add rdi, 2
-B3030: test dl, 4
- jz B3040
- ; move 4 bytes
- mov eax, [rsi]
- mov [rdi], eax
- add rsi, 4
- add rdi, 4
-B3040: test dl, 8
- jz B3050
- ; move 8 bytes
- mov rax, [rsi]
- mov [rdi], rax
- add rsi, 8
- add rdi, 8
-B3050: test dl, 16
- jz B3060
- ; move 16 bytes
- movups xmm0, [rsi]
- movaps [rdi], xmm0
- add rsi, 16
- add rdi, 16
-B3060: sub rcx, rdx
-
-B3100: ; Now dest is aligned by 32. Any partial block has been moved
-
- ; Set up for loop moving 32 bytes per iteration:
- mov rdx, rcx ; Save count
- and rcx, -20H ; Round down to nearest multiple of 32
- add rsi, rcx ; Point to the end
- add rdi, rcx ; Point to the end
- sub rdx, rcx ; Remaining data after loop
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja I3100 ; Use non-temporal store if count > CacheBypassLimit
- neg rcx ; Negative index from the end
-
-H3100: ; copy -rcx bytes in blocks of 32 bytes.
-
- ; Check for false memory dependence: The CPU may falsely assume
- ; a partial overlap between the written destination and the following
- ; read source if source is unaligned and
- ; (src-dest) modulo 4096 is close to 4096
- test sil, 1FH
- jz H3110 ; aligned
- mov eax, esi
- sub eax, edi
- and eax, 0FFFH ; modulo 4096
- cmp eax, 1000H - 200H
- ja J3100
-
-align 16
-H3110: ; main copy loop, 32 bytes at a time
- ; rcx has negative index from the end, counting up to zero
- vmovups ymm0, [rsi+rcx]
- vmovaps [rdi+rcx], ymm0
- add rcx, 20H
- jnz H3110
- sfence
- vzeroupper ; end of AVX mode
-
-H3120: ; Move the remaining edx bytes (0 - 31):
- add rsi, rdx ; point rsi and rdi just past the last byte to copy
- add rdi, rdx
- neg rdx ; rdx = negative index from the end
- jz H3500 ; Skip if no more data
- ; move 16-8-4-2-1 bytes (aligned stores for the 16-byte step)
- cmp edx, -10H
- jg H3200 ; fewer than 16 bytes left
- ; move 16 bytes
- movups xmm0, [rsi+rdx]
- movaps [rdi+rdx], xmm0
- add rdx, 10H
-H3200: cmp edx, -8
- jg H3210 ; fewer than 8 bytes left
- ; move 8 bytes
- movq xmm0, qword [rsi+rdx]
- movq qword [rdi+rdx], xmm0
- add rdx, 8
- jz H500 ; Early skip if nothing is left (H500 is memcpyU's exit, the same RETURNM as H3500)
-H3210: cmp edx, -4
- jg H3220 ; fewer than 4 bytes left
- ; move 4 bytes
- mov eax, [rsi+rdx]
- mov [rdi+rdx], eax
- add rdx, 4
-H3220: cmp edx, -2
- jg H3230 ; fewer than 2 bytes left
- ; move 2 bytes
- movzx eax, word [rsi+rdx]
- mov [rdi+rdx], ax
- add rdx, 2
-H3230: cmp edx, -1
- jg H3500 ; nothing left
- ; move 1 byte
- movzx eax, byte [rsi+rdx]
- mov [rdi+rdx], al
-H3500: ; finished
- RETURNM
-
-I3100: ; non-temporal move
- neg rcx ; Negative index from the end
-
-align 16
-I3110: ; main copy loop, 32 bytes at a time
- ; rcx has negative index from the end, counting up to zero
- vmovups ymm0, [rsi+rcx]
- vmovntps [rdi+rcx], ymm0
- add rcx, 20H
- jnz I3110
- sfence
- vzeroupper ; end of AVX mode
- jmp H3120 ; Move the remaining edx bytes (0 - 31)
-
-
-align 16
-J3100: ; There is a false memory dependence.
- ; check if src and dest overlap, if not then it is safe
- ; to copy backwards to avoid false memory dependence
-%if 1
- ; Use this version if you want consistent behavior in the case
- ; where dest > src and overlap. However, this case is undefined
- ; anyway because part of src is overwritten before copying
- push rdx ; rdx (remaining byte count) is overwritten by cqo below
- mov rax, rsi
- sub rax, rdi ; rax = src-dest
- cqo ; rdx = sign extension of rax (0 or -1)
- xor rax, rdx
- sub rax, rdx ; abs(src-dest)
- neg rcx ; size (rcx held the negative index)
- pop rdx ; restore rdx
- cmp rax, rcx
- jnb J3110 ; abs(src-dest) >= size: no overlap, copy backwards
- neg rcx ; restore rcx
- jmp H3110 ; overlap between src and dest. Can't copy backwards
-%else
- ; save time by not checking the case that is undefined anyway
- mov rax, rsi
- sub rax, rdi
- neg rcx ; size
- cmp rax, rcx
- jnb J3110 ; OK to copy backwards
- ; must copy forwards
- neg rcx ; restore rcx
- jmp H3110 ; copy forwards
-
-%endif
-
-J3110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
- push rsi
- push rdi
- sub rsi, rcx
- sub rdi, rcx
-J3120: ; loop backwards
- vmovups ymm0, [rsi+rcx-20H]
- vmovaps [rdi+rcx-20H], ymm0
- sub rcx, 20H
- jnz J3120
- sfence
- vzeroupper
- pop rdi
- pop rsi
- jmp H3120
-
-align 16
- ; count < 64. Move 32-16-8-4-2-1 bytes
- ; multiple CPU versions (SSSE3 and above)
-A1000: add rsi, rcx ; end of src
- add rdi, rcx ; end of dest
- neg rcx ; negative index from the end
- cmp ecx, -20H
- jg A1100
- ; move 32 bytes
- ; movdqu is faster than 64-bit moves on processors with SSSE3
- movups xmm0, [rsi+rcx]
- movups xmm1, [rsi+rcx+10H]
- movups [rdi+rcx], xmm0
- movups [rdi+rcx+10H], xmm1
- add rcx, 20H
-A1100: cmp ecx, -10H
- jg A1200
- ; move 16 bytes
- movups xmm0, [rsi+rcx]
- movups [rdi+rcx], xmm0
- add rcx, 10H
-A1200: cmp ecx, -8
- jg A1300
- ; move 8 bytes
- mov rax, qword [rsi+rcx]
- mov qword [rdi+rcx], rax
- add rcx, 8
-A1300: cmp ecx, -4
- jg A1400
- ; move 4 bytes
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
- add rcx, 4
- jz A1900 ; early out if count divisible by 4
-A1400: cmp ecx, -2
- jg A1500
- ; move 2 bytes
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
- add rcx, 2
-A1500: cmp ecx, -1
- jg A1900
- ; move 1 byte
- movzx eax, byte [rsi+rcx]
- mov [rdi+rcx], al
-A1900: ; finished
- RETURNM
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Version for processors with fast unaligned read and fast 16 bytes write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memcpyU: ; global label
-memcpyU@: ; local label
- PROLOGM
- cmp rcx, 40H
- jb A1000 ; Use simpler code if count < 64
-
- ; count >= 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 0FH
- jz B2100 ; Skip if dest aligned by 16
-
- ; edx = size of first partial block, 1 - 15 bytes
- test dl, 3
- jz B2030
- test dl, 1
- jz B2020
- ; move 1 byte
- movzx eax, byte [rsi]
- mov [rdi], al
- inc rsi
- inc rdi
-B2020: test dl, 2
- jz B2030
- ; move 2 bytes
- movzx eax, word [rsi]
- mov [rdi], ax
- add rsi, 2
- add rdi, 2
-B2030: test dl, 4
- jz B2040
- ; move 4 bytes
- mov eax, [rsi]
- mov [rdi], eax
- add rsi, 4
- add rdi, 4
-B2040: test dl, 8
- jz B2050
- ; move 8 bytes
- mov rax, [rsi]
- mov [rdi], rax
- add rsi, 8
- add rdi, 8
-B2050: sub rcx, rdx
-B2100: ; Now dest is aligned by 16. Any partial block has been moved
-
- ; Set up for loop moving 32 bytes per iteration:
- mov rdx, rcx ; Save count
- and rcx, -20H ; Round down to nearest multiple of 32
- add rsi, rcx ; Point to the end
- add rdi, rcx ; Point to the end
- sub rdx, rcx ; Remaining data after loop
-
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja I100 ; Use non-temporal store if count > CacheBypassLimit
- neg rcx ; Negative index from the end
-
-H100: ; copy -rcx bytes in blocks of 32 bytes.
-
- ; Check for false memory dependence: The CPU may falsely assume
- ; a partial overlap between the written destination and the following
- ; read source if source is unaligned and
- ; (src-dest) modulo 4096 is close to 4096
- test sil, 0FH
- jz H110 ; aligned
- mov eax, esi
- sub eax, edi
- and eax, 0FFFH ; modulo 4096
- cmp eax, 1000H - 200H
- ja J100
-
-H110: ; main copy loop, 32 bytes at a time
- ; rcx has negative index from the end, counting up to zero
- movups xmm0, [rsi+rcx]
- movups xmm1, [rsi+rcx+10H]
- movaps [rdi+rcx], xmm0
- movaps [rdi+rcx+10H], xmm1
- add rcx, 20H
- jnz H110
-
-H120: ; Move the remaining edx bytes (0 - 31):
- add rsi, rdx
- add rdi, rdx
- neg rdx
- jz H500 ; Skip if no more data
- ; move 16-8-4-2-1 bytes, aligned
- cmp edx, -10H
- jg H200
- ; move 16 bytes
- movups xmm0, [rsi+rdx]
- movaps [rdi+rdx], xmm0
- add rdx, 10H
-H200: cmp edx, -8
- jg H210
- ; move 8 bytes
- movq xmm0, qword [rsi+rdx]
- movq qword [rdi+rdx], xmm0
- add rdx, 8
- jz H500 ; Early skip if count divisible by 8
-H210: cmp edx, -4
- jg H220
- ; move 4 bytes
- mov eax, [rsi+rdx]
- mov [rdi+rdx], eax
- add rdx, 4
-H220: cmp edx, -2
- jg H230
- ; move 2 bytes
- movzx eax, word [rsi+rdx]
- mov [rdi+rdx], ax
- add rdx, 2
-H230: cmp edx, -1
- jg H500
- ; move 1 byte
- movzx eax, byte [rsi+rdx]
- mov [rdi+rdx], al
-H500: ; finished
- RETURNM
-
-I100: ; non-temporal move
- neg rcx ; Negative index from the end
-
-align 16
-I110: ; main copy loop, 32 bytes at a time
- ; rcx has negative index from the end, counting up to zero
- movups xmm0, [rsi+rcx]
- movups xmm1, [rsi+rcx+10H]
- movntps [rdi+rcx], xmm0
- movntps [rdi+rcx+10H], xmm1
- add rcx, 20H
- jnz I110
- sfence
- jmp H120 ; Move the remaining edx bytes (0 - 31):
-
-
-align 16
-J100: ; There is a false memory dependence.
- ; check if src and dest overlap, if not then it is safe
- ; to copy backwards to avoid false memory dependence
-%if 1
- ; Use this version if you want consistent behavior in the case
- ; where dest > src and overlap. However, this case is undefined
- ; anyway because part of src is overwritten before copying
- push rdx ; rdx (remaining byte count) is overwritten by cqo below
- mov rax, rsi
- sub rax, rdi ; rax = src-dest
- cqo ; rdx = sign extension of rax (0 or -1)
- xor rax, rdx
- sub rax, rdx ; abs(src-dest)
- neg rcx ; size (rcx held the negative index)
- pop rdx ; restore rdx
- cmp rax, rcx
- jnb J110 ; abs(src-dest) >= size: no overlap, copy backwards
- neg rcx ; restore rcx
- jmp H110 ; overlap between src and dest. Can't copy backwards
-%else
- ; save time by not checking the case that is undefined anyway
- mov rax, rsi
- sub rax, rdi
- neg rcx ; size
- cmp rax, rcx
- jnb J110 ; OK to copy backwards
- ; must copy forwards
- neg rcx ; restore rcx
- jmp H110 ; copy forwards
-
-%endif
-
-J110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
- push rsi
- push rdi
- sub rsi, rcx
- sub rdi, rcx
-J120: ; loop backwards
- movups xmm1, [rsi+rcx-20H]
- movups xmm0, [rsi+rcx-10H]
- movaps [rdi+rcx-20H], xmm1
- movaps [rdi+rcx-10H], xmm0
- sub rcx, 20H
- jnz J120
- pop rdi
- pop rsi
- jmp H120
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Version for processors with SSSE3. Aligned read + shift + aligned write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memcpySSSE3: ; global label
-memcpySSSE3@: ; local label
- PROLOGM
- cmp rcx, 40H
- jb A1000 ; Use simpler code if count < 64
-
- ; count >= 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 0FH
- jz B1200 ; Skip if dest aligned by 16
-
- ; edx = size of first partial block, 1 - 15 bytes
- test dl, 3
- jz B1030
- test dl, 1
- jz B1020
- ; move 1 byte
- movzx eax, byte [rsi]
- mov [rdi], al
- inc rsi
- inc rdi
-B1020: test dl, 2
- jz B1030
- ; move 2 bytes
- movzx eax, word [rsi]
- mov [rdi], ax
- add rsi, 2
- add rdi, 2
-B1030: test dl, 4
- jz B1040
- ; move 4 bytes
- mov eax, [rsi]
- mov [rdi], eax
- add rsi, 4
- add rdi, 4
-B1040: test dl, 8
- jz B1050
- ; move 8 bytes
- mov rax, [rsi]
- mov [rdi], rax
- add rsi, 8
- add rdi, 8
-B1050: sub rcx, rdx
-B1200: ; Now dest is aligned by 16. Any partial block has been moved
- ; Find alignment of src modulo 16 at this point:
- mov eax, esi
- and eax, 0FH
-
- ; Set up for loop moving 32 bytes per iteration:
- mov edx, ecx ; Save count (lower 32 bits)
- and rcx, -20H ; Round down count to nearest multiple of 32
- add rsi, rcx ; Point to the end
- add rdi, rcx ; Point to the end
- sub edx, ecx ; Remaining data after loop (0-31)
- sub rsi, rax ; Nearest preceding aligned block of src
-
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja B1400 ; Use non-temporal store if count > CacheBypassLimit
- neg rcx ; Negative index from the end
-
- ; Dispatch to different codes depending on src alignment
- lea r8, [AlignmentDispatchSSSE3]
- jmp near [r8+rax*8]
-
-B1400: neg rcx
- ; Dispatch to different codes depending on src alignment
- lea r8, [AlignmentDispatchNT]
- jmp near [r8+rax*8]
-
-
-align 16
-C100: ; Code for aligned src. SSE2 and SSSE3 versions
- ; The nice case, src and dest have same alignment.
-
- ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm0, [rsi+rcx]
- movaps xmm1, [rsi+rcx+10H]
- movaps [rdi+rcx], xmm0
- movaps [rdi+rcx+10H], xmm1
- add rcx, 20H
- jnz C100
-
- ; Move the remaining edx bytes (0 - 31):
- add rsi, rdx
- add rdi, rdx
- neg rdx
- jz C500 ; Skip if no more data
- ; move 16-8-4-2-1 bytes, aligned
- cmp edx, -10H
- jg C200
- ; move 16 bytes
- movaps xmm0, [rsi+rdx]
- movaps [rdi+rdx], xmm0
- add rdx, 10H
-C200: cmp edx, -8
- jg C210
- ; move 8 bytes
- mov rax, [rsi+rdx]
- mov [rdi+rdx], rax
- add rdx, 8
- jz C500 ; Early skip if count divisible by 8
-C210: cmp edx, -4
- jg C220
- ; move 4 bytes
- mov eax, [rsi+rdx]
- mov [rdi+rdx], eax
- add rdx, 4
-C220: cmp edx, -2
- jg C230
- ; move 2 bytes
- movzx eax, word [rsi+rdx]
- mov [rdi+rdx], ax
- add rdx, 2
-C230: cmp edx, -1
- jg C500
- ; move 1 byte
- movzx eax, byte [rsi+rdx]
- mov [rdi+rdx], al
-C500: ; finished
- RETURNM
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Version for processors with SSE2. Aligned read + shift + aligned write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-memcpySSE2: ; global label
-memcpySSE2@: ; local label
- PROLOGM
- cmp rcx, 40H
- jae B0100 ; count >= 64: go to the block-copy code; otherwise fall through
-
- ; count < 64. Move 32-16-8-4-2-1 bytes
- add rsi, rcx ; end of src
- add rdi, rcx ; end of dest
- neg rcx ; negative index from the end
- cmp ecx, -20H
- jg A100 ; fewer than 32 bytes left
- ; move 32 bytes
- ; mov r64 is faster than movdqu on Intel Pentium M and Core 1
- ; movdqu is fast on Nehalem and later
- mov rax, [rsi+rcx]
- mov rdx, [rsi+rcx+8]
- mov [rdi+rcx], rax
- mov [rdi+rcx+8], rdx
- mov rax, qword [rsi+rcx+10H]
- mov rdx, qword [rsi+rcx+18H]
- mov qword [rdi+rcx+10H], rax
- mov qword [rdi+rcx+18H], rdx
- add rcx, 20H
-A100: cmp ecx, -10H
- jg A200 ; fewer than 16 bytes left
- ; move 16 bytes
- mov rax, [rsi+rcx]
- mov rdx, [rsi+rcx+8]
- mov [rdi+rcx], rax
- mov [rdi+rcx+8], rdx
- add rcx, 10H
-A200: cmp ecx, -8
- jg A300 ; fewer than 8 bytes left
- ; move 8 bytes
- mov rax, qword [rsi+rcx]
- mov qword [rdi+rcx], rax
- add rcx, 8
-A300: cmp ecx, -4
- jg A400 ; fewer than 4 bytes left
- ; move 4 bytes
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
- add rcx, 4
- jz A900 ; early out if count divisible by 4
-A400: cmp ecx, -2
- jg A500 ; fewer than 2 bytes left
- ; move 2 bytes
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
- add rcx, 2
-A500: cmp ecx, -1
- jg A900 ; nothing left
- ; move 1 byte
- movzx eax, byte [rsi+rcx]
- mov [rdi+rcx], al
-A900: ; finished
- RETURNM
-
-B0100: ; count >= 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 0FH
- jz B0200 ; Skip if dest aligned by 16
-
- ; edx = size of first partial block, 1 - 15 bytes
- test dl, 3
- jz B0030
- test dl, 1
- jz B0020
- ; move 1 byte
- movzx eax, byte [rsi]
- mov [rdi], al
- inc rsi
- inc rdi
-B0020: test dl, 2
- jz B0030
- ; move 2 bytes
- movzx eax, word [rsi]
- mov [rdi], ax
- add rsi, 2
- add rdi, 2
-B0030: test dl, 4
- jz B0040
- ; move 4 bytes
- mov eax, [rsi]
- mov [rdi], eax
- add rsi, 4
- add rdi, 4
-B0040: test dl, 8
- jz B0050
- ; move 8 bytes
- mov rax, [rsi]
- mov [rdi], rax
- add rsi, 8
- add rdi, 8
-B0050: sub rcx, rdx
-B0200: ; Now dest is aligned by 16. Any partial block has been moved
-
- ; Redundant second alignment step (it would not always work if count < 64):
- ; dest is already aligned by 16 here, so edx = 0 and the jz below is always taken
- mov edx, edi
- neg edx
- and edx, 0FH
- jz B300 ; Skip if dest aligned by 16
-
- ; rdx = size of first partial block, 1 - 15 bytes
- add rsi, rdx
- add rdi, rdx
- sub rcx, rdx
- neg rdx ; negative index from the end of the partial block
- cmp edx, -8
- jg B200
- ; move 8 bytes
- mov rax, [rsi+rdx]
- mov [rdi+rdx], rax
- add rdx, 8
-B200: cmp edx, -4
- jg B210
- ; move 4 bytes
- mov eax, [rsi+rdx]
- mov [rdi+rdx], eax
- add rdx, 4
- jz B300 ; early out if aligned by 4
-B210: cmp edx, -2
- jg B220
- ; move 2 bytes
- movzx eax, word [rsi+rdx]
- mov [rdi+rdx], ax
- add rdx, 2
-B220: cmp edx, -1
- jg B300
- ; move 1 byte
- movzx eax, byte [rsi+rdx]
- mov [rdi+rdx], al
-
-B300: ; Now dest is aligned by 16. Any partial block has been moved
- ; Find alignment of src modulo 16 at this point:
- mov eax, esi
- and eax, 0FH
-
- ; Set up for loop moving 32 bytes per iteration:
- mov edx, ecx ; Save count (lower 32 bits)
- and rcx, -20H ; Round down count to nearest multiple of 32
- add rsi, rcx ; Point to the end
- add rdi, rcx ; Point to the end
- sub edx, ecx ; Remaining data after loop (0-31)
- sub rsi, rax ; Nearest preceding aligned block of src
-
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja B400 ; Use non-temporal store if count > CacheBypassLimit
- neg rcx ; Negative index from the end
-
- ; Dispatch to different codes depending on src alignment
- lea r8, [AlignmentDispatchSSE2]
- jmp near [r8+rax*8]
-
-B400: neg rcx
- ; Dispatch to different codes depending on src alignment
- lea r8, [AlignmentDispatchNT]
- jmp near [r8+rax*8]
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Macros and alignment jump tables
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Macros for each src alignment, SSE2 instruction set:
-; Make separate code for each alignment u because the shift instructions
-; have the shift count as a constant:
-
-%MACRO MOVE_UNALIGNED_SSE2 2 ; u, nt
-; Move rcx + rdx bytes of data
-; Source is misaligned. (src-dest) modulo 16 = %1
-; %2 = 1 if non-temporal store desired
-; eax = %1
-; rsi = src - %1 = nearest preceding 16-bytes boundary
-; rdi = dest (aligned)
-; rcx = - (count rounded down to nearest divisible by 32)
-; edx = remaining bytes to move after loop
- movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- movdqa xmm2, [rsi+rcx+20H]
- movdqa xmm3, xmm1 ; Copy because used twice
- psrldq xmm0, %1 ; shift right
- pslldq xmm1, 16-%1 ; shift left
- por xmm0, xmm1 ; combine blocks
- %IF %2 == 0
- movdqa [rdi+rcx], xmm0 ; Save aligned
- %ELSE
- movntdq [rdi+rcx], xmm0 ; non-temporal save
- %ENDIF
- movdqa xmm0, xmm2 ; Save for next iteration
- psrldq xmm3, %1 ; shift right
- pslldq xmm2, 16-%1 ; shift left
- por xmm3, xmm2 ; combine blocks
- %IF %2 == 0
- movdqa [rdi+rcx+10H], xmm3 ; Save aligned
- %ELSE
- movntdq [rdi+rcx+10H], xmm3 ; non-temporal save
- %ENDIF
- add rcx, 20H ; Loop through negative values up to zero
- jnz %%L1
-
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-bytes block to move
- movdqa xmm1, [rsi+rdx+10H]
- psrldq xmm0, %1 ; shift right
- pslldq xmm1, 16-%1 ; shift left
- por xmm0, xmm1 ; combine blocks
- %IF %2 == 0
- movdqa [rdi+rdx], xmm0 ; Save aligned
- %ELSE
- movntdq [rdi+rdx], xmm0 ; non-temporal save
- %ENDIF
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %2 == 1
- sfence
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-%MACRO MOVE_UNALIGNED_SSE2_4 1 ; nt
-; Special case for u = 4
-; %1 = 1 if non-temporal store desired
- movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm1, [rsi+rcx+10H] ; Read next block aligned
- movss xmm0, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
- shufps xmm0, xmm0, 00111001B ; Rotate
- %IF %1 == 0
- movaps [rdi+rcx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm0 ; Non-temporal save
- %ENDIF
- movaps xmm0, [rsi+rcx+20H] ; Read the block after that; it stays in xmm0 for the next iteration
- movss xmm1, xmm0
- shufps xmm1, xmm1, 00111001B
- %IF %1 == 0
- movaps [rdi+rcx+10H], xmm1 ; Save aligned
- %ELSE
- movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
- %ENDIF
- add rcx, 20H ; Loop through negative values up to zero
- jnz %%L1
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-bytes block to move
- movaps xmm1, [rsi+rdx+10H] ; Read next block aligned
- movss xmm0, xmm1
- shufps xmm0, xmm0, 00111001B
- %IF %1 == 0
- movaps [rdi+rdx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rdx], xmm0 ; Non-temporal save
- %ENDIF
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %1 == 1
- sfence
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-%MACRO MOVE_UNALIGNED_SSE2_8 1 ; nt
-; Special case for u = 8
-; %1 = 1 if non-temporal store desired
- movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm1, [rsi+rcx+10H] ; Read next block aligned
- movsd xmm0, xmm1 ; Moves 8 bytes, leaves remaining bytes unchanged
- shufps xmm0, xmm0, 01001110B ; Rotate
- %IF %1 == 0
- movaps [rdi+rcx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm0 ; Non-temporal save
- %ENDIF
- movaps xmm0, [rsi+rcx+20H] ; Read the block after that; it stays in xmm0 for the next iteration
- movsd xmm1, xmm0
- shufps xmm1, xmm1, 01001110B
- %IF %1 == 0
- movaps [rdi+rcx+10H], xmm1 ; Save aligned
- %ELSE
- movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
- %ENDIF
- add rcx, 20H ; Loop through negative values up to zero
- jnz %%L1
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-bytes block to move
- movaps xmm1, [rsi+rdx+10H] ; Read next block aligned
- movsd xmm0, xmm1
- shufps xmm0, xmm0, 01001110B
- %IF %1 == 0
- movaps [rdi+rdx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rdx], xmm0 ; Non-temporal save
- %ENDIF
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %1 == 1
- sfence
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-%MACRO MOVE_UNALIGNED_SSE2_12 1 ; nt
-; Special case for u = 12
-; %1 = 1 if non-temporal store desired
- movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
- shufps xmm0, xmm0, 10010011B
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- movaps xmm2, [rsi+rcx+20H]
- shufps xmm1, xmm1, 10010011B
- shufps xmm2, xmm2, 10010011B
- movaps xmm3, xmm2 ; Keep a copy of the shuffled second block for the next iteration
- movss xmm2, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
- movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
- %IF %1 == 0
- movaps [rdi+rcx], xmm1 ; Save aligned
- movaps [rdi+rcx+10H], xmm2 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm1 ; Non-temporal save
- movntps [rdi+rcx+10H], xmm2 ; Non-temporal save
- %ENDIF
- movaps xmm0, xmm3 ; Save for next iteration
- add rcx, 20H ; Loop through negative values up to zero
- jnz %%L1
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-bytes block to move
- movaps xmm1, [rsi+rdx+10H] ; Read next block aligned
- shufps xmm1, xmm1, 10010011B
- movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
- %IF %1 == 0
- movaps [rdi+rdx], xmm1 ; Save aligned
- %ELSE
- movntps [rdi+rdx], xmm1 ; Non-temporal save
- %ENDIF
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %1 == 1
- sfence
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-; Macros for each src alignment, Suppl.SSE3 instruction set:
-; Make separate code for each alignment u because the palignr instruction
-; has the shift count as a constant:
-
-%MACRO MOVE_UNALIGNED_SSSE3 1 ; u
-; Move rcx + rdx bytes of data
-; Source is misaligned. (src-dest) modulo 16 = %1
-; eax = %1
-; rsi = src - %1 = nearest preceding 16-bytes boundary
-; rdi = dest (aligned)
-; rcx = - (count rounded down to nearest divisible by 32)
-; edx = remaining bytes to move after loop
- movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
-
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movdqa xmm2, [rsi+rcx+10H] ; Read next two blocks
- movdqa xmm3, [rsi+rcx+20H]
- movdqa xmm1, xmm0 ; Save xmm0
- movdqa xmm0, xmm3 ; Save for next iteration
- palignr xmm3, xmm2, %1 ; Combine parts into aligned block
- palignr xmm2, xmm1, %1 ; Combine parts into aligned block
- movdqa [rdi+rcx], xmm2 ; Save aligned
- movdqa [rdi+rcx+10H], xmm3 ; Save aligned
- add rcx, 20H
- jnz %%L1
-
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-bytes block to move
- movdqa xmm2, [rsi+rdx+10H]
- palignr xmm2, xmm0, %1
- movdqa [rdi+rdx], xmm2
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- ; Move remaining 0 - 15 bytes
- jmp C200
-%ENDMACRO
-
-
-; Make 15 instances of SSE2 macro for each value of the alignment u.
-; These are pointed to by the jump table AlignmentDispatchSSE2 below
-; (alignments and fillers are inserted manually to minimize the number
-; of 16-bytes boundaries inside loops)
-
-align 16
-D104: MOVE_UNALIGNED_SSE2_4 0
-times 4 nop
-D108: MOVE_UNALIGNED_SSE2_8 0
-times 4 nop
-D10C: MOVE_UNALIGNED_SSE2_12 0
-times 1 nop
-D101: MOVE_UNALIGNED_SSE2 1, 0
-D102: MOVE_UNALIGNED_SSE2 2, 0
-D103: MOVE_UNALIGNED_SSE2 3, 0
-D105: MOVE_UNALIGNED_SSE2 5, 0
-D106: MOVE_UNALIGNED_SSE2 6, 0
-D107: MOVE_UNALIGNED_SSE2 7, 0
-D109: MOVE_UNALIGNED_SSE2 9, 0
-times 1 nop
-D10A: MOVE_UNALIGNED_SSE2 0AH, 0
-D10B: MOVE_UNALIGNED_SSE2 0BH, 0
-D10D: MOVE_UNALIGNED_SSE2 0DH, 0
-D10E: MOVE_UNALIGNED_SSE2 0EH, 0
-D10F: MOVE_UNALIGNED_SSE2 0FH, 0
-
-; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
-; These are pointed to by the jump table AlignmentDispatchSSSE3 below
-
-align 16
-E104: MOVE_UNALIGNED_SSSE3 4
-E108: MOVE_UNALIGNED_SSSE3 8
-E10C: MOVE_UNALIGNED_SSSE3 0CH
-E101: MOVE_UNALIGNED_SSSE3 1
-E102: MOVE_UNALIGNED_SSSE3 2
-E103: MOVE_UNALIGNED_SSSE3 3
-E105: MOVE_UNALIGNED_SSSE3 5
-E106: MOVE_UNALIGNED_SSSE3 6
-E107: MOVE_UNALIGNED_SSSE3 7
-E109: MOVE_UNALIGNED_SSSE3 9
-times 1 nop
-E10A: MOVE_UNALIGNED_SSSE3 0AH
-E10B: MOVE_UNALIGNED_SSSE3 0BH
-E10D: MOVE_UNALIGNED_SSSE3 0DH
-E10E: MOVE_UNALIGNED_SSSE3 0EH
-E10F: MOVE_UNALIGNED_SSSE3 0FH
-
-; Codes for non-temporal move. Aligned case first
-
-align 16
-F100: ; Non-temporal move, src and dest have same alignment.
- ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm0, [rsi+rcx] ; Read
- movaps xmm1, [rsi+rcx+10H]
- movntps [rdi+rcx], xmm0 ; Write non-temporal (bypass cache)
- movntps [rdi+rcx+10H], xmm1
- add rcx, 20H
- jnz F100 ; Loop through negative rcx up to zero
-
- ; Move the remaining edx bytes (0 - 31):
- add rsi, rdx
- add rdi, rdx
- neg rdx
- jz F110 ; No more data, but the stores made by the loop must still be fenced
- ; Check if we can move one more 16-bytes block
- cmp edx, -10H
- jg F110 ; Fewer than 16 bytes left: fence, then finish in C200
- ; move 16 bytes, aligned
- movaps xmm0, [rsi+rdx]
- movntps [rdi+rdx], xmm0
- add rdx, 10H
-F110: sfence ; Every exit passes here, as in the non-temporal macros, so no movntps is left unfenced
- ; move the remaining 0 - 15 bytes (C200 falls through to C500 when rdx = 0)
- jmp C200
-
-; Make 15 instances of MOVE_UNALIGNED_SSE2 macro for each value of
-; the alignment u.
-; These are pointed to by the jump table AlignmentDispatchNT below
-
-;align 16
-F104: MOVE_UNALIGNED_SSE2_4 1
-F108: MOVE_UNALIGNED_SSE2_8 1
-F10C: MOVE_UNALIGNED_SSE2_12 1
-F101: MOVE_UNALIGNED_SSE2 1, 1
-F102: MOVE_UNALIGNED_SSE2 2, 1
-F103: MOVE_UNALIGNED_SSE2 3, 1
-F105: MOVE_UNALIGNED_SSE2 5, 1
-F106: MOVE_UNALIGNED_SSE2 6, 1
-F107: MOVE_UNALIGNED_SSE2 7, 1
-F109: MOVE_UNALIGNED_SSE2 9, 1
-F10A: MOVE_UNALIGNED_SSE2 0AH, 1
-F10B: MOVE_UNALIGNED_SSE2 0BH, 1
-F10D: MOVE_UNALIGNED_SSE2 0DH, 1
-F10E: MOVE_UNALIGNED_SSE2 0EH, 1
-F10F: MOVE_UNALIGNED_SSE2 0FH, 1
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; CPU dispatcher
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-memcpyCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
- ; This part is executed only once
- push rbx ; rbx is used below to hold the selected function pointer
- push rcx ; the remaining pushes keep the registers that carry memcpy's arguments
- push rdx
- push rsi
- push rdi
- push r8
- ; set CacheBypassLimit to half the size of the largest level cache
- call GetMemcpyCacheLimit@
- mov eax, 1
- cpuid ; Get feature flags
- lea rbx, [memcpySSE2@] ; default version: SSE2
- bt ecx, 9 ; Test bit for SupplSSE3
- jnc Q100 ; no SSSE3: keep the SSE2 version
- lea rbx, [memcpySSSE3@]
- call UnalignedIsFaster ; Test if unaligned read is faster than aligned read and shift
- test eax, eax
- jz Q100 ; not faster: keep the SSSE3 version
- lea rbx, [memcpyU@]
- call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
- test eax, eax
- jz Q100 ; not faster: keep the 128-bit unaligned version
- lea rbx, [memcpyU256@]
-Q100:
- ; Insert appropriate pointer
- mov [memcpyDispatch], rbx
- mov rax, rbx ; keep the target in rax because rbx is restored below
- pop r8 ; restore in reverse order of the pushes
- pop rdi
- pop rsi
- pop rdx
- pop rcx
- pop rbx
- ; Jump according to the replaced function pointer
- jmp rax
-
-; extern "C" size_t GetMemcpyCacheLimit();
-GetMemcpyCacheLimit:
-GetMemcpyCacheLimit@: ; local label
- mov rax, [CacheBypassLimit]
- test rax, rax
- jnz U200 ; limit already set: return it
- ; Get half the size of the largest level cache
-%ifdef WINDOWS
- xor ecx, ecx ; 0 means largest level cache
-%else
- xor edi, edi ; 0 means largest level cache
-%endif
- call DataCacheSize ; get cache size
- shr rax, 1 ; half the size
- jnz U100 ; nonzero result: use it
- mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
-U100: mov [CacheBypassLimit], rax ; store the limit so it is not computed again
-U200: ret
-
-; Note: SetMemcpyCacheLimit is defined in memmove64.asm, calling SetMemcpyCacheLimit1
-SetMemcpyCacheLimit1:
-%ifdef WINDOWS
- mov rax, rcx ; requested limit
-%else
- mov rax, rdi ; requested limit
-%endif
- test rax, rax
- jnz U400 ; nonzero: store the requested value as is
- ; zero, means default
- mov [CacheBypassLimit], rax ; clear the stored limit so GetMemcpyCacheLimit recomputes it
- call GetMemcpyCacheLimit@
-U400: mov [CacheBypassLimit], rax ; store the new limit, also returned in rax
- ret
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; getDispatch, for testing only
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-getDispatch:
-mov rax,[memcpyDispatch] ; return the current contents of the dispatch pointer
-ret
-
-global getDispatch
-
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; data section. jump tables, dispatch function pointer, cache size
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Data segment must be included in function namespace
-SECTION .data
-align 16
-
-; Jump tables for alignments 0 - 15:
-; memcpySSE2 indexes AlignmentDispatchSSE2 and memcpySSSE3 indexes
-; AlignmentDispatchSSSE3 by (src modulo 16); both use AlignmentDispatchNT
-; instead when the count exceeds CacheBypassLimit.
-
-; Code pointer for each alignment for SSE2 instruction set
-AlignmentDispatchSSE2:
-DQ C100, D101, D102, D103, D104, D105, D106, D107
-DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
-
-; Code pointer for each alignment for Suppl-SSE3 instruction set
-AlignmentDispatchSSSE3:
-DQ C100, E101, E102, E103, E104, E105, E106, E107
-DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
-
-; Code pointer for each alignment for non-temporal store
-AlignmentDispatchNT:
-DQ F100, F101, F102, F103, F104, F105, F106, F107
-DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
-
-; Pointer to appropriate version.
-; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
-; change this to the appropriate version of memcpy, so that
-; memcpyCPUDispatch is only executed once:
-memcpyDispatch DQ memcpyCPUDispatch
-
-; Bypass cache by using non-temporal moves if count > CacheBypassLimit
-; The optimal value of CacheBypassLimit is difficult to estimate, but
-; a reasonable value is half the size of the largest cache:
-CacheBypassLimit: DQ 0
diff --git a/contrib/libs/asmlib/memmove64.asm b/contrib/libs/asmlib/memmove64.asm
deleted file mode 100644
index 1c61032541..0000000000
--- a/contrib/libs/asmlib/memmove64.asm
+++ /dev/null
@@ -1,1090 +0,0 @@
-%include "defs.asm"
-
-;************************* memmove64.asm ***********************************
-; Author: Agner Fog
-; Date created: 2008-07-18
-; Last modified: 2016-11-16 (patched version with AVX512 support removed)
-; Description:
-; Faster version of the standard memmove function:
-; void * A_memmove(void *dest, const void *src, size_t count);
-; Moves 'count' bytes from 'src' to 'dest'. src and dest may overlap.
-;
-; Overriding standard function memmove:
-; The alias ?OVR_memmove is changed to _memmove in the object file if
-; it is desired to override the standard library function memmove.
-;
-; CPU dispatching included for different CPUs
-;
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global A_memmove: function ; Function A_memmove
-global EXP(memmove): function ; ?OVR removed if standard function memmove overridden
-global memmoveSSE2: function ; Version for processors with only SSE2
-global memmoveSSSE3: function ; Version for processors with SSSE3
-global memmoveU: function ; Version for processors with fast unaligned read
-global memmoveU256: function ; Version for processors with fast 256-bit read/write
-global SetMemcpyCacheLimit ; Change limit for bypassing cache
-
-; Imported from memcpy64.asm:
-extern A_memcpy ; function entry
-extern memcpySSE2 ; CPU specific function entry
-extern memcpySSSE3 ; CPU specific function entry
-extern memcpyU ; CPU specific function entry
-extern memcpyU256 ; CPU specific function entry
-
-; Imported from instrset64.asm
-extern InstructionSet ; Instruction set for CPU dispatcher
-
-; Imported from unalignedisfaster64.asm:
-extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
-extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
-
-; Imported from memcpy64.asm
-extern GetMemcpyCacheLimit ; Get the size limit for bypassing cache when copying with memcpy and memmove
-extern SetMemcpyCacheLimit1 ; Set the size limit for bypassing cache when copying with memcpy
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Prolog macro. Determine if we should move forwards or backwards
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Define prolog for this function: choose between forward and backward move
-; Parameter 1 is forward function label (the matching memcpy version)
-%MACRO PROLOGM 1
-%IFDEF WINDOWS
- ; Check if dest overlaps src
- mov rax, rcx ; rax = dest
- sub rax, rdx ; rax = dest - src
- cmp rax, r8 ; compare with count
- ; We can avoid testing for dest < src by using unsigned compare:
- ; (Assume that the memory block cannot span across address 0)
- ; Must move backwards if unsigned(dest-src) < count
- jae %1 ; Jump to memcpy if we can move forwards
- push rsi ; restored by RETURNM
- push rdi ; restored by RETURNM
- mov rdi, rcx ; dest
- mov r9, rcx ; dest, kept for the return value
- mov rsi, rdx ; src
- mov rcx, r8 ; count
-%ELSE ; Unix
- ; Check if dest overlaps src
- mov rax, rdi ; rax = dest
- sub rax, rsi ; rax = dest - src
- cmp rax, rdx ; compare with count
- ; Must move backwards if unsigned(dest-src) < count
- jae %1 ; Jump to memcpy if we can move forwards
- mov rcx, rdx ; count
- mov r9, rdi ; dest, kept for the return value
-%ENDIF
-%ENDM
-
-
-; Define return from this function (counterpart of PROLOGM)
-%MACRO RETURNM 0
-%IFDEF WINDOWS
- pop rdi ; undo the pushes made by PROLOGM (reverse order)
- pop rsi
-%ENDIF
- mov rax, r9 ; Return value = dest
- ret
-%ENDMACRO
-
-
-SECTION .text align=16
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Common entry for dispatch
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; extern "C" void * A_memmove(void * dest, const void * src, size_t count);
-; Function entry. memmoveDispatch initially holds memmoveCPUDispatch:
-A_memmove:
-EXP(memmove):
- jmp qword [memmoveDispatch] ; Go to appropriate version, depending on instruction set
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; AVX Version for processors with fast unaligned read and fast 32 bytes write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memmoveU256: ; Version for processors with fast 256-bit read/write
-memmoveU256@: ; local label, selected by memmoveCPUDispatch
- PROLOGM memcpyU256 ; leaves via memcpyU256 unless a backward move is needed
-
- cmp rcx, 40H
- jb A1000 ; Use simpler code if count < 64
-
- ; count >= 64
- ; Note: this part will not always work if count < 64
- ; Calculate size of last block after last regular boundary of dest
- lea edx, [rdi+rcx] ; end of dest
- and edx, 1FH
- jz B4300 ; Skip if end of dest aligned by 32
-
- ; edx = size of last partial block, 1 - 31 bytes
- test dl, 3
- jz B4210
- test dl, 1
- jz B4201 ; B4200 if we haven't tested edx,3
- ; move 1 byte
- dec rcx
- movzx eax, byte [rsi+rcx]
- mov [rdi+rcx], al
-B4200: test dl, 2
- jz B4210
-B4201: ; move 2 bytes
- sub rcx, 2
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
-B4210: test dl, 4
- jz B4220
- ; move 4 bytes
- sub rcx, 4
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
-B4220: test dl, 8
- jz B4230
- ; move 8 bytes
- sub rcx, 8
- mov rax, [rsi+rcx]
- mov [rdi+rcx], rax
-B4230: test dl, 16
- jz B4300
- ; move 16 bytes
- sub rcx, 16
- movups xmm0, [rsi+rcx]
- movaps [rdi+rcx], xmm0
-
-B4300: ; Now end of dest is aligned by 32. Any partial block has been moved
- mov rdx, rcx
- and ecx, 1FH ; remaining size after 32 bytes blocks moved
- and rdx, -20H ; size in bytes of the 32 bytes blocks (multiple of 32)
- jz H4100
- add rsi, rcx ; bias pointers so the loop can index with rdx alone
- add rdi, rcx
-
- ; Check if count very big
- cmp rdx, [CacheBypassLimit]
- ja H4800 ; Use non-temporal store if count > CacheBypassLimit
-
-align 16
-H4000: ; 32 bytes move loop, highest address first
- vmovups ymm0, [rsi+rdx-20H]
- vmovaps [rdi+rdx-20H], ymm0
- sub rdx, 20H
- jnz H4000
- vzeroupper
-
-H4090: sub rsi, rcx ; remove the bias added before the loop
- sub rdi, rcx
-
-H4100: ; remaining 0-31 bytes
- test ecx, ecx
- jz H4600
- test cl, 10H
- jz H4200
- ; move 16 bytes
- sub ecx, 10H
- movups xmm0, [rsi+rcx]
- movaps [rdi+rcx], xmm0
- jz H4600 ; early out if count divisible by 16 (flags still from sub)
-H4200: test cl, 8
- jz H4300
- ; move 8 bytes
- sub ecx, 8
- mov rax, [rsi+rcx]
- mov [rdi+rcx], rax
-H4300: test cl, 4
- jz H4400
- ; move 4 bytes
- sub ecx, 4
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
- jz H4600 ; early out if count divisible by 4 (flags still from sub)
-H4400: test cl, 2
- jz H4500
- ; move 2 bytes
- sub ecx, 2
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
-H4500: test cl, 1
- jz H4600
- ; move 1 byte
- movzx eax, byte [rsi] ; rcx-1 = 0
- mov [rdi], al
-H4600: ; finished
- RETURNM
-
-align 16
-H4800: ; 32 bytes move loop, bypass cache
- vmovups ymm0, [rsi+rdx-20H]
- vmovntps [rdi+rdx-20H], ymm0
- sub rdx, 20H
- jnz H4800
- sfence ; order the non-temporal stores before continuing
- vzeroupper
- jmp H4090
-
-A1000: ; count < 64. Move 32-16-8-4-2-1 bytes (shared by memmoveU256, memmoveU, memmoveSSSE3)
- test cl, 20H
- jz A1100
- ; move 32 bytes
- ; movups is faster on processors with SSSE3
- sub ecx, 20H
- movups xmm0, [rsi+rcx+10H] ; both halves are loaded before either is stored
- movups xmm1, [rsi+rcx]
- movups [rdi+rcx+10H], xmm0
- movups [rdi+rcx], xmm1
-A1100: test cl, 10H
- jz A1200
- ; move 16 bytes
- sub ecx, 10H
- movups xmm0, [rsi+rcx]
- movups [rdi+rcx], xmm0
-A1200: test cl, 8
- jz A1300
- ; move 8 bytes
- sub ecx, 8
- mov rax, [rsi+rcx]
- mov [rdi+rcx], rax
-A1300: test cl, 4
- jz A1400
- ; move 4 bytes
- sub ecx, 4
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
- jz A1900 ; early out if count divisible by 4 (flags still from sub)
-A1400: test cl, 2
- jz A1500
- ; move 2 bytes
- sub ecx, 2
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
-A1500: test cl, 1
- jz A1900
- ; move 1 byte
- movzx eax, byte [rsi] ; rcx-1 = 0
- mov [rdi], al
-A1900: ; finished
- RETURNM
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Version for processors with fast unaligned read and fast 16 bytes write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memmoveU: ; Version for processors with fast unaligned read
-memmoveU@: ; local label, selected by memmoveCPUDispatch
- PROLOGM memcpyU ; leaves via memcpyU unless a backward move is needed
-
- cmp rcx, 40H
- jb A1000 ; Use simpler code if count < 64
-
- ; count >= 64
- ; Note: this part will not always work if count < 64
- ; Calculate size of last block after last regular boundary of dest
- lea edx, [rdi+rcx] ; end of dest
- and edx, 0FH
- jz B3300 ; Skip if end of dest aligned by 16
-
- ; edx = size of last partial block, 1 - 15 bytes
- test dl, 3
- jz B3210
- test dl, 1
- jz B3201 ; B3200 if we haven't tested edx,3
- ; move 1 byte
- dec rcx
- movzx eax, byte [rsi+rcx]
- mov [rdi+rcx], al
-B3200: test dl, 2
- jz B3210
-B3201: ; move 2 bytes
- sub rcx, 2
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
-B3210: test dl, 4
- jz B3220
- ; move 4 bytes
- sub rcx, 4
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
-B3220: test dl, 8
- jz B3300
- ; move 8 bytes
- sub rcx, 8
- mov rax, [rsi+rcx]
- mov [rdi+rcx], rax
-
-B3300: ; Now end of dest is aligned by 16. Any partial block has been moved
- mov rdx, rcx
- and ecx, 1FH ; remaining size after 32 bytes blocks moved
- and rdx, -20H ; size in bytes of the 32 bytes blocks (multiple of 32)
- jz H1100
- add rsi, rcx ; bias pointers so the loop can index with rdx alone
- add rdi, rcx
-
- ; Check if count very big
- cmp rdx, [CacheBypassLimit]
- ja H1800 ; Use non-temporal store if count > CacheBypassLimit
-
-align 16 ; minimize 16-bytes boundaries in H1000 loop
-H1000: ; 32 bytes move loop, highest address first
- movups xmm1, [rsi+rdx-20H]
- movups xmm0, [rsi+rdx-10H]
- movaps [rdi+rdx-20H], xmm1
- movaps [rdi+rdx-10H], xmm0
- sub rdx, 20H
- jnz H1000
-
-H1090: sub rsi, rcx ; remove the bias added before the loop
- sub rdi, rcx
-
-H1100: ; remaining 0-31 bytes
- test ecx, ecx
- jz H1600
- test cl, 10H
- jz H1200
- ; move 16 bytes
- sub ecx, 10H
- movups xmm0, [rsi+rcx]
- movaps [rdi+rcx], xmm0
- jz H1600 ; early out if count divisible by 16 (flags still from sub)
-H1200: test cl, 8
- jz H1300
- ; move 8 bytes
- sub ecx, 8
- mov rax, [rsi+rcx]
- mov [rdi+rcx], rax
-H1300: test cl, 4
- jz H1400
- ; move 4 bytes
- sub ecx, 4
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
- jz H1600 ; early out if count divisible by 4 (flags still from sub)
-H1400: test cl, 2
- jz H1500
- ; move 2 bytes
- sub ecx, 2
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
-H1500: test cl, 1
- jz H1600
- ; move 1 byte
- movzx eax, byte [rsi] ; rcx-1 = 0
- mov [rdi], al
-H1600: ; finished
- RETURNM
-
-align 16
-H1800: ; 32 bytes move loop, bypass cache
- movups xmm1, [rsi+rdx-20H]
- movups xmm0, [rsi+rdx-10H]
- movntps [rdi+rdx-20H], xmm1
- movntps [rdi+rdx-10H], xmm0
- sub rdx, 20H
- jnz H1800
- sfence ; order the non-temporal stores before continuing
- jmp H1090
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Version for processors with SSSE3. Aligned read + shift + aligned write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memmoveSSSE3: ; SSSE3 version begins here
-memmoveSSSE3@: ; local label, selected by memmoveCPUDispatch
- PROLOGM memcpySSSE3 ; leaves via memcpySSSE3 unless a backward move is needed
-
- ; Cannot use memcpy. Must move backwards because of overlap between src and dest
- cmp rcx, 40H
- jb A1000 ; Use simpler code if count < 64
- ; count >= 64
- ; Note: this part will not always work if count < 64
- ; Calculate size of last block after last regular boundary of dest
- lea edx, [rdi+rcx] ; end of dest
- and edx, 0FH
- jz B1300 ; Skip if end of dest aligned by 16
-
- ; edx = size of last partial block, 1 - 15 bytes
- test dl, 3
- jz B1210
- test dl, 1
- jz B1201 ; B1200 if we haven't tested edx,3
- ; move 1 byte
- dec rcx
- movzx eax, byte [rsi+rcx]
- mov [rdi+rcx], al
-B1200: test dl, 2
- jz B1210
-B1201: ; move 2 bytes
- sub rcx, 2
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
-B1210: test dl, 4
- jz B1220
- ; move 4 bytes
- sub rcx, 4
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
-B1220: test dl, 8
- jz B1300
- ; move 8 bytes
- sub rcx, 8
- mov rax, [rsi+rcx]
- mov [rdi+rcx], rax
-
-B1300: ; Now end of dest is aligned by 16. Any partial block has been moved
- ; Find alignment of end of src modulo 16 at this point:
- lea eax, [rsi+rcx]
- and eax, 0FH ; rax = jump table index 0 - 15
-
- ; Set up for loop moving 32 bytes per iteration:
- mov edx, ecx ; Save count
- and rcx, -20H ; Round down to nearest multiple of 32
- sub edx, ecx ; Remaining data after loop
- sub rsi, rax ; Nearest preceding aligned block of src
- ; Add the same to rsi and rdi as we have subtracted from rcx
- add rsi, rdx
- add rdi, rdx
-
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja B1400 ; Use non-temporal store if count > CacheBypassLimit
-
- ; Dispatch to different codes depending on src alignment
- lea r8, [MAlignmentDispatchSSSE3]
- jmp near [r8+rax*8] ; table entries are 8-byte code pointers
-
-B1400: ; Dispatch to different codes depending on src alignment
- lea r8, [MAlignmentDispatchNT]
- jmp near [r8+rax*8] ; table entries are 8-byte code pointers
-
-
-align 16
-C100: ; Code for aligned src. SSE2 and later CPUs
- ; The nice case, src and dest have same alignment (jump table entry 0).
-
- ; Loop. rcx has positive index from the beginning, counting down to zero
- movaps xmm0, [rsi+rcx-10H]
- movaps xmm1, [rsi+rcx-20H]
- movaps [rdi+rcx-10H], xmm0
- movaps [rdi+rcx-20H], xmm1
- sub rcx, 20H
- jnz C100
-
- ; Move the remaining edx bytes (0 - 31):
- ; move 16-8-4-2-1 bytes, aligned
- test edx, edx
- jz C500 ; Early out if no more data
- test dl, 10H
- jz C200
- ; move 16 bytes
- sub rcx, 10H
- movaps xmm0, [rsi+rcx]
- movaps [rdi+rcx], xmm0
-
-C200: ; Other branches come in here, rcx may contain arbitrary offset
- test edx, edx
- jz C500 ; Early out if no more data
- test dl, 8
- jz C210
- ; move 8 bytes
- sub rcx, 8
- mov rax, [rsi+rcx]
- mov [rdi+rcx], rax
-C210: test dl, 4
- jz C220
- ; move 4 bytes
- sub rcx, 4
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
- jz C500 ; Early out if rcx reached zero (flags still from sub)
-C220: test dl, 2
- jz C230
- ; move 2 bytes
- sub rcx, 2
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
-C230: test dl, 1
- jz C500
- ; move 1 byte
- movzx eax, byte [rsi+rcx-1] ; rcx-1 is not always 0 here
- mov [rdi+rcx-1], al
-C500: ; finished
- RETURNM
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Version for processors with SSE2. Aligned read + shift + aligned write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-memmoveSSE2: ; SSE2 version begins here
-memmoveSSE2@: ; local label, selected by memmoveCPUDispatch
- PROLOGM memcpySSE2 ; leaves via memcpySSE2 unless a backward move is needed
-
- ; Cannot use memcpy. Must move backwards because of overlap between src and dest
- cmp rcx, 40H
- jae B0100 ; Use the block code at B0100 if count >= 64
-
- ; count < 64. Move 32-16-8-4-2-1 bytes
- test cl, 20H
- jz A100
- ; move 32 bytes
- ; mov is faster than movdqu on SSE2 processors,
- ; movdqu is faster on later processors
- sub ecx, 20H
- mov rax, [rsi+rcx+18H]
- mov rdx, [rsi+rcx+10H]
- mov [rdi+rcx+18H], rax
- mov [rdi+rcx+10H], rdx
- mov rax, [rsi+rcx+8]
- mov rdx, [rsi+rcx]
- mov [rdi+rcx+8], rax
- mov [rdi+rcx], rdx
-A100: test cl, 10H
- jz A200
- ; move 16 bytes
- sub ecx, 10H
- mov rax, [rsi+rcx+8]
- mov rdx, [rsi+rcx]
- mov [rdi+rcx+8], rax
- mov [rdi+rcx], rdx
-A200: test cl, 8
- jz A300
- ; move 8 bytes
- sub ecx, 8
- mov rax, [rsi+rcx]
- mov [rdi+rcx], rax
-A300: test cl, 4
- jz A400
- ; move 4 bytes
- sub ecx, 4
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
- jz A900 ; early out if count divisible by 4 (flags still from sub)
-A400: test cl, 2
- jz A500
- ; move 2 bytes
- sub ecx, 2
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
-A500: test cl, 1
- jz A900
- ; move 1 byte
- movzx eax, byte [rsi] ; rcx-1 = 0
- mov [rdi], al
-A900: ; finished
- RETURNM
-
-B0100: ; count >= 64
- ; Note: this part will not always work if count < 64
- ; Calculate size of last block after last regular boundary of dest
- lea edx, [rdi+rcx] ; end of dest
- and edx, 0FH
- jz B0300 ; Skip if end of dest aligned by 16
-
- ; edx = size of last partial block, 1 - 15 bytes
- test dl, 3
- jz B0210
- test dl, 1
- jz B0201 ; B0200 if we haven't tested edx,3
- ; move 1 byte
- dec rcx
- movzx eax, byte [rsi+rcx]
- mov [rdi+rcx], al
-B0200: test dl, 2
- jz B0210
-B0201: ; move 2 bytes
- sub rcx, 2
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
-B0210: test dl, 4
- jz B0220
- ; move 4 bytes
- sub rcx, 4
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
-B0220: test dl, 8
- jz B0300
- ; move 8 bytes
- sub rcx, 8
- mov rax, [rsi+rcx]
- mov [rdi+rcx], rax
-
-B0300: ; Now end of dest is aligned by 16. Any partial block has been moved
- ; Find alignment of end of src modulo 16 at this point:
- lea eax, [rsi+rcx]
- and eax, 0FH ; rax = jump table index 0 - 15
-
- ; Set up for loop moving 32 bytes per iteration:
- mov edx, ecx ; Save count
- and rcx, -20H ; Round down to nearest multiple of 32
- sub edx, ecx ; Remaining data after loop
- sub rsi, rax ; Nearest preceding aligned block of src
- ; Add the same to rsi and rdi as we have subtracted from rcx
- add rsi, rdx
- add rdi, rdx
-
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja B0400 ; Use non-temporal store if count > CacheBypassLimit
-
- ; Dispatch to different codes depending on src alignment
- lea r8, [MAlignmentDispatchSSE2]
- jmp near [r8+rax*8] ; table entries are 8-byte code pointers
-
-B0400: ; Dispatch to different codes depending on src alignment
- lea r8, [MAlignmentDispatchNT]
- jmp near [r8+rax*8] ; table entries are 8-byte code pointers
-
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Macros and alignment jump tables
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Macros for each src alignment, SSE2 instruction set:
-; Make separate code for each alignment u because the shift instructions
-; have the shift count as a constant:
-
-%MACRO MOVE_REVERSE_UNALIGNED_SSE2 2 ; u, nt
-; Move rcx + rdx bytes of data
-; Source is misaligned. (src-dest) modulo 16 = %1
-; %2 = 1 if non-temporal store desired
-; eax = %1
-; rsi = src - %1 = nearest preceding 16-bytes boundary
-; rdi = dest (aligned)
-; rcx = count rounded down to nearest divisible by 32
-; edx = remaining bytes to move after loop
- movdqa xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
-%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
- sub rcx, 20H ; sets the flags tested by jnz at the end of the loop
- movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- movdqa xmm2, [rsi+rcx]
- movdqa xmm3, xmm1 ; Copy because used twice
- pslldq xmm0, 16-%1 ; shift left
- psrldq xmm1, %1 ; shift right
- por xmm0, xmm1 ; combine blocks
- %IF %2 == 0
- movdqa [rdi+rcx+10H], xmm0 ; Save aligned
- %ELSE
- movntdq [rdi+rcx+10H], xmm0 ; Save aligned, non-temporal
- %ENDIF
- movdqa xmm0, xmm2 ; Save for next iteration
- pslldq xmm3, 16-%1 ; shift left
- psrldq xmm2, %1 ; shift right
- por xmm3, xmm2 ; combine blocks
- %IF %2 == 0
- movdqa [rdi+rcx], xmm3 ; Save aligned
- %ELSE
- movntdq [rdi+rcx], xmm3 ; Save aligned, non-temporal
- %ENDIF
- jnz %%L1 ; flags still from sub rcx, 20H
-
- ; Move edx remaining bytes
- test dl, 10H
- jz %%L2
- ; One more 16-bytes block to move
- sub rcx, 10H
- movdqa xmm1, [rsi+rcx]
- pslldq xmm0, 16-%1 ; shift left
- psrldq xmm1, %1 ; shift right
- por xmm0, xmm1 ; combine blocks
- %IF %2 == 0
- movdqa [rdi+rcx], xmm0 ; Save aligned
- %ELSE
- movntdq [rdi+rcx], xmm0 ; Save aligned, non-temporal
- %ENDIF
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %2 == 1
- sfence ; executed on every exit path of the non-temporal variant
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-%MACRO MOVE_REVERSE_UNALIGNED_SSE2_4 1 ; nt
-; Special case: u = 4
-; %1 = 1 if non-temporal store desired
- movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
-%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
- sub rcx, 20H ; sets the flags tested by jnz at the end of the loop
- movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- movaps xmm2, [rsi+rcx]
- movaps xmm3, xmm0 ; block carried over from the previous step
- movaps xmm0, xmm2 ; Save for next iteration
- movss xmm2, xmm1 ; replace lowest dword of xmm2 with that of xmm1
- shufps xmm2, xmm2, 00111001B ; Rotate right
- movss xmm1, xmm3 ; replace lowest dword of xmm1 with that of xmm3
- shufps xmm1, xmm1, 00111001B ; Rotate right
- %IF %1 == 0
- movaps [rdi+rcx+10H], xmm1 ; Save aligned
- movaps [rdi+rcx], xmm2 ; Save aligned
- %ELSE
- movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
- movntps [rdi+rcx], xmm2 ; Non-temporal save
- %ENDIF
- jnz %%L1 ; flags still from sub rcx, 20H
-
- ; Move edx remaining bytes
- test dl, 10H
- jz %%L2
- ; One more 16-bytes block to move
- sub rcx, 10H
- movaps xmm1, [rsi+rcx]
- movss xmm1, xmm0
- shufps xmm1, xmm1, 00111001B ; Rotate right
- %IF %1 == 0
- movaps [rdi+rcx], xmm1 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm1 ; Non-temporal save
- %ENDIF
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %1 == 1
- sfence ; executed on every exit path of the non-temporal variant
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-%MACRO MOVE_REVERSE_UNALIGNED_SSE2_8 1 ; nt
-; Special case: u = 8
-; %1 = 1 if non-temporal store desired
- movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
- shufps xmm0, xmm0, 01001110B ; Rotate (swap the two 8-byte halves)
-%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
- sub rcx, 20H ; sets the flags tested by jnz at the end of the loop
- movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- shufps xmm1, xmm1, 01001110B ; Rotate
- movsd xmm0, xmm1 ; replace low 8 bytes of xmm0 with those of xmm1
- %IF %1 == 0
- movaps [rdi+rcx+10H], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rcx+10H], xmm0 ; Non-temporal save
- %ENDIF
- movaps xmm0, [rsi+rcx]
- shufps xmm0, xmm0, 01001110B ; Rotate
- movsd xmm1, xmm0 ; replace low 8 bytes of xmm1 with those of xmm0
- %IF %1 == 0
- movaps [rdi+rcx], xmm1 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm1 ; Non-temporal save
- %ENDIF
- jnz %%L1 ; flags still from sub rcx, 20H
-
- ; Move edx remaining bytes
- test dl, 10H
- jz %%L2
- ; One more 16-bytes block to move
- sub rcx, 10H
- movaps xmm1, [rsi+rcx]
- shufps xmm1, xmm1, 01001110B ; Rotate
- movsd xmm0, xmm1
- %IF %1 == 0
- movaps [rdi+rcx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm0 ; Non-temporal save
- %ENDIF
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %1 == 1
- sfence ; executed on every exit path of the non-temporal variant
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-%MACRO MOVE_REVERSE_UNALIGNED_SSE2_12 1 ; nt
-; Special case: u = 12
-; %1 = 1 if non-temporal store desired
- movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
- shufps xmm0, xmm0, 10010011B ; Rotate left
-%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
- sub rcx, 20H ; sets the flags tested by jnz at the end of the loop
- movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- shufps xmm1, xmm1, 10010011B ; Rotate left
- movss xmm0, xmm1 ; replace lowest dword of xmm0 with that of xmm1
- %IF %1 == 0
- movaps [rdi+rcx+10H], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rcx+10H], xmm0 ; Non-temporal save
- %ENDIF
- movaps xmm0, [rsi+rcx]
- shufps xmm0, xmm0, 10010011B ; Rotate left
- movss xmm1, xmm0 ; replace lowest dword of xmm1 with that of xmm0
- %IF %1 == 0
- movaps [rdi+rcx], xmm1 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm1 ; Non-temporal save
- %ENDIF
- jnz %%L1 ; flags still from sub rcx, 20H
-
- ; Move edx remaining bytes
- test dl, 10H
- jz %%L2
- ; One more 16-bytes block to move
- sub rcx, 10H
- movaps xmm1, [rsi+rcx]
- shufps xmm1, xmm1, 10010011B ; Rotate left
- movss xmm0, xmm1
- %IF %1 == 0
- movaps [rdi+rcx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm0 ; Non-temporal save
- %ENDIF
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %1 == 1
- sfence ; executed on every exit path of the non-temporal variant
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-; Macros for each src alignment, Suppl.SSE3 instruction set:
-; Code for unaligned src, Suppl.SSE3 instruction set.
-; Make separate code for each alignment u because the palignr instruction
-; has the shift count as a constant:
-
-%MACRO MOVE_REVERSE_UNALIGNED_SSSE3 1; u
-; Move rcx + rdx bytes of data
-; Source is misaligned. (src-dest) modulo 16 = %1
-; eax = %1
-; rsi = src - %1 = nearest preceding 16-bytes boundary
-; rdi = dest (aligned)
-; rcx = count rounded down to nearest divisible by 32 (positive index)
-; edx = remaining bytes to move after loop
- movdqa xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
-
-%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
- movdqa xmm1, [rsi+rcx-10H] ; Read next block (upper of the two)
- palignr xmm0, xmm1, %1 ; Combine parts into aligned block
- movdqa [rdi+rcx-10H], xmm0 ; Save aligned
- movdqa xmm0, [rsi+rcx-20H] ; Read next block (lower of the two)
- palignr xmm1, xmm0, %1 ; Combine parts into aligned block
- movdqa [rdi+rcx-20H], xmm1 ; Save aligned
- sub rcx, 20H
- jnz %%L1
-
- ; Set up for edx remaining bytes
- test dl, 10H
- jz %%L2
- ; One more 16-bytes block to move
- sub rcx, 10H
- movdqa xmm1, [rsi+rcx] ; Read one more block
- palignr xmm0, xmm1, %1 ; Combine parts into aligned block
- movdqa [rdi+rcx], xmm0 ; Save aligned
-
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- ; Move remaining 0 - 15 bytes
- jmp C200
-%ENDMACRO
-
-
-; Make 15 instances of SSE2 macro for each value of the alignment u (1 - 15).
-; These are pointed to by the jump table MAlignmentDispatchSSE2 below
-; (aligns and fillers are inserted manually to minimize the
-; number of 16-bytes boundaries inside loops)
-
-align 16
-D104: MOVE_REVERSE_UNALIGNED_SSE2_4 0
-D108: MOVE_REVERSE_UNALIGNED_SSE2_8 0
-D10C: MOVE_REVERSE_UNALIGNED_SSE2_12 0
-D101: MOVE_REVERSE_UNALIGNED_SSE2 1, 0
-D102: MOVE_REVERSE_UNALIGNED_SSE2 2, 0
-D103: MOVE_REVERSE_UNALIGNED_SSE2 3, 0
-D105: MOVE_REVERSE_UNALIGNED_SSE2 5, 0
-D106: MOVE_REVERSE_UNALIGNED_SSE2 6, 0
-D107: MOVE_REVERSE_UNALIGNED_SSE2 7, 0
-D109: MOVE_REVERSE_UNALIGNED_SSE2 9, 0
-D10A: MOVE_REVERSE_UNALIGNED_SSE2 0AH, 0
-D10B: MOVE_REVERSE_UNALIGNED_SSE2 0BH, 0
-D10D: MOVE_REVERSE_UNALIGNED_SSE2 0DH, 0
-D10E: MOVE_REVERSE_UNALIGNED_SSE2 0EH, 0
-D10F: MOVE_REVERSE_UNALIGNED_SSE2 0FH, 0
-
-; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
-; These are pointed to by the jump table MAlignmentDispatchSSSE3 below
-
-align 16
-E104: MOVE_REVERSE_UNALIGNED_SSSE3 4
-E108: MOVE_REVERSE_UNALIGNED_SSSE3 8
-E10C: MOVE_REVERSE_UNALIGNED_SSSE3 0CH
-E101: MOVE_REVERSE_UNALIGNED_SSSE3 1
-E102: MOVE_REVERSE_UNALIGNED_SSSE3 2
-E103: MOVE_REVERSE_UNALIGNED_SSSE3 3
-E105: MOVE_REVERSE_UNALIGNED_SSSE3 5
-E106: MOVE_REVERSE_UNALIGNED_SSSE3 6
-E107: MOVE_REVERSE_UNALIGNED_SSSE3 7
-E109: MOVE_REVERSE_UNALIGNED_SSSE3 9
-E10A: MOVE_REVERSE_UNALIGNED_SSSE3 0AH
-E10B: MOVE_REVERSE_UNALIGNED_SSSE3 0BH
-E10D: MOVE_REVERSE_UNALIGNED_SSSE3 0DH
-E10E: MOVE_REVERSE_UNALIGNED_SSSE3 0EH
-E10F: MOVE_REVERSE_UNALIGNED_SSSE3 0FH
-
-align 16
-F100: ; Non-temporal move, src and dest have same alignment.
- ; Loop. rcx has positive index from the beginning, counting down to zero
- sub rcx, 20H ; sets the flags tested by jnz below
- movaps xmm0, [rsi+rcx+10H]
- movaps xmm1, [rsi+rcx]
- movntps [rdi+rcx+10H], xmm0
- movntps [rdi+rcx], xmm1
- jnz F100
-
- ; Move the remaining edx bytes (0 - 31):
- ; move 16-8-4-2-1 bytes, aligned
- test dl, 10H
- jz F110 ; no 16-byte block left, but the loop's NT stores still need the fence
- ; move 16 bytes
- sub rcx, 10H
- movaps xmm0, [rsi+rcx]
- movntps [rdi+rcx], xmm0
-F110: sfence ; fence the non-temporal stores on every path, as the NT macros do
- ; move the remaining 0 - 15 bytes
- jmp C200
-
-; Non-temporal move, src and dest have different alignment.
-; Make 15 instances of SSE2 macro (nt argument = 1) for each value of the alignment u.
-; These are pointed to by the jump table MAlignmentDispatchNT below
-
-align 16
-F101: MOVE_REVERSE_UNALIGNED_SSE2 1, 1
-F102: MOVE_REVERSE_UNALIGNED_SSE2 2, 1
-F103: MOVE_REVERSE_UNALIGNED_SSE2 3, 1
-F104: MOVE_REVERSE_UNALIGNED_SSE2_4 1
-F105: MOVE_REVERSE_UNALIGNED_SSE2 5, 1
-F106: MOVE_REVERSE_UNALIGNED_SSE2 6, 1
-F107: MOVE_REVERSE_UNALIGNED_SSE2 7, 1
-F108: MOVE_REVERSE_UNALIGNED_SSE2_8 1
-F109: MOVE_REVERSE_UNALIGNED_SSE2 9, 1
-F10A: MOVE_REVERSE_UNALIGNED_SSE2 0AH, 1
-F10B: MOVE_REVERSE_UNALIGNED_SSE2 0BH, 1
-F10C: MOVE_REVERSE_UNALIGNED_SSE2_12 1
-F10D: MOVE_REVERSE_UNALIGNED_SSE2 0DH, 1
-F10E: MOVE_REVERSE_UNALIGNED_SSE2 0EH, 1
-F10F: MOVE_REVERSE_UNALIGNED_SSE2 0FH, 1
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; CPU dispatcher
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-memmoveCPUDispatch: ; CPU dispatcher, check for Suppl-SSE3 instruction set
- ; This part is executed only once (memmoveDispatch is overwritten at Q100)
- push rbx ; rbx will hold the selected function pointer
- push rcx ; rcx..r8 are saved around the calls and cpuid below
- push rdx
- push rsi
- push rdi
- push r8
-
- ; set CacheBypassLimit to half the size of the largest level cache
-%ifdef WINDOWS
- xor ecx, ecx ; 0 means default
-%else
- xor edi, edi ; 0 means default
-%endif
- call SetMemcpyCacheLimit@
- mov eax, 1
- cpuid ; Get feature flags
- lea rbx, [memmoveSSE2@] ; fallback: SSE2 version
- bt ecx, 9 ; Test bit for SupplSSE3
- jnc Q100
- lea rbx, [memmoveSSSE3@]
- call UnalignedIsFaster
- test eax, eax
- jz Q100 ; keep SSSE3 version if unaligned read is not faster
- lea rbx, [memmoveU@]
- call Store256BitIsFaster
- test eax, eax
- jz Q100 ; keep memmoveU if 256-bit store is not faster
- lea rbx, [memmoveU256@]
-
-Q100: ; Insert appropriate pointer
- mov [memmoveDispatch], rbx
- mov rax, rbx
- pop r8 ; restore the saved registers in reverse order
- pop rdi
- pop rsi
- pop rdx
- pop rcx
- pop rbx
- ; Jump according to the replaced function pointer
- jmp rax
-
-; Note: Must call SetMemcpyCacheLimit1 defined in memcpy64.asm
-SetMemcpyCacheLimit: ; exported entry
-SetMemcpyCacheLimit@: ; local label used by memmoveCPUDispatch
- call SetMemcpyCacheLimit1
- mov [CacheBypassLimit], rax ; store the value returned in rax in this file's copy
- ret
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; data section. jump tables, dispatch function pointer, cache size
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Data segment must be included in function namespace
-SECTION .data
-align 16
-
-; Jump tables for alignments 0 - 15 (16 code pointers per table):
-; memmoveSSE2 indexes MAlignmentDispatchSSE2 and memmoveSSSE3 indexes
-; MAlignmentDispatchSSSE3 with (end of src) modulo 16; both switch to
-; MAlignmentDispatchNT when the block size exceeds CacheBypassLimit.
-
-; Code pointer for each alignment for SSE2 instruction set
-MAlignmentDispatchSSE2:
-DQ C100, D101, D102, D103, D104, D105, D106, D107
-DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
-
-; Code pointer for each alignment for Suppl-SSE3 instruction set
-MAlignmentDispatchSSSE3:
-DQ C100, E101, E102, E103, E104, E105, E106, E107
-DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
-
-; Code pointer for each alignment for non-temporal store
-MAlignmentDispatchNT:
-DQ F100, F101, F102, F103, F104, F105, F106, F107
-DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
-
-memmoveDispatch: DQ memmoveCPUDispatch ; overwritten by memmoveCPUDispatch with the selected version
-
-; Bypass cache by using non-temporal moves if count > CacheBypassLimit
-; The optimal value of CacheBypassLimit is difficult to estimate, but
-; a reasonable value is half the size of the largest cache:
-CacheBypassLimit: DQ 0 ; must be a qword: compared with rdx/rcx and written from rax
diff --git a/contrib/libs/asmlib/memset64.asm b/contrib/libs/asmlib/memset64.asm
deleted file mode 100644
index 52d647984d..0000000000
--- a/contrib/libs/asmlib/memset64.asm
+++ /dev/null
@@ -1,372 +0,0 @@
-%include "defs.asm"
-
-;************************* memset64.asm *************************************
-; Author: Agner Fog
-; Date created: 2008-07-19
-; Last modified: 2016-11-12 (patched version with AVX512 support removed)
-; Description:
-; Faster version of the standard memset function:
-; void * A_memset(void * dest, int c, size_t count);
-; Sets 'count' bytes from 'dest' to the 8-bit value 'c'
-;
-; Overriding standard function memset:
-; The alias ?OVR_memset is changed to _memset in the object file if
-; it is desired to override the standard library function memset.
-;
-; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
-; extern "C" void SetMemsetCacheLimit(size_t limit); // Change limit in GetMemsetCacheLimit; 0 restores the default
-;
-; Optimization:
-; Uses XMM registers to set 16 bytes at a time, aligned.
-;
-; The latest version of this file is available at:
-; www.agner.org/optimize/asmexamples.zip
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global A_memset: function ; Function memset
-global EXP(memset): function ; ?OVR removed if standard function memset overridden
-global memsetSSE2: function ; SSE2 version
-global memsetAVX: function ; version for CPUs with fast 256-bit store
-global GetMemsetCacheLimit: function ; Data blocks bigger than this will be stored uncached by memset
-global SetMemsetCacheLimit: function ; Change limit in GetMemsetCacheLimit
-
-; Imported from cachesize64.asm:
-extern DataCacheSize ; Get size of data cache
-
-; Imported from unalignedisfaster64.asm:
-extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
-
-; Define prolog for this function
-; PROLOGM normalizes the arguments of memset(dest, c, count) for both ABIs.
-; After the macro:
-;   eax     = c, zero-extended from its low byte
-;   Rdest   = dest   (rcx on Windows, rdi on Unix)
-;   Rcount  = count  (rdx on both platforms)
-;   Rcount2 = second copy of count (r8 on Windows, rsi on Unix)
-;   Rdest2  = register name reserved for a copy of dest; the macro only
-;             defines the name, the caller fills it with "mov Rdest2, Rdest"
-%MACRO PROLOGM 0
-%IFDEF WINDOWS
-%define Rdest rcx ; dest
- movzx eax, dl ; c
- mov rdx, r8 ; count
-%define Rcount rdx ; count
-%define Rdest2 r9 ; copy of dest
-%define Rcount2 r8 ; copy of count
-
-%ELSE ; Unix
-%define Rdest rdi ; dest
- movzx eax, sil ; c
-%define Rcount rdx ; count
-%define Rdest2 rcx ; copy of dest
-%define Rcount2 rsi ; copy of count
- mov Rcount2, Rcount ; copy count
-%ENDIF
-%ENDMACRO
-
-
-SECTION .text align=16
-
-; extern "C" void * memset(void * dest, int c, size_t count);
-; Function entry:
-; Both public names share one indirect jump through memsetDispatch. That
-; pointer initially holds memsetCPUDispatch, which replaces it with
-; memsetSSE2 or memsetAVX on the first call.
-A_memset:
-EXP(memset):
- jmp [memsetDispatch] ; CPU dispatch table
-
-; memsetAVX entry point plus the small-count handlers (count 0..16), which
-; memsetSSE2 also uses by jumping to B050.
-memsetAVX: ; AVX version. Use ymm register
-memsetAVX@: ; local label
- PROLOGM
- imul eax, 01010101H ; Broadcast c into all bytes of eax
- mov Rdest2, Rdest ; save dest
- cmp Rcount, 16
- ja B100
-B050: lea r10, [MemsetJTab] ; SSE2 version comes in here
- jmp qword [r10+Rcount*8] ; jump table for small counts
-
-; Separate code for each count from 0 to 16:
-; Each chain below is entered at the label for the exact count and falls
-; through the labels for smaller counts, so counts that differ by 4 share code.
-; Counts 16, 12, 8, 4, 0: dword stores only
-M16: mov [Rdest+12], eax
-M12: mov [Rdest+8], eax
-M08: mov [Rdest+4], eax
-M04: mov [Rdest], eax
-M00: mov rax, Rdest2 ; return dest
- ret
-
-; Counts 15, 11, 7, 3, 1: dword stores, then a word at +1 and a byte at 0
-M15: mov [Rdest+11], eax
-M11: mov [Rdest+7], eax
-M07: mov [Rdest+3], eax
-M03: mov [Rdest+1], ax
-M01: mov [Rdest], al
- mov rax, Rdest2 ; return dest
- ret
-
-; Counts 14, 10, 6, 2: dword stores, then a word at 0
-M14: mov [Rdest+10], eax
-M10: mov [Rdest+6], eax
-M06: mov [Rdest+2], eax
-M02: mov [Rdest], ax
- mov rax, Rdest2 ; return dest
- ret
-
-; Counts 13, 9, 5: dword stores, then a byte at 0
-M13: mov [Rdest+9], eax
-M09: mov [Rdest+5], eax
-M05: mov [Rdest+1], eax
- mov [Rdest], al
- mov rax, Rdest2 ; return dest
- ret
-
-; AVX path for count > 16. On entry eax holds c broadcast to 4 bytes,
-; Rdest = dest, Rcount = count and Rdest2 = saved dest (the return value).
-; rax is reused as the end pointer (dest + count) until just before return.
-B100: ; AVX version, Rcount > 16
- movd xmm0, eax
- pshufd xmm0, xmm0, 0 ; Broadcast c into all bytes of xmm0
-
- lea rax, [Rdest+Rcount] ; point to end
-
- cmp Rcount, 20H
- jbe K600 ; faster to use xmm registers if small
-
- ; Store the first possibly unaligned 16 bytes
- ; It is faster to always write 16 bytes, possibly overlapping
- ; with the subsequent regular part, than to make possibly mispredicted
- ; branches depending on the size of the first part.
- movups oword [Rdest], xmm0
-
- ; store another 16 bytes, aligned
- add Rdest, 10H
- and Rdest, -10H
- movaps oword [Rdest], xmm0
-
- ; go to next 32 bytes boundary
- add Rdest, 10H
- and Rdest, -20H
-
- ; Check if count very big
- cmp Rcount, [MemsetCacheLimit]
- ja K300 ; Use non-temporal store if count > MemsetCacheLimit
-
- ; find last 32 bytes boundary
- mov Rcount, rax
- and Rcount, -20H
-
- ; - size of 32-bytes blocks
- sub Rdest, Rcount
- jnb K200 ; Jump if not negative
-
- ; extend value to 256 bits
- vinsertf128 ymm0,ymm0,xmm0,1
-
-align 16
-K100: ; Loop through 32-bytes blocks. Register use is swapped
- ; Rcount = end of 32-bytes blocks part
- ; Rdest = negative index from the end, counting up to zero
- vmovaps [Rcount+Rdest], ymm0
- add Rdest, 20H
- jnz K100
- vzeroupper
-
-K200: ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
- movups [rax-20H], xmm0
- movups [rax-10H], xmm0
- mov rax, Rdest2 ; return dest
- ret
-
-K300: ; Use non-temporal moves, same code as above:
-
- ; find last 32 bytes boundary
- mov Rcount, rax
- and Rcount, -20H
-
- ; - size of 32-bytes blocks
- sub Rdest, Rcount
- jnb K500 ; Jump if not negative
-
- ; extend value to 256 bits
- vinsertf128 ymm0,ymm0,xmm0,1
-
-align 16
-K400: ; Loop through 32-bytes blocks. Register use is swapped
- ; Rcount = end of 32-bytes blocks part
- ; Rdest = negative index from the end, counting up to zero
- vmovntps [Rcount+Rdest], ymm0
- add Rdest, 20H
- jnz K400
- sfence ; executed once, after all the non-temporal stores of the loop
- vzeroupper
-
-K500: ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
- movups [rax-20H], xmm0
- movups [rax-10H], xmm0
- mov rax, Rdest2 ; return dest
- ret
-
-K600: ; 16 < count <= 32
- ; two unaligned 16-byte stores: one at the start, one ending at dest+count
- movups [Rdest], xmm0
- movups [rax-10H], xmm0
- mov rax, Rdest2 ; return dest
- ret
-
-
-; SSE2 version of memset. Counts of 16 or less are sent to the shared
-; small-count jump table at B050; larger counts are written with 16-byte
-; aligned stores, plus overlapping 16-byte writes (as two 8-byte movq stores)
-; at the start and at the end of the block.
-memsetSSE2: ; SSE2 version; counts <= 16 branch to B050, the rest is handled here
-memsetSSE2@: ; local label
- PROLOGM
- imul eax, 01010101H ; Broadcast c into all bytes of eax
- mov Rdest2, Rdest ; save dest
- cmp Rcount, 16
- jna B050
-
- movd xmm0, eax
- pshufd xmm0, xmm0, 0 ; Broadcast c into all bytes of xmm0
-
- ; Store the first unaligned part.
- ; The size of this part is 1 - 16 bytes.
- ; It is faster to always write 16 bytes, possibly overlapping
- ; with the subsequent regular part, than to make possibly mispredicted
- ; branches depending on the size of the first part.
- movq qword [Rdest], xmm0
- movq qword [Rdest+8], xmm0
-
- ; Check if count very big
-M150: mov rax, [MemsetCacheLimit]
- cmp Rcount, rax
- ja M500 ; Use non-temporal store if count > MemsetCacheLimit
-
- ; Point to end of regular part:
- ; Round down dest+count to nearest preceding 16-bytes boundary
- lea Rcount, [Rdest+Rcount-1]
- and Rcount, -10H
-
- ; Point to start of regular part:
- ; Round up dest to next 16-bytes boundary
- add Rdest, 10H
- and Rdest, -10H
-
- ; -(size of regular part)
- sub Rdest, Rcount
- jnb M300 ; Jump if not negative
-
-align 16
-M200: ; Loop through regular part
- ; Rcount = end of regular part
- ; Rdest = negative index from the end, counting up to zero
- movdqa [Rcount+Rdest], xmm0
- add Rdest, 10H
- jnz M200
-
-M300: ; Do the last irregular part
- ; The size of this part is 1 - 16 bytes.
- ; It is faster to always write 16 bytes, possibly overlapping
- ; with the preceding regular part, than to make possibly mispredicted
- ; branches depending on the size of the last part.
- ; Rcount2 still holds the original count, so rax+Rcount2 is the end address.
- mov rax, Rdest2 ; dest
- movq qword [rax+Rcount2-10H], xmm0
- movq qword [rax+Rcount2-8], xmm0
- ret
-
-
-M500: ; Use non-temporal moves, same code as above:
- ; End of regular part:
- ; Round down dest+count to nearest preceding 16-bytes boundary
- lea Rcount, [Rdest+Rcount-1]
- and Rcount, -10H
-
- ; Start of regular part:
- ; Round up dest to next 16-bytes boundary
- add Rdest, 10H
- and Rdest, -10H
-
- ; -(size of regular part)
- sub Rdest, Rcount
- jnb M700 ; Jump if not negative
-
-align 16
-M600: ; Loop through regular part
- ; Rcount = end of regular part
- ; Rdest = negative index from the end, counting up to zero
- movntdq [Rcount+Rdest], xmm0
- add Rdest, 10H
- jnz M600
- sfence ; executed once, after all the non-temporal stores of the loop
-
-M700: ; Do the last irregular part
- ; The size of this part is 1 - 16 bytes.
- ; It is faster to always write 16 bytes, possibly overlapping
- ; with the preceding regular part, than to make possibly mispredicted
- ; branches depending on the size of the last part.
- mov rax, Rdest2 ; dest
- movq qword [rax+Rcount2-10H], xmm0
- movq qword [rax+Rcount2-8], xmm0
- ret
-
-
-; First-call dispatcher reached through memsetDispatch.
-; Saves the registers that carry the memset arguments under either calling
-; convention (rcx, rdx, r8 / rdi, rsi, rdx) plus rbx, initializes
-; MemsetCacheLimit, chooses memsetSSE2 or memsetAVX, stores the choice in
-; memsetDispatch, restores the registers and jumps to the chosen version so
-; that the original call is completed.
-memsetCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
- ; This part is executed only once
- push rbx
- push rcx
- push rdx
- push rsi
- push rdi
- push r8
- ; set MemsetCacheLimit to half the size of the largest level cache
- call GetMemsetCacheLimit@
- lea rbx, [memsetSSE2@] ; default choice: SSE2 version
- call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
- test eax, eax
- jz Q100 ; zero result: keep the SSE2 version
- lea rbx, [memsetAVX@]
-Q100:
- ; Insert appropriate pointer
- mov [memsetDispatch], rbx
- mov rax, rbx ; keep the target in rax because rbx is restored below
- pop r8
- pop rdi
- pop rsi
- pop rdx
- pop rcx
- pop rbx
- ; Jump according to the replaced function pointer
- jmp rax
-
-
-; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
-; Returns MemsetCacheLimit in rax. If the stored value is zero it is first
-; initialized to half of the value returned by DataCacheSize(0), or to
-; 400000H when that half is zero.
-GetMemsetCacheLimit:
-GetMemsetCacheLimit@:
- mov rax, [MemsetCacheLimit]
- test rax, rax
- jnz U200 ; already set: return the stored value
- ; Get half the size of the largest level cache
-%ifdef WINDOWS
- xor ecx, ecx ; 0 means largest level cache
-%else
- xor edi, edi ; 0 means largest level cache
-%endif
- call DataCacheSize ; get cache size
- shr eax, 1 ; half the size (32-bit operation: only the low dword of the result is used)
- jnz U100
- mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
-U100: mov [MemsetCacheLimit], eax ; dword store; the upper dword is already zero on this path
-U200: ret
-
-; extern "C" void SetMemsetCacheLimit(size_t limit); // Change limit in GetMemsetCacheLimit
-; The new limit is taken from the first integer argument (rcx on Windows,
-; rdi otherwise). A value of zero resets the limit to the default computed
-; by GetMemsetCacheLimit.
-SetMemsetCacheLimit:
-%ifdef WINDOWS
- mov rax, rcx
-%else
- mov rax, rdi
-%endif
- test rax, rax
- jnz U400
- ; zero, means default
- mov [MemsetCacheLimit], rax ; store 0 so that GetMemsetCacheLimit recomputes the default
- call GetMemsetCacheLimit@
-U400: mov [MemsetCacheLimit], rax
- ret
-
-
-SECTION .data
-align 16
-; Jump table for count from 0 to 16:
-; 17 code pointers, indexed by count*8 from the jump at B050.
-MemsetJTab:DQ M00, M01, M02, M03, M04, M05, M06, M07
- DQ M08, M09, M10, M11, M12, M13, M14, M15, M16
-
-; Pointer to appropriate version.
-; This initially points to memsetCPUDispatch. memsetCPUDispatch will
-; change this to the appropriate version of memset, so that
-; memsetCPUDispatch is only executed once:
-memsetDispatch: DQ memsetCPUDispatch
-
-; Bypass cache by using non-temporal moves if count > MemsetCacheLimit
-; The optimal value of MemsetCacheLimit is difficult to estimate, but
-; a reasonable value is half the size of the largest cache
-; Zero means "not yet determined"; see GetMemsetCacheLimit.
-MemsetCacheLimit: DQ 0
diff --git a/contrib/libs/asmlib/mersenne64.asm b/contrib/libs/asmlib/mersenne64.asm
deleted file mode 100644
index 758075d61d..0000000000
--- a/contrib/libs/asmlib/mersenne64.asm
+++ /dev/null
@@ -1,616 +0,0 @@
-%include "defs.asm"
-
-; ----------------------------- MERSENNE64.ASM ---------------------------
-; Author: Agner Fog
-; Date created: 1998
-; Last modified: 2013-09-13
-; Source URL: www.agner.org/optimize
-; Project: asmlib.zip
-; Language: assembly, NASM/YASM syntax, 64 bit
-; Description:
-; Random Number generator 'Mersenne Twister' type MT11213A (or MT19937)
-;
-;
-; This random number generator is described in the article by
-; M. Matsumoto & T. Nishimura, in:
-; ACM Transactions on Modeling and Computer Simulation,
-; vol. 8, no. 1, 1998, pp. 3-30. See also:
-; http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
-;
-; Initialization:
-; MersRandomInit must be called before the first call to any of the other
-; random number functions. The seed is any 32-bit integer.
-; You may use MersRandomInitByArray instead if you want more
-; than 32 bits for seed. length is the number of integers in seeds[].
-; length must be > 0, there is no upper limit for length.
-;
-; Generating random numbers:
-; MersRandom returns a floating point number in the interval 0 <= x < 1 with
-; a resolution of 32 bits.
-; MersIRandom returns an integer in the interval defined by min and max with
-; a resolution of 32 bits.
-; MersIRandomX returns an integer in the interval defined by min and max with
-; exactly equal probabilities of all values in the interval.
-; MersBRandom returns 32 random bits.
-;
-; Error conditions:
-; If MersRandomInit or MersRandomInitByArray has not been called then MersRandom
-; and MersBRandom keep returning 0, and MersIRandom and MersIRandomX return min.
-; MersIRandom and MersIRandomX return a large negative number if max < min.
-;
-; C++ prototypes in randoma.h:
-; Thread-safe versions:
-; extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
-; extern "C" void MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length); // Seed by more than 32 bits
-; extern "C" int MersIRandom (void * Pthis, int min, int max); // Output random integer
-; extern "C" int MersIRandomX(void * Pthis, int min, int max); // Output random integer, exact
-; extern "C" double MersRandom(void * Pthis); // Output random float
-; extern "C" unsigned int MersBRandom(void * Pthis); // Output random bits
-;
-; Single-threaded versions:
-; extern "C" void MersenneRandomInit(int seed); // Re-seed
-; extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length); // Seed by more than 32 bits
-; extern "C" int MersenneIRandom (int min, int max); // Output random integer
-; extern "C" int MersenneIRandomX(int min, int max); // Output random integer, exact
-; extern "C" double MersenneRandom(); // Output random float
-; extern "C" unsigned int MersenneBRandom(); // Output random bits
-;
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-; structure definition and constants:
-%INCLUDE "randomah.asi"
-
-global MersenneRandomInit, MersenneRandomInitD, MersRandomInit
-global MersenneRandomInitByArray, MersenneRandomInitByArrayD, MersRandomInitByArray
-global MersenneBRandom, MersenneBRandomD, MersBRandom
-global MersenneRandom, MersenneRandomD, MersRandom
-global MersenneIRandom, MersenneIRandomD, MersIRandom
-global MersenneIRandomX, MersenneIRandomXD, MersIRandomX
-
-
-section .data
-align 16
-
-; Data for single instance of random number generator
-; This is the instance used by the single-threaded Mersenne* entry points.
-MersenneInstance: ISTRUC CRandomMersenneA
-IEND
-; Size of structure
-; Mers_init0 uses this to compute how many dwords to clear.
-MersenneSize equ $ - MersenneInstance
-
-
-SECTION .CODE ALIGN=16
-
-; Single-threaded version:
-; extern "C" void MersenneRandomInit(int seed); // Re-seed
-; Puts the seed in edx and the address of the static MersenneInstance in rcx,
-; then continues in the thread-safe initializer: by an explicit jump to
-; ?Windows_MersRandomInit on Unix, by falling through into MersRandomInit on
-; Windows.
-MersenneRandomInit: ; PROC
-%IFDEF UNIX
- mov edx, edi ; seed
- lea rcx, [MersenneInstance] ; Pthis = point to instance
- jmp ?Windows_MersRandomInit
-%ENDIF
-%IFDEF WINDOWS
-MersenneRandomInitD: ; alias
- mov edx, ecx ; seed
- lea rcx, [MersenneInstance] ; Pthis = point to instance
- ;jmp ?Windows_MersRandomInit
-%ENDIF
-;MersenneRandomInit ENDP
-
-
-; Thread-safe version:
-; extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
-; Seeds the generator at Pthis (rounded down to a 16-byte boundary) through
-; Mers_init0 and then discards a fixed number of outputs by calling
-; ?Windows_MersBRandom in a loop.
-MersRandomInit: ; PROC
-%IFDEF UNIX
- ; translate calling convention
- mov edx, esi ; seed
- mov rcx, rdi ; Pthis
-%ENDIF
- ; parameters: rcx = Pthis, edx = seed
- and rcx, -16 ; align buffer
- ?Windows_MersRandomInit:
- call Mers_init0 ; initialize mt buffer with seeds
-
- ; Number of premade numbers that are lost in the initialization when the
- ; SSE2 implementation makes up to 4 premade numbers at a time:
-%IF MERS_N & 3
- PREMADELOST equ (MERS_N & 3)
-%ELSE
- PREMADELOST equ 4
-%ENDIF
- ; We want the C++ and the assembly implementation to give exactly the same
- ; sequence. The C++ version discards 37 random numbers after initialization.
- ; The assembly version generates a sequence that is PREMADELOST + 1 numbers
- ; behind. Therefore we discard the first 37 + PREMADELOST + 1 numbers if
- ; SSE2 is supported, otherwise 37 + 1.
- ; Note: the non-SSE2 branch below is commented out, so this build always
- ; discards 37 + PREMADELOST + 1 numbers.
-
- push rbx ; rbx is used as the loop counter
- mov ebx, 37+PREMADELOST+1
- ; CMP dword [rcx+CRandomMersenneA.Instset], 4 ; can we use XMM registers and SSE2 ?
- ; jae M110
- ; sub ebx, PREMADELOST ; SSE2 not supported
- ; mov dword [rcx+CRandomMersenneA.PreInx], 0 ; reset index to premade list
-M110: ; loop
-M120: call ?Windows_MersBRandom
- dec ebx
- jnz M120
- pop rbx
- ret
-;MersRandomInit ENDP
-
-
-Mers_init0: ; make random seeds from edx and put them into MT buffer
-; Input parameters:
-; rcx points to CRandomMersenneA
-; edx: seed
-; rcx unchanged by procedure
-; Modifies eax and edx; rdi is saved and restored.
-
- push rdi
- ; clear my buffer
- ; (everything after the first 16 bytes of the structure is zeroed)
- push rcx
- mov rdi, rcx ; Pthis
- add rdi, 16
- mov ecx, (MersenneSize - 16) / 4
- xor eax, eax
- cld
- rep stosd
- pop rcx ; Pthis
- mov edi, edx ; seed
-
- ; initialize CRandomMersenneA structure
- mov dword [rcx+CRandomMersenneA.PreInx], 4*4 ; 16 = premade list empty (MersBRandom refills when PreInx reaches 4*4)
- mov dword [rcx+CRandomMersenneA.Instset], 4
- mov eax, MERS_B
- mov [rcx+CRandomMersenneA.TMB], eax ; MERS_B replicated into 4 dwords for SIMD use
- mov [rcx+CRandomMersenneA.TMB+4], eax
- mov [rcx+CRandomMersenneA.TMB+8], eax
- mov [rcx+CRandomMersenneA.TMB+12], eax
- mov eax, MERS_C
- mov [rcx+CRandomMersenneA.TMC], eax ; MERS_C replicated into 4 dwords for SIMD use
- mov [rcx+CRandomMersenneA.TMC+4], eax
- mov [rcx+CRandomMersenneA.TMC+8], eax
- mov [rcx+CRandomMersenneA.TMC+12], eax
- mov eax, 3FF00000H ; upper dword of 1.0, double precision
- mov [rcx+CRandomMersenneA.one+4], eax ; the low dwords stay zero from the clear above
- mov [rcx+CRandomMersenneA.one+12], eax
- mov dword [rcx+CRandomMersenneA.LMASK], LOWER_MASK
- mov dword [rcx+CRandomMersenneA.UMASK], UPPER_MASK
- mov dword [rcx+CRandomMersenneA.MATA], MERS_A
-
- ; put random numbers into MT buffer
- ; mt[0] = seed; mt[i] = 1812433253 * (mt[i-1] ^ (mt[i-1] >> 30)) + i
- xor eax, eax
-M210: mov [rcx+rax*4+CRandomMersenneA.MT], edi
- mov edx, edi
- shr edi, 30
- xor edi, edx
- imul edi, 1812433253
- inc eax
- add edi, eax
- cmp eax, MERS_N
- jb M210
-
- ; Set index MTI to end of list, (scaled by 4)
- ; Round up to multiple of 4 to avoid alignment error
- mov dword [rcx+CRandomMersenneA.MTI], ((MERS_N+3) & (-4)) * 4
-
- pop rdi
- ret
-
-
-; Single threaded version:
-; extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length);
-; Moves the two arguments into the internal register layout
-; (rcx = Pthis, rdx = seeds, r8d = length) with Pthis pointing at the static
-; MersenneInstance, then jumps to ?Windows_MersRandomInitByArray.
-
-MersenneRandomInitByArray: ; PROC ; entry for Linux call
-%IFDEF UNIX
- mov r8d, esi ; length
- mov rdx, rdi ; seeds
- lea rcx, [MersenneInstance] ; Pthis = point to instance
- jmp ?Windows_MersRandomInitByArray
-%ENDIF
-%IFDEF WINDOWS
-MersenneRandomInitByArrayD: ; LABEL NEAR ; alias
- mov r8d, edx ; length
- mov rdx, rcx ; seeds
- lea rcx, [MersenneInstance] ; Pthis = point to instance
- jmp ?Windows_MersRandomInitByArray
-%ENDIF
-;MersenneRandomInitByArray ENDP
-
-; Thread-safe version:
-; extern "C" void MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length);
-; Seeds the generator from an array of integers. The buffer is first
-; initialized by Mers_init0 with the fixed seed 19650218, then mixed with
-; seeds[] in two passes over mt[]. If length <= 0 the mixing is skipped.
-; In all cases MTI and PreInx are reset and MERS_N + PREMADELOST + 1 outputs
-; are discarded before returning.
-MersRandomInitByArray: ; PROC
-%IFDEF UNIX
- ; translate calling convention
- mov r8d, edx ; length
- mov rdx, rsi ; seeds
- mov rcx, rdi ; Pthis
-%ENDIF
-
-?Windows_MersRandomInitByArray:
-; parameters: rcx = Pthis, rdx = seeds, r8d = length
-
- and rcx, -16 ; align buffer
- push rbx
- push rsi
- push rdi
- push rbp
- mov rbx, rdx ; seeds
- mov ebp, r8d ; length
-
- mov edx, 19650218
- call Mers_init0 ; init0(19650218); (rcx unchanged)
-
- mov r8d, ebp ; r8d = length, ebp = k
- test ebp, ebp
- jle M380 ; error: length <= 0
- xor edi, edi ; j = 0
- lea esi, [rdi+1] ; i = 1
- cmp ebp, MERS_N
- ja M310
- mov ebp, MERS_N ; k = max (MERS_N,length)
-M310:
-
- ; for (; k; k--) {
-M320: mov eax, [rcx+rsi*4-4+CRandomMersenneA.MT] ; mt[i-1]
- mov edx, eax
- shr eax, 30
- xor eax, edx ; mt[i-1] ^ (mt[i-1] >> 30)
- imul eax, 1664525 ; * 1664525
- xor eax, [rcx+rsi*4+CRandomMersenneA.MT] ; ^ mt[i]
- add eax, [rbx+rdi*4] ; + seeds[j]
- add eax, edi ; + j
- mov [rcx+rsi*4+CRandomMersenneA.MT], eax ; save in mt[i]
- inc esi ; i++
- inc edi ; j++
- cmp esi, MERS_N
- jb M330 ; if (i>=MERS_N)
- mov eax, [rcx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
- mov [rcx+CRandomMersenneA.MT], eax
- mov esi, 1 ; i=1;
-M330:
- cmp edi, r8d ; length
- jb M340 ; if (j>=length)
- xor edi, edi ; j = 0;
-M340:
- dec ebp ; k--
- jnz M320 ; first k loop
-M350:
- mov ebp, MERS_N-1 ; k
-M360: mov eax, [rcx+rsi*4-4+CRandomMersenneA.MT] ; mt[i-1]
- mov edx, eax
- shr eax, 30
- xor eax, edx ; mt[i-1] ^ (mt[i-1] >> 30)
- imul eax, 1566083941 ; * 1566083941
- xor eax, [rcx+rsi*4+CRandomMersenneA.MT] ; ^ mt[i]
- sub eax, esi ; - i
- mov [rcx+rsi*4+CRandomMersenneA.MT], eax ; save in mt[i]
- inc esi ; i++
- cmp esi, MERS_N
- jb M370 ; if (i>=MERS_N)
- mov eax, [rcx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
- mov [rcx+CRandomMersenneA.MT], eax
- mov esi, 1 ; i=1;
-M370:
- dec ebp ; k--
- jnz M360 ; second k loop
- mov dword [rcx+CRandomMersenneA.MT], 80000000H ; mt[0] = 0x80000000
-M380:
- mov dword [rcx+CRandomMersenneA.MTI], 0
- mov dword [rcx+CRandomMersenneA.PreInx], 0
-
-; discard first MERS_N random numbers + PREMADELOST+1 to compensate for lag
- mov edi, MERS_N + PREMADELOST+1 ; edi is free again here; used as loop counter
-M391: call ?Windows_MersBRandom
- dec edi
- jnz M391
-
- pop rbp ; restore registers
- pop rdi
- pop rsi
- pop rbx
- ret
-;MersRandomInitByArray ENDP
-
-
-; Single threaded version:
-; extern "C" unsigned int MersenneBRandom(); // Output random bits
-; Takes no arguments: points rcx at the static MersenneInstance and jumps to
-; the shared implementation at ?Windows_MersBRandom.
-
-MersenneBRandom: ; PROC ; entry for both Windows and Linux call
-%IFDEF WINDOWS
-MersenneBRandomD: ; LABEL NEAR ; alias
-%ENDIF
- lea rcx, [MersenneInstance] ; Point to instance
- jmp ?Windows_MersBRandom
-;MersenneBRandom ENDP
-
-; Thread-safe version:
-; extern "C" unsigned int MersBRandom(void * Pthis); // Output random bits
-;
-; Returns the next premade 32-bit number in eax. Numbers are premade four at
-; a time: when the premade list is used up, four more are taken from the MT
-; buffer (with optional tempering) and stored both as integers (PreInt) and
-; as doubles in [1,2) (PreFlt). When the MT buffer is used up as well, it is
-; regenerated in place.
-; Internal entry ?Windows_MersBRandom: rcx = Pthis. On return rcx holds the
-; 16-byte aligned Pthis; edx and xmm0-xmm5 may be modified; rbx is preserved.
-
-MersBRandom: ; PROC
-%IFDEF UNIX
- mov rcx, rdi ; translate calling convention
-%ENDIF
-
-?Windows_MersBRandom: ; LABEL NEAR ; Label used internally
- and rcx, -16 ; align buffer
- mov edx, [rcx+CRandomMersenneA.PreInx] ; index into premade numbers
- mov eax, [rcx+rdx*1+CRandomMersenneA.PreInt] ; fetch premade random number
- add edx, 4
- mov [rcx+CRandomMersenneA.PreInx], edx
- cmp edx, 4*4
- jnb M410
- ret ; return premade number
-
-M410:
-; PREMADE list is empty. Make 4 more numbers ready for next call:
- mov edx, [rcx+CRandomMersenneA.MTI] ; fetch 4 numbers from MT buffer
- movdqa xmm0, oword [rcx+rdx*1+CRandomMersenneA.MT]
-
-%IF TEMPERING ; optional tempering algorithm
- movdqa xmm1, xmm0
- psrld xmm0, MERS_U
- pxor xmm0, xmm1
- movdqa xmm1, xmm0
- pslld xmm0, MERS_S
- pand xmm0, oword [rcx+CRandomMersenneA.TMB]
- pxor xmm0, xmm1
- movdqa xmm1, xmm0
- pslld xmm0, MERS_T
- pand xmm0, oword [rcx+CRandomMersenneA.TMC]
- pxor xmm0, xmm1
- movdqa xmm1, xmm0
- psrld xmm0, MERS_L
- pxor xmm0, xmm1
-%ENDIF ; tempering
-
- ; save four premade integers
- movdqa oword [rcx+CRandomMersenneA.PreInt], xmm0
- ; premake four floating point numbers
- pxor xmm1, xmm1
- pxor xmm2, xmm2
- punpckldq xmm1, xmm0 ; get first two numbers into bits 32-63 and 96-127
- punpckhdq xmm2, xmm0 ; get next two numbers into bits 32-63 and 96-127
- psrlq xmm1, 12 ; get bits into mantissa position
- psrlq xmm2, 12 ; get bits into mantissa position
- por xmm1,oword[rcx+CRandomMersenneA.one] ; set exponent for interval [1,2)
- por xmm2,oword[rcx+CRandomMersenneA.one] ; set exponent for interval [1,2)
- movdqa oword [rcx+CRandomMersenneA.PreFlt], xmm1 ; store two premade numbers
- movdqa oword [rcx+CRandomMersenneA.PreFlt+16],xmm2; store two more premade numbers
- mov dword [rcx+CRandomMersenneA.PreInx], 0 ; index to premade numbers
- add edx, 4*4 ; increment MTI index into MT buffer by 4
- mov [rcx+CRandomMersenneA.MTI], edx
- cmp edx, MERS_N*4
- jae M420
- ret ; return random number in eax
-
-; MT buffer exhausted. Make MERS_N new numbers ready for next time
-M420: ; eax is the random number to return
-%IF MERS_N & 3 ; if MERS_N is not divisible by 4
- NVALID equ MERS_N & 3 ; only NVALID of the 4 premade numbers are valid
- ; Move premade numbers (4-NVALID) positions forward
- movdqa xmm0, [rcx+CRandomMersenneA.PreInt]
- movdqa xmm1, [rcx+CRandomMersenneA.PreFlt]
- movdqa xmm2, [rcx+CRandomMersenneA.PreFlt+16]
- movdqu [rcx+CRandomMersenneA.PreInt + (4-NVALID)*4], xmm0
- movdqu [rcx+CRandomMersenneA.PreFlt + (4-NVALID)*8], xmm1
-%IF NVALID == 3
- movq [rcx+CRandomMersenneA.PreFlt+16 + 8], xmm2
-%ENDIF
- ; save index to first valid premade number
- ; (explicit dword size: a store of an immediate to memory has no implied
- ; operand size, and PreInx is accessed as a dword everywhere else)
- mov dword [rcx+CRandomMersenneA.PreInx], (4-NVALID)*4
-%ENDIF
-
-; MT buffer is empty. Fill it up
- push rbx
- movd xmm3, [rcx+CRandomMersenneA.UMASK] ; load constants
- movd xmm4, [rcx+CRandomMersenneA.LMASK]
- movd xmm5, [rcx+CRandomMersenneA.MATA]
- pshufd xmm3, xmm3, 0 ; broadcast constants
- pshufd xmm4, xmm4, 0
- pshufd xmm5, xmm5, 0
- xor rbx, rbx ; kk = 0
- mov edx, MERS_M*4 ; km
-
-; change rcx from pointing to CRandomMersenneA to pointing to CRandomMersenneA.MT
- add rcx, CRandomMersenneA.MT
-
-M430: ; kk loop
- movdqa xmm2, [rcx+rbx] ; mt[kk]
- movd xmm0, dword [rcx+rbx+16]
- movdqa xmm1, [rcx+rbx] ; mt[kk]
- movss xmm2, xmm0 ; faster than movdqu xmm2,[]
- pshufd xmm2, xmm2, 00111001B ; mt[kk+1]
- movdqu xmm0, oword [rcx+rdx] ; mt[km]
- ;movq xmm0, qword [rcx+rdx] ; mt[km]
- ;movhps xmm0, qword [rcx+rdx+8] ; faster than movdqu on older processors
- pand xmm1, xmm3 ; mt[kk] & UPPER_MASK
- pand xmm2, xmm4 ; mt[kk+1] & LOWER_MASK
- por xmm1, xmm2 ; y
- movdqa xmm2, xmm1 ; y
- pslld xmm1, 31 ; copy bit 0 into all bits
- psrad xmm1, 31 ; -(y & 1)
- pand xmm1, xmm5 ; & MERS_A
- psrld xmm2, 1 ; y >> 1
- pxor xmm0, xmm1
- pxor xmm0, xmm2
- movdqa [rcx+rbx], xmm0 ; result into mt[kk]
- cmp ebx, (MERS_N-4)*4
- jae M440 ; exit loop when kk past end of buffer
- add ebx, 16 ; kk += 4
- add rdx, 16 ; km += 4 (signed)
- cmp edx, (MERS_N-4)*4
- jbe M430 ; skip unless km wraparound
- sub rdx, MERS_N*4 ; km wraparound (signed)
- movdqu xmm0, [rcx+(MERS_N-4)*4] ; copy end to before begin for km wraparound
- movdqa [rcx-4*4], xmm0
- movdqa xmm0, [rcx] ; copy begin to after end for kk wraparound
- movdqu [rcx+MERS_N*4], xmm0
- jmp M430
-
-M440: ; loop finished. discard excess part of last result
-
-; change ecx back to pointing to CRandomMersenneA
- sub rcx, CRandomMersenneA.MT
-
- mov dword [rcx+CRandomMersenneA.MTI], 0
- pop rbx
- ret ; random number is still in eax
-
-;MersBRandom ENDP
-
-
-; Single threaded version:
-; extern "C" double MersenneRandom(); // Get floating point random number
-; Loads the address of the static MersenneInstance into the register that
-; carries the first argument on the current platform, then falls through
-; into MersRandom, which returns the result in xmm0.
-
-MersenneRandom: ; PROC ; entry for both Windows and Linux call
-%IFDEF WINDOWS
-MersenneRandomD: ; alias
- lea rcx, [MersenneInstance] ; Point to instance
- ; continue in next function
-%ENDIF
-%IFDEF UNIX
- lea rdi, [MersenneInstance] ; Point to instance
- ; continue in next function
-%ENDIF
-
-; Thread-safe version:
-; extern "C" double MersRandom(void * Pthis); // Get floating point random number
-; Returns the current premade double minus 1.0 in xmm0, then calls
-; ?Windows_MersBRandom to advance the generator. The result is parked in
-; TmpFlt across that call because the callee overwrites xmm0.
-MersRandom:
-%IFDEF UNIX
- mov rcx, rdi ; translate calling convention
-%ENDIF
- ; Align the buffer pointer exactly like the other entry points do.
- ; ?Windows_MersBRandom aligns rcx itself, so without this the TmpFlt store
- ; below and the TmpFlt reload after the call would use different addresses
- ; whenever Pthis is not 16-byte aligned.
- and rcx, -16 ; align buffer
- mov edx, [rcx+CRandomMersenneA.PreInx] ; index into premade numbers
- movsd xmm0, [rcx+rdx*2+CRandomMersenneA.PreFlt] ; fetch premade floating point random number
- subsd xmm0, [rcx+CRandomMersenneA.one] ; subtract 1.0
- movsd [rcx+CRandomMersenneA.TmpFlt], xmm0 ; store random number
- call ?Windows_MersBRandom ; prepare next random number
- movsd xmm0, [rcx+CRandomMersenneA.TmpFlt] ; recall random number
- ret
-;MersRandom ENDP
-
-
-
-; Single threaded version:
-; extern "C" unsigned int MersenneIRandom(int min, int max); // Get integer random number in desired interval
-; Pushes max and then min on the stack, points rcx at the static
-; MersenneInstance and joins MersIRandom at MersIRandom_max_min_on_stack.
-
-MersenneIRandom: ; PROC
-%IFDEF UNIX
- push rsi ; max
- push rdi ; min
- lea rcx, [MersenneInstance] ; Pthis = point to instance
- jmp MersIRandom_max_min_on_stack
-%ENDIF
-%IFDEF WINDOWS
-MersenneIRandomD: ; Alias
- push rdx ; max
- push rcx ; min
- lea rcx, [MersenneInstance] ; Pthis = point to instance
- jmp MersIRandom_max_min_on_stack
-%ENDIF
-;MersenneIRandom ENDP
-
-; Thread-safe version:
-; extern "C" int MersIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
-; Returns min + high dword of (random32 * (max - min + 1)) in eax, or
-; 80000000H if the 32-bit difference max - min is negative.
-; min and max are kept on the stack across the call to ?Windows_MersBRandom.
-MersIRandom: ; PROC
-%IFDEF UNIX
- ; translate calling convention
- mov r8d, edx ; max
- mov edx, esi ; min
- mov rcx, rdi ; Pthis
-%ENDIF
- push r8 ; max
- push rdx ; min
-MersIRandom_max_min_on_stack:
-
- call ?Windows_MersBRandom ; random bits
- pop rcx ; min
- pop rdx ; max
- sub edx, ecx
- js short M720 ; max < min
- add edx, 1 ; interval = max - min + 1
- mul edx ; multiply random number by interval and truncate
- lea eax, [rdx+rcx] ; add min
- ret
-M720: mov eax, 80000000H ; error exit
- ret
-;MersIRandom ENDP
-
-
-; Single threaded version:
-; extern "C" unsigned int MersenneIRandomX(int min, int max); // Get integer random number in desired interval
-; Moves min and max into the internal register layout (edx = min,
-; r8d = max), points rcx at the static MersenneInstance and jumps to
-; ?Windows_MersIRandomX.
-
-MersenneIRandomX: ; PROC
-%IFDEF UNIX
- mov r8d, esi ; max
- mov edx, edi ; min
- lea rcx, [MersenneInstance] ; Pthis = point to instance
- jmp ?Windows_MersIRandomX
-%ENDIF
-%IFDEF WINDOWS
-MersenneIRandomXD: ; alias
- mov r8d, edx ; max
- mov edx, ecx ; min
- lea rcx, [MersenneInstance] ; Pthis = point to instance
- jmp ?Windows_MersIRandomX
-%ENDIF
-;MersenneIRandomX ENDP
-
-; Thread-safe version:
-; extern "C" int MersIRandomX(void * Pthis, int min, int max); // Get integer random number in desired interval
-; Variant of MersIRandom that uses a rejection loop: a draw is repeated while
-; the low dword of random32 * interval is above RLimit. RLimit is cached in
-; the structure together with LastInterval and is recomputed only when the
-; interval changes.
-; Returns min if max == min, and 80000000H if max < min.
-MersIRandomX: ; PROC
-%IFDEF UNIX
- ; translate calling convention
- mov r8d, edx ; max
- mov edx, esi ; min
- mov rcx, rdi ; Pthis
-%ENDIF
-
-?Windows_MersIRandomX:
-; parameters: rcx = Pthis, edx = min, r8d = max
-
- and rcx, -16 ; align buffer
- push rdi
- mov edi, r8d ; max
-
- sub edi, edx ; max - min
- jle short M830 ; max <= min (signed)
- inc edi ; interval = max - min + 1
- push rdx ; save min
-
- ; if (interval != LastInterval) {
- cmp edi, [rcx+CRandomMersenneA.LastInterval]
- je M810
- ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
- xor eax, eax ; 0
- lea edx, [rax+1] ; 1
- div edi ; (would give overflow if interval = 1)
- mul edi
- dec eax
- mov [rcx+CRandomMersenneA.RLimit], eax
- mov [rcx+CRandomMersenneA.LastInterval], edi
-M810:
-M820: ; do { // Rejection loop
- call ?Windows_MersBRandom ; random bits (rcx is preserved)
- ; longran = (uint64)BRandom() * interval;
- mul edi ; edx = high dword (result candidate), eax = low dword (remainder)
- ; } while (remainder > RLimit);
- cmp eax, [rcx+CRandomMersenneA.RLimit]
- ja M820
-
- ; return (int32)iran + min
- pop rax ; min
- add eax, edx
- pop rdi
- ret
-
-M830: jl M840 ; flags are still those of "sub edi, edx" above
- ; max = min. Return min
- mov eax, edx
- pop rdi
- ret ; max = min exit
-
-M840: ; max < min: error
- mov eax, 80000000H ; error exit
- pop rdi
- ret
-;MersIRandomX ENDP
diff --git a/contrib/libs/asmlib/mother64.asm b/contrib/libs/asmlib/mother64.asm
deleted file mode 100644
index c6fd34ec3b..0000000000
--- a/contrib/libs/asmlib/mother64.asm
+++ /dev/null
@@ -1,242 +0,0 @@
-%include "defs.asm"
-
-; ----------------------------- MOTHER64.ASM -----------------------------
-; Author: Agner Fog
-; Date created: 1998
-; Last modified: 2013-09-11
-; Source URL: www.agner.org/optimize
-; Project: asmlib.zip
-; Language: assembly, NASM/YASM syntax, 64 bit
-; Description:
-; Mother-of-All random number generator by Agner Fog
-; 64-bit mode version for x86-64 compatible microprocessors.
-;
-; This is a multiply-with-carry type of random number generator
-; invented by George Marsaglia. The algorithm is:
-; S = 2111111111*X[n-4] + 1492*X[n-3] + 1776*X[n-2] + 5115*X[n-1] + C
-; X[n] = S modulo 2^32
-; C = floor(S / 2^32)
-;
-; C++ prototypes:
-; extern "C" void MotRandomInit(void * Pthis, int seed); // Initialization
-; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
-; extern "C" double MotRandom(void * Pthis); // Get floating point random number
-; extern "C" unsigned int MotBRandom(void * Pthis); // Output random bits
-;
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-; structure definition and constants:
-%INCLUDE "randomah.asi"
-
-; publics:
-global MotherBRandom, MotBRandom, ?Windows_MotBRandom
-global MotherRandom, MotRandom, MotherIRandom, MotIRandom
-global MotherRandomInit, MotRandomInit
-
-section .data
-align 16
-
-; Data for single instance of random number generator
-MotherInstance: ISTRUC CRandomMotherA
-IEND
-; Size of structure
-MotherSize equ $-MotherInstance
-
-
-SECTION .CODE ALIGN=16 ; code segment
-
-; Single threaded version (uses the static MotherInstance):
-; extern "C" unsigned int MotherBRandom(); // Output random bits
-
-MotherBRandom: ; PROC ; entry for both Windows and Linux call
- lea rcx, [MotherInstance] ; Point to instance
- jmp ?Windows_MotBRandom ; continue in the common code with rcx = instance
-;MotherBRandom ENDP
-
-; Thread-safe version (caller supplies the instance buffer):
-; extern "C" unsigned int MotBRandom(void * Pthis); // Output random bits
-
-MotBRandom: ; PROC
-%IFDEF UNIX
- mov rcx, rdi ; translate calling convention
-%ENDIF
-?Windows_MotBRandom: ; common entry: rcx = Pthis; result in eax; uses xmm1, xmm2, xmm3
- and rcx, -16 ; align (round Pthis down to a multiple of 16; movdqa needs aligned operands)
- movdqa xmm1, oword [rcx+CRandomMotherA.M3] ; load M3,M2,M1,M0
- mov eax, [rcx+CRandomMotherA.M0] ; Retrieve previous random number (this is the return value)
- movdqa xmm2, xmm1 ; copy
- movdqa xmm3, oword [rcx+CRandomMotherA.MF3] ; factors
- psrlq xmm2, 32 ; move M2,M0 down
- movq qword [rcx+CRandomMotherA.M4], xmm1 ; M4=M3, M3=M2
- movhps qword [rcx+CRandomMotherA.M2], xmm1 ; M2=M1, M1=M0
- pmuludq xmm1, xmm3 ; M3*MF3, M1*MF1
- psrlq xmm3, 32 ; move MF2,MF0 down
- pmuludq xmm2, xmm3 ; M2*MF2, M0*MF0
- paddq xmm1, xmm2 ; P2+P3, P0+P1
- movhlps xmm2, xmm1 ; Get high qword
- paddq xmm1, xmm2 ; P0+P1+P2+P3
- paddq xmm1, oword [rcx+CRandomMotherA.MC] ; +carry
- movq qword [rcx+CRandomMotherA.M0], xmm1 ; Store new M0 and carry (M0 and MC are adjacent dwords)
- ; convert to double precision float
- psllq xmm1, 32 ; Discard carry bits
- psrlq xmm1, 12 ; Get bits into mantissa position
- por xmm1, oword [rcx+CRandomMotherA.one] ; Add exponent bits to get number in interval [1,2)
- movq [rcx+CRandomMotherA.RanP1], xmm1 ; Store floating point number (only the low qword is kept)
- ret
-
-;MotBRandom ENDP
-
-
-; Single threaded version (uses the static MotherInstance):
-; extern "C" double MotherRandom(); // Get floating point random number (returned in xmm0)
-
-MotherRandom:
-%IFDEF UNIX
- lea rdi, [MotherInstance] ; Point to instance
-%ENDIF
-%IFDEF WINDOWS
- lea rcx, [MotherInstance] ; Point to instance
-%ENDIF
- ; no jump needed: execution falls through into MotRandom
-; Thread-safe version (caller supplies the instance buffer):
-; extern "C" double MotRandom(void * Pthis); // Get floating point random number
-MotRandom:
-%IFDEF UNIX
- mov rcx, rdi ; translate calling convention
-%ENDIF
- and rcx, -16 ; align (round Pthis down to a multiple of 16)
- ; get previously prepared random number
- movsd xmm0, [rcx+CRandomMotherA.RanP1] ; value in [1,2) stored by the previous MotBRandom call
- subsd xmm0, [rcx+CRandomMotherA.one] ; subtract 1.0
-
- ; make new random number ready for next time
- call ?Windows_MotBRandom ; does not touch xmm0, so the result above is kept
- ret
-;MotherRandom ENDP
-
-
-; Single threaded version (uses the static MotherInstance):
-; extern "C" int MotherIRandom(int min, int max); // Get integer random number in desired interval
-
-MotherIRandom: ; PROC
-%IFDEF UNIX
- mov r8d, esi ; max
- mov edx, edi ; min
- lea rcx, [MotherInstance] ; Pthis = point to instance
- jmp ?Windows_MotIRandom ; continue in the common code
-%ENDIF
-%IFDEF WINDOWS
- mov r8d, edx ; max
- mov edx, ecx ; min
- lea rcx, [MotherInstance] ; Pthis = point to instance
- jmp ?Windows_MotIRandom ; continue in the common code
-%ENDIF
-; MotherIRandom ENDP
-
-; Thread-safe version (caller supplies the instance buffer):
-; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
-MotIRandom:
-%IFDEF UNIX
- ; translate calling convention (rdi, esi, edx -> rcx, edx, r8d)
- mov r8d, edx ; max
- mov edx, esi ; min
- mov rcx, rdi ; Pthis
-%ENDIF
-
-?Windows_MotIRandom: ; LABEL NEAR ; entry for Windows call: rcx = Pthis, edx = min, r8d = max
- and rcx, -16 ; align
- push r8 ; keep max across the call
- push rdx ; keep min across the call
- call ?Windows_MotBRandom ; make random number
- pop rcx ; min
- pop r8 ; max
- sub r8d, ecx ; max - min
- js short rerror ; max < min
- inc r8d ; interval = max - min + 1
- mul r8d ; multiply random number eax by interval and truncate
- lea eax, [rdx+rcx] ; add min to interval*BRandom >> 32
- ret ; arguments are in registers, nothing to remove from the stack
-
-rerror: mov eax, 80000000h ; error exit
- ret ; arguments are in registers, nothing to remove from the stack
-;MotIRandom ENDP
-
-
-; Single threaded version (uses the static MotherInstance):
-; extern "C" void MotherRandomInit(int seed); // Initialization
-
-MotherRandomInit: ; PROC
-%IFDEF UNIX
- mov edx, edi ; seed
- lea rcx, [MotherInstance] ; Pthis = point to instance
- jmp ?Windows_MotRandomInit ; continue in the common code
-%ENDIF
-%IFDEF WINDOWS
- mov edx, ecx ; seed
- lea rcx, [MotherInstance] ; Pthis = point to instance
- jmp ?Windows_MotRandomInit ; continue in the common code
-%ENDIF
-;MotherRandomInit ENDP
-
-; Thread-safe version (caller supplies the instance buffer):
-; extern "C" void MotRandomInit(void * Pthis, int seed); // Initialization
-MotRandomInit: ; PROC
-%IFDEF UNIX
- ; translate calling convention (rdi, esi -> rcx, edx)
- mov edx, esi ; seed
- mov rcx, rdi ; Pthis
-%ENDIF
-
-?Windows_MotRandomInit: ; LABEL NEAR ; entry for Windows call: rcx = Pthis, edx = seed
- and rcx, -16 ; align
- ; clear my buffer
- push rdi ; rdi is used for stosd and as loop counter; restored before ret
- push rcx ; rcx is needed as the stosd count; restored after the fill
- mov rdi, rcx ; Pthis
- add rdi, 16 ; skip the first 16 bytes of the structure
- mov ecx, (MotherSize - 16) / 4 ; number of dwords after the first 16 bytes
- xor eax, eax ; fill value 0
- cld
- rep stosd
- pop rcx ; rcx = Pthis again
-
- ; insert constants
- mov dword [rcx+CRandomMotherA.one+4], 3FF00000H ; high dword of 1.0
- mov dword [rcx+CRandomMotherA.MF0], 5115 ; factors
- mov dword [rcx+CRandomMotherA.MF1], 1776
- mov dword [rcx+CRandomMotherA.MF2], 1492
- mov dword [rcx+CRandomMotherA.MF3], 2111111111
-
- ; initialize from seed
- mov eax, edx ; seed
- ; make random numbers and put them into buffer (each value = previous * 29943829 - 1)
- mov edx, 29943829
- imul eax, edx
- dec eax
- mov [rcx+CRandomMotherA.M0], eax
- imul eax, edx
- dec eax
- mov [rcx+CRandomMotherA.M1], eax
- imul eax, edx
- dec eax
- mov [rcx+CRandomMotherA.M2], eax
- imul eax, edx
- dec eax
- mov [rcx+CRandomMotherA.M3], eax
- imul eax, edx
- dec eax
- mov [rcx+CRandomMotherA.MC], eax
-
- ; randomize some more: discard 20 outputs
- mov edi, 20 ; loop counter
-r90: call ?Windows_MotBRandom ; (rcx and rdi unchanged)
- dec edi
- jnz r90
- pop rdi
- ret
-;MotRandomInit ENDP
-
- ; END
diff --git a/contrib/libs/asmlib/physseed64.asm b/contrib/libs/asmlib/physseed64.asm
deleted file mode 100644
index b30fc26712..0000000000
--- a/contrib/libs/asmlib/physseed64.asm
+++ /dev/null
@@ -1,396 +0,0 @@
-%include "defs.asm"
-
-;************************* physseed64.asm **********************************
-; Author: Agner Fog
-; Date created: 2010-08-03
-; Last modified: 2013-09-13
-; Source URL: www.agner.org/optimize
-; Project: asmlib.zip
-; C++ prototype:
-; extern "C" int PhysicalSeed(int seeds[], int NumSeeds);
-;
-; Description:
-; Generates a non-deterministic random seed from a physical random number generator
-; which is available on some processors.
-; Uses the time stamp counter (which is less random) if no physical random number
-; generator is available.
-; The code is not optimized for speed because it is typically called only once.
-;
-; Parameters:
-; int seeds[] An array which will be filled with random numbers
-; int NumSeeds Indicates the desired number of 32-bit random numbers
-;
-; Return value: 0 Failure. No suitable instruction available (processor older than Pentium)
-; 1 No physical random number generator. Used time stamp counter instead
-; 2 Success. VIA physical random number generator used
-; 3 Success. Intel physical random number generator used
-; 4 Success. Intel physical seed generator used
-;
-; The return value will indicate the availability of a physical random number generator
-; even if NumSeeds = 0.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-%define NUM_TRIES 20 ; max number of tries for rdseed and rdrand instructions
-
-%define TESTING 0 ; 1 for test only
-
-global PhysicalSeed
-
-; Direct entries to CPU-specific versions (exported with the "function" symbol type)
-global PhysicalSeedNone: function
-global PhysicalSeedRDTSC: function
-global PhysicalSeedVIA: function
-global PhysicalSeedRDRand: function
-global PhysicalSeedRDSeed: function
-
-; ***************************************************************************
-; Define registers used for function parameters, used in 64-bit mode only
-; ***************************************************************************
-
-%IFDEF WINDOWS
- %define par1 rcx
- %define par2 rdx
- %define par3 r8
- %define par1d ecx
- %define par2d edx
- %define par3d r8d
-%ENDIF
-
-%IFDEF UNIX
- %define par1 rdi
- %define par2 rsi
- %define par3 rdx
- %define par1d edi
- %define par2d esi
- %define par3d edx
-%ENDIF
-
-
-SECTION .text align=16
-
-%IFDEF WINDOWS
-global PhysicalSeedD@8 ; DLL version
-PhysicalSeedD@8: ; same address as PhysicalSeed
-%ENDIF
-
-PhysicalSeed: ; public entry: arguments are passed on untouched to the selected version
- jmp [PhysicalSeedDispatch] ; Go to appropriate version, depending on instructions available
-
-
-PhysicalSeedRDSeed: ; par1 = seeds, par2d = NumSeeds; fills seeds with RDSEED output
- push rbx ; ebx is the retry counter
- test par2d, par2d ; NumSeeds
- jz S300 ; nothing to store: report the method only
- js S900 ; negative NumSeeds: failure
- mov par3d, par2d ; NumSeeds
- shr par3d, 1 ; NumSeeds/2 = number of 64-bit stores
- jz S150
- ; do 64 bits at a time
-S100: mov ebx, NUM_TRIES
-S110: ; rdseed rax
-%if TESTING
- mov eax, par3d
- stc
-%ELSE
- db 48h, 0Fh, 0C7h, 0F8h ; rdseed rax
-%ENDIF
- jc S120 ; carry set = a value was delivered
- ; failed. try again
- dec ebx
- jz S900 ; NUM_TRIES attempts used up
- jmp S110
-S120: mov [par1], rax
- add par1, 8
- dec par3d
- jnz S100 ; loop 64 bits
-S150:
- and par2d, 1
- jz S300
- ; an odd 32 bit remains
-S200: mov ebx, NUM_TRIES
-S210: ; rdseed eax
-%if TESTING
- mov eax, par3d
- stc
-%ELSE
- db 0Fh, 0C7h, 0F8h ; rdseed eax
-%ENDIF
- jc S220 ; carry set = a value was delivered
- ; failed. try again
- dec ebx
- jz S900 ; NUM_TRIES attempts used up
- jmp S210
-S220: mov [par1], eax
-S300: mov eax, 4 ; return value
- pop rbx
- ret
-S900: ; failure
- xor eax, eax ; return 0
- pop rbx
- ret
-
-
-PhysicalSeedRDRand: ; par1 = seeds, par2d = NumSeeds; fills seeds with RDRAND output
- push rbx ; ebx is the retry counter
- test par2d, par2d ; NumSeeds
- jz R300 ; nothing to store: report the method only
- js R900 ; negative NumSeeds: failure
- mov par3d, par2d ; NumSeeds
- shr par3d, 1 ; NumSeeds/2
- jz R150
- ; do 64 bits at a time
-R100: mov ebx, NUM_TRIES
-R110: ; rdrand rax
-%if TESTING
- mov eax, par3d
- stc
-%ELSE
- db 48h, 0Fh, 0C7h, 0F0h ; rdrand rax
-%ENDIF
- jc R120 ; carry set = a value was delivered
- ; failed. try again
- dec ebx
- jz R900 ; NUM_TRIES attempts used up
- jmp R110
-R120: mov [par1], rax
- add par1, 8
- dec par3d
- jnz R100 ; loop 64 bits
-R150:
- and par2d, 1
- jz R300
- ; an odd 32 bit remains
-R200: mov ebx, NUM_TRIES
-R210: ; rdrand eax
-%if TESTING
- mov eax, par3d
- stc
-%ELSE
- db 0Fh, 0C7h, 0F0h ; rdrand eax
-%ENDIF
- jc R220 ; carry set = a value was delivered
- ; failed. try again
- dec ebx
- jz R900 ; NUM_TRIES attempts used up
- jmp R210
-R220: mov [par1], eax
-R300: mov eax, 3 ; return value 3 = Intel physical random number generator (4 is reserved for RDSEED)
- pop rbx
- ret
-R900: ; failure
- xor eax, eax ; return 0
- pop rbx
- ret
-
-
-PhysicalSeedVIA:
-; VIA XSTORE supported
- push rbx ; rbx holds the output pointer in the odd-count path below
-%IFDEF WINDOWS
- push rsi
- push rdi
- mov rdi, rcx ; seeds
- mov esi, edx ; NumSeeds
-%ENDIF
- mov ecx, esi ; NumSeeds
- and ecx, -2 ; round down to nearest even
- jz T200 ; NumSeeds <= 1
- ; make an even number of random dwords
- shl ecx, 2 ; number of bytes (divisible by 8)
- mov edx, 3 ; quality factor
-%if TESTING
- mov eax, 1
- rep stosb
-%ELSE
- db 0F3H, 00FH, 0A7H, 0C0H ; rep xstore instruction
-%ENDIF
-T200:
- test esi, 1
- jz T300
- ; NumSeeds is odd. Make 8 bytes in temporary buffer and store 4 of the bytes
- mov rbx, rdi ; current output pointer
- mov ecx, 4 ; Will generate 4 or 8 bytes, depending on CPU
- mov edx, 3 ; quality factor
- push rcx ; make temporary space on stack
- mov rdi, rsp ; point to buffer on stack
-%if TESTING
- mov eax, 1
- rep stosb
-%ELSE
- db 0F3H, 00FH, 0A7H, 0C0H ; rep xstore instruction
-%ENDIF
- pop rax ; fetch the temporary buffer contents and release the stack space
- mov [rbx], eax ; store the last 4 bytes
-T300:
- mov eax, 2 ; return value
-%IFDEF WINDOWS
- pop rdi
- pop rsi
-%ENDIF
- pop rbx
- ret
-
-
-PhysicalSeedRDTSC: ; fallback: seeds[0..1] = time stamp counter, the rest = 0
-%IFDEF WINDOWS
- push rbx ; save caller's rbx (restored at U300/U900)
- push rcx ; seeds: cpuid overwrites ecx
- push rdx ; NumSeeds: cpuid and rdtsc overwrite edx
- xor eax, eax
- cpuid ; serialize
- rdtsc ; get time stamp counter
- pop rbx ; numseeds
- pop rcx ; seeds
- test ebx, ebx
- jz U300 ; zero seeds
- js U900 ; failure
- mov [rcx], eax ; store time stamp counter as seeds[0]
- add rcx, 4
- dec ebx
- jz U300
- mov [rcx], edx ; store upper part of time stamp counter as seeds[1]
- add rcx, 4
- dec ebx
- jz U300
- xor eax, eax
-U100: mov [rcx], eax ; store 0 for the rest
- add rcx, 4
- dec ebx
- jnz U100
-U300: mov eax, 1 ; return value
- pop rbx
- ret
-U900: ; failure
- xor eax, eax ; return 0
- pop rbx
- ret
-
-%ELSE ; UNIX
-
- push rbx ; ebx is overwritten by cpuid
- xor eax, eax
- cpuid ; serialize
- rdtsc ; get time stamp counter
- test esi, esi ; numseeds
- jz U300 ; zero seeds
- js U900 ; failure
- mov [rdi], eax ; store time stamp counter as seeds[0]
- add rdi, 4
- dec esi
- jz U300
- mov [rdi], edx ; store upper part of time stamp counter as seeds[1]
- add rdi, 4
- dec esi
- jz U300
- xor eax, eax
-U100: mov [rdi], eax ; store 0 for the rest
- add rdi, 4
- dec esi
- jnz U100
-U300: mov eax, 1 ; return value
- pop rbx
- ret
-U900: ; failure
- xor eax, eax ; return 0
- pop rbx
- ret
-
-%ENDIF
-
-
-PhysicalSeedNone: ; no possible generation: fill seeds with zeroes and return 0
- xor eax, eax ; 0 is both the fill value and the return value
- test par2d, par2d ; numseeds
- jz N200
-N100: mov [par1], eax ; seeds[i] = 0
- add par1, 4
- dec par2d
- jnz N100
-N200: ret ; return 0
-
-
-PhysicalSeedDispatcher: ; initial target of PhysicalSeedDispatch: picks a version, stores it, jumps to it
- push rbx ; ebx is overwritten by cpuid
-%IFDEF WINDOWS
- push rcx ; the two arguments live in rcx and rdx on Windows;
- push rdx ; cpuid overwrites both, so keep them on the stack
-%ENDIF
- ; test if RDSEED supported
- xor eax, eax
- cpuid ; leaf 0: eax is compared against 7 below
- cmp eax, 7
- jb P200 ; RDSEED not supported
- mov eax, 7
- xor ecx, ecx
- cpuid
- bt ebx, 18 ; result is currently unused because the jump below is disabled
- ; jc USE_RDSEED ; not tested yet!!
-
-P200: ; test if RDRAND supported
- mov eax, 1
- cpuid
- bt ecx, 30
- jc USE_RDRAND
-
- ; test if VIA xstore instruction supported
- mov eax, 0C0000000H
- push rax ; keep the leaf number for the comparison
- cpuid
- pop rbx ; ebx = 0C0000000H
- cmp eax, ebx
- jna P300 ; not a VIA processor
- lea eax, [rbx+1] ; leaf 0C0000001H
- cpuid
- bt edx, 3
- jc VIA_METHOD
-
-P300: ; test if RDTSC supported
- mov eax, 1
- cpuid
- bt edx, 4
- jc USE_RDTSC ; XSTORE instruction not supported or not enabled
-
-FAILURE: ; No useful instruction supported
- lea rax, [PhysicalSeedNone]
- jmp P800
-
-USE_RDRAND: ; Use RDRAND instruction
- lea rax, [PhysicalSeedRDRand]
- jmp P800
-
-USE_RDSEED: ; Use RDSEED instruction (not tested yet)
- lea rax, [PhysicalSeedRDSeed]
- jmp P800
-
-VIA_METHOD: ; Use VIA xstore instructions
- lea rax, [PhysicalSeedVIA]
- jmp P800
-
-USE_RDTSC:
- lea rax, [PhysicalSeedRDTSC]
- ;jmp P800
-
-P800: mov [PhysicalSeedDispatch], rax ; later calls to PhysicalSeed jump straight to the chosen version
-%IFDEF WINDOWS
- pop rdx
- pop rcx
-%ENDIF
- pop rbx
- jmp rax ; continue in dispatched version
-
-
-; -----------------------------------------------------------------
-; Data section for dispatcher
-; -----------------------------------------------------------------
-
-SECTION .data
-
-; Pointer to appropriate versions. Initially point to dispatcher
-PhysicalSeedDispatch DQ PhysicalSeedDispatcher
-
-%IFDEF POSITIONINDEPENDENT
-; Fix potential problem in Mac linker
- DD 0, 0
-%ENDIF
diff --git a/contrib/libs/asmlib/popcount64.asm b/contrib/libs/asmlib/popcount64.asm
deleted file mode 100644
index c4ad64e03b..0000000000
--- a/contrib/libs/asmlib/popcount64.asm
+++ /dev/null
@@ -1,112 +0,0 @@
-%include "defs.asm"
-
-;************************* popcount64.asm ************************************
-; Author: Agner Fog
-; Date created: 2011-07-20
-; Last modified: 2011-07-20
-
-; Description:
-; Population count function. Counts the number of 1-bits in a 32-bit integer
-; unsigned int A_popcount (unsigned int x);
-;
-; Position-independent code is generated if POSITIONINDEPENDENT is defined.
-;
-; CPU dispatching included for 386 and SSE4.2 instruction sets.
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-default rel
-
-global A_popcount: function
-
-; Direct entries to CPU-specific versions
-global popcountGeneric: function
-global popcountSSE42: function
-
-; Imported from instrset64.asm:
-extern InstructionSet ; Instruction set for CPU dispatcher
-
-section .text
-
-;******************************************************************************
-; popcount function
-;******************************************************************************
-
-
-A_popcount: ; function dispatching
- jmp near [popcountDispatch] ; Go to appropriate version, depending on instruction set
-
-align 16
-popcountSSE42: ; SSE4.2 version: a single popcnt on the first argument register
-%ifdef WINDOWS
- popcnt eax, ecx ; x is in ecx
-%else
- popcnt eax, edi ; x is in edi
-%endif
- ret
-
-
-;******************************************************************************
-; popcount function generic
-;******************************************************************************
-
-popcountGeneric: ; Generic version: adds neighbouring bit fields of doubling width
-%ifdef WINDOWS
- mov eax, ecx ; x
-%else
- mov eax, edi ; x
-%endif
- mov edx, eax
- shr eax, 1
- and eax, 55555555h ; odd bits in eax, even bits in edx
- and edx, 55555555h
- add eax, edx ; 2-bit sums
- mov edx, eax
- shr eax, 2
- and eax, 33333333h
- and edx, 33333333h
- add eax, edx ; 4-bit sums
- mov edx, eax
- shr eax, 4
- add eax, edx
- and eax, 0F0F0F0Fh ; one sum per byte
- mov edx, eax
- shr eax, 8
- add eax, edx
- mov edx, eax
- shr eax, 16
- add eax, edx
- and eax, 03FH ; keep the low 6 bits (the count is at most 32)
- ret
-;popcountGeneric end
-
-; ********************************************************************************
-; CPU dispatching for popcount. This is executed only once
-; ********************************************************************************
-
-%ifdef WINDOWS
-%define par1 rcx ; parameter 1, the integer x (full register, so it can be pushed)
-%else
-%define par1 rdi ; parameter 1, the integer x (full register, so it can be pushed)
-%endif
-
-popcountCPUDispatch:
- ; get supported instruction set
- push par1 ; keep x across the call
- call InstructionSet
- pop par1
- ; Point to generic version of popcount
- lea rdx, [popcountGeneric]
- cmp eax, 9 ; check popcnt supported
- jb Q100
- ; SSE4.2 supported
- ; Point to SSE4.2 version of popcount
- lea rdx, [popcountSSE42]
-Q100: mov [popcountDispatch], rdx ; later calls to A_popcount jump straight to the chosen version
- ; Continue in appropriate version
- jmp rdx
-
-SECTION .data
-
-; Pointer to appropriate versions. Initially point to dispatcher
-popcountDispatch DQ popcountCPUDispatch
diff --git a/contrib/libs/asmlib/procname64.asm b/contrib/libs/asmlib/procname64.asm
deleted file mode 100644
index 1b77b74320..0000000000
--- a/contrib/libs/asmlib/procname64.asm
+++ /dev/null
@@ -1,145 +0,0 @@
-%include "defs.asm"
-
-; procname64.asm
-;
-; Author: Agner Fog
-; Date created: 2007
-; Last modified: 2011-07-02
-; Description:
-; ProcessorName
-; =============
-; This function produces a zero-terminated ASCII string containing a name
-; for the microprocessor in human-readable format.
-;
-; Copyright (c) 2007-2011 GNU General Public License www.gnu.org/licenses
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-global ProcessorName: function
-
-SECTION .data
-align 16
-
-NameBuffer times 50H db 0 ; Static buffer to contain name
-
-
-SECTION .text align=16
-
-; ********** ProcessorName function **********
-; C++ prototype:
-; char * ProcessorName (void);
-
-; This function finds the name of the microprocessor. The name is written to the
-; static NameBuffer (50H bytes) and a pointer to that buffer is returned in rax.
-
-ProcessorName:
- push rbx ; ebx is overwritten by cpuid
- push rdi ; rdi is used as the text write pointer
- lea rdi, [NameBuffer] ; text pointer
-
- mov eax, 80000000H
- cpuid
- cmp eax, 80000004H ; test if extended vendor string available
- jb no_ext_vendor_string
-
- ; Has extended vendor string
- mov eax, 80000002H
- cpuid
- mov [rdi], eax ; store 16 bytes of extended vendor string
- mov [rdi+4], ebx
- mov [rdi+8], ecx
- mov [rdi+0CH], edx
- mov eax, 80000003H
- cpuid
- mov [rdi+10H], eax ; next 16 bytes
- mov [rdi+14H], ebx
- mov [rdi+18H], ecx
- mov [rdi+1CH], edx
- mov eax, 80000004H
- cpuid
- mov [rdi+20H], eax ; next 16 bytes
- mov [rdi+24H], ebx
- mov [rdi+28H], ecx
- mov [rdi+2CH], edx
- jmp get_family_and_model
-
-no_ext_vendor_string:
- ; No extended vendor string. Get short vendor string
- xor eax, eax
- cpuid
- mov [rdi],ebx ; store short vendor string
- mov [rdi+4],edx
- mov [rdi+8],ecx
- mov byte [rdi+12],0 ; terminate string
-
-get_family_and_model:
- xor eax, eax ; al = 0: the byte to search for
- mov ecx, 30H ; search at most 30H bytes
- cld
- repne scasb ; find end of text
- dec rdi ; scasb stops one past the match: step back onto it
-
- mov dword [rdi], ' Fam' ; Append text " Family "
- mov dword [rdi+4], 'ily '
- add rdi, 8
-
- mov eax, 1
- cpuid ; Get family and model
- mov ebx, eax ; keep the cpuid result for the model below
- mov ecx, eax
- shr eax, 8
- and eax, 0FH ; Family
- shr ecx, 20
- and ecx, 0FFH ; Extended family
- add eax, ecx ; Family + extended family
- call WriteHex ; Write as hexadecimal
-
- mov dword [rdi], 'H Mo' ; Write text "H Model "
- mov dword [rdi+4], 'del '
- add rdi, 8
-
- mov eax, ebx
- shr eax, 4
- and eax, 0FH ; Model
- mov ecx, ebx
- shr ecx, 12
- and ecx, 0F0H ; Extended model
- or eax, ecx ; Model | extended model
- call WriteHex ; Write as hexadecimal
-
- mov dword [rdi], 'H' ; Write text "H" (the three zero bytes after it terminate the string)
-
-PNEND: ; finished
- lea rax, [NameBuffer] ; Pointer to result
- pop rdi
- pop rbx
- ret
-;ProcessorName ENDP
-
-WriteHex: ; Local function: Write 2 hexadecimal digits
- ; Parameters: AL = number to write, RDI = text destination. Changes ecx; advances rdi by 2
- mov ecx, eax
- shr ecx, 4
- and ecx, 0FH ; most significant digit first
- cmp ecx, 10
- jnb W1
- ; 0 - 9
- add ecx, '0'
- jmp W2
-W1: ; A - F
- add ecx, 'A' - 10
-W2: mov [rdi], cl ; write digit
-
- mov ecx, eax
- and ecx, 0FH ; next digit
- cmp ecx, 10
- jnb W3
- ; 0 - 9
- add ecx, '0'
- jmp W4
-W3: ; A - F
- add ecx, 'A' - 10
-W4: mov [rdi+1], cl ; write digit
- add rdi, 2 ; advance string pointer
- ret
diff --git a/contrib/libs/asmlib/randomah.asi b/contrib/libs/asmlib/randomah.asi
deleted file mode 100644
index ed7a0185a4..0000000000
--- a/contrib/libs/asmlib/randomah.asi
+++ /dev/null
@@ -1,290 +0,0 @@
-; ----------------------------- RANDOMAH.ASI ---------------------------
-;
-; Author: Agner Fog
-; Date created: 1998
-; Last modified: 2013-09-09
-; Description:
-; Assembly include file containing
-; structure/class definitions for random number generators
-;
-; Copyright (c) 1998-2013 GNU General Public License www.gnu.org/licenses
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Definitions for Mersenne Twister:
-
-TEMPERING EQU 1 ; set to 0 if no tempering (improves speed by 25%)
-
-%if 0
-; define constants for MT11213A:
-MERS_N EQU 351
-MERS_M EQU 175
-MERS_R EQU 19
-MERS_A EQU 0E4BD75F5H
-MERS_U EQU 11
-MERS_S EQU 7
-MERS_T EQU 15
-MERS_L EQU 17
-MERS_B EQU 655E5280H
-MERS_C EQU 0FFD58000H
-
-%ELSE
-; or constants for MT19937:
-MERS_N EQU 624
-MERS_M EQU 397
-MERS_R EQU 31
-MERS_A EQU 09908B0DFH
-MERS_U EQU 11
-MERS_S EQU 7
-MERS_T EQU 15
-MERS_L EQU 18
-MERS_B EQU 9D2C5680H
-MERS_C EQU 0EFC60000H
-
-%ENDIF
-
-LOWER_MASK EQU (1 << MERS_R) - 1 ; lower MERS_R bits
-UPPER_MASK EQU -1 << MERS_R ; upper 32-MERS_R bits
-
-; Define class CRandomMersenneA member data (layout of one Mersenne Twister instance)
-; Must be aligned by 16.
-
-STRUC CRandomMersenneA
-.Fill1 RESD 4 ; Alignment filler (16 bytes)
-.PreInt: RESD 4 ; premade tempered integer numbers, ready to use
-.PreFlt: RESQ 4 ; premade floating point numbers, ready to use (subtract 1.0)
- RESQ 1 ; last PreFlt unaligned overrun if MERS_N mod 4 = 1
-.TmpFlt: RESQ 1 ; temporary storage of floating point random number
-.PreInx: RESD 1 ; index to next PreInt and PreFlt number
-.Instset: RESD 1 ; Instruction set
-.LastInterval: RESD 1 ; Last interval length for IRandomX
-.RLimit: RESD 1 ; Rejection limit used by IRandomX
-.TMB: RESD 4 ; 4 copies of MERS_B constant
-.TMC: RESD 4 ; 4 copies of MERS_C constant
-.one: RESQ 2 ; 2 copies of 1.0 constant
-.MTI: RESD 1 ; index into MT buffer
-.UMASK: RESD 1 ; UPPER_MASK
-.LMASK: RESD 1 ; LOWER_MASK ; constants
-.MATA: RESD 1 ; MERS_A
-.wrap1: RESD 4 ; MT buffer km wraparound
-.MT: RESD MERS_N ; MT history buffer (aligned by 16; starts at offset 176)
-.wrap2: RESD 4 ; MT buffer kk wraparound
-%if MERS_N & 3
- ; MERS_N not divisible by 4. align by 4 (pads the structure to a whole number of 4-dword groups)
- RESD (4 - (MERS_N & 3))
-%ENDIF
-endstruc ; CRandomMersenneA
-
-
-; Definitions for Mother-of-all generator:
-
-; Define class CRandomMotherA member data (layout of one Mother-of-All instance, 80 bytes)
-; Must be aligned by 16. Preferably aligned by 64 to fit a cache line
-STRUC CRandomMotherA
-.Fill2 RESD 4 ; Alignment filler (offset 0)
-.one RESQ 1 ; 1.0 (offset 16)
-.Instset RESD 1 ; Instruction set (offset 24)
-.M4 RESD 1 ; x[n-4] (offset 28)
-.M3 RESD 1 ; x[n-3] (aligned) (offset 32)
-.M2 RESD 1 ; x[n-2] (offset 36)
-.M1 RESD 1 ; x[n-1] (offset 40)
-.M0 RESD 1 ; x[n] (offset 44)
-.MC RESD 1 ; Carry (aligned) (offset 48)
-.zero RESD 1 ; Zero-extension of carry (offset 52)
-.RanP1 RESQ 1 ; Double random number in interval [1,2) (offset 56)
-.MF3 RESD 1 ; 2111111111 (aligned) (offset 64)
-.MF2 RESD 1 ; 1492 (offset 68)
-.MF1 RESD 1 ; 1776 (offset 72)
-.MF0 RESD 1 ; 5115 (offset 76)
-endstruc ; CRandomMotherA
-
-MOTHERF0 EQU 5115 ; factor 0
-MOTHERF1 EQU 1776 ; factor 1
-MOTHERF2 EQU 1492 ; factor 2
-MOTHERF3 EQU 2111111111 ; factor 3
-
-
-; ***************************************************************************
-; Definitions for SFMT generator
-; ***************************************************************************
-
-; Choose Mersenne exponent.
-; Higher values give longer cycle length and use more memory:
-; MEXP equ 607
-; MEXP equ 1279
-; MEXP equ 2281
-; MEXP equ 4253
- MEXP equ 11213
-; MEXP equ 19937
-; MEXP equ 44497
-
-%if MEXP == 44497
-SFMT_N equ 348 ; Size of state vector
-SFMT_M equ 330 ; Position of intermediate feedback
-SFMT_SL1 equ 5 ; Left shift of W[N-1], 32-bit words
-SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1 equ 9 ; Right shift of W[M], 32-bit words
-SFMT_SR2 equ 3 ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1 equ 0effffffbH ;first DWORD of AND mask
-; AND mask (four dwords, same format as in the other MEXP branches):
-%define SFMT_MASK 0effffffbH,0dfbebfffH,0bfbf7befH,09ffd7bffH
-; Period certification vector (the macro name was missing, leaving SFMT_PARITY undefined for MEXP = 44497)
-%define SFMT_PARITY 1,0,0a3ac4000H,0ecc1327aH
-
-%elif MEXP == 19937
-SFMT_N equ 156 ; Size of state vector
-SFMT_M equ 122 ; Position of intermediate feedback
-SFMT_SL1 equ 18 ; Left shift of W[N-1], 32-bit words
-SFMT_SL2 equ 1 ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1 equ 11 ; Right shift of W[M], 32-bit words
-SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1 equ 0dfffffefH ;first DWORD of AND mask
-%define SFMT_MASK 0dfffffefH,0ddfecb7fH,0bffaffffH,0bffffff6H
-%define SFMT_PARITY 1,0,0,013c9e684H
-
-%elif MEXP == 11213
-SFMT_N equ 88 ; Size of state vector
-SFMT_M equ 68 ; Position of intermediate feedback
-SFMT_SL1 equ 14 ; Left shift of W[N-1], 32-bit words
-SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1 equ 7 ; Right shift of W[M], 32-bit words
-SFMT_SR2 equ 3 ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1 equ 0effff7fbH ;first DWORD of AND mask
-%define SFMT_MASK 0effff7fbH,0ffffffefH,0dfdfbfffH,07fffdbfdH
-%define SFMT_PARITY 1,0,0e8148000H,0d0c7afa3H
-
-%elif MEXP == 4253
-SFMT_N equ 34 ; Size of state vector
-SFMT_M equ 17 ; Position of intermediate feedback
-SFMT_SL1 equ 20 ; Left shift of W[N-1], 32-bit words
-SFMT_SL2 equ 1 ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1 equ 7 ; Right shift of W[M], 32-bit words
-SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1 equ 09f7bffffH ;first DWORD of AND mask
-%define SFMT_MASK 09f7bffffH,09fffff5fH,03efffffbH,0fffff7bbH
-%define SFMT_PARITY 0a8000001H,0af5390a3H,0b740b3f8H,06c11486dH
-
-%elif MEXP == 2281
-SFMT_N equ 18 ; Size of state vector
-SFMT_M equ 12 ; Position of intermediate feedback
-SFMT_SL1 equ 19 ; Left shift of W[N-1], 32-bit words
-SFMT_SL2 equ 1 ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1 equ 5 ; Right shift of W[M], 32-bit words
-SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1 equ 0bff7ffbfH ;first DWORD of AND mask
-%define SFMT_MASK 0bff7ffbfH,0fdfffffeH,0f7ffef7fH,0f2f7cbbfH
-%define SFMT_PARITY 1,0,0,041dfa600H
-
-%elif MEXP == 1279
-SFMT_N equ 10 ; Size of state vector
-SFMT_M equ 7 ; Position of intermediate feedback
-SFMT_SL1 equ 14 ; Left shift of W[N-1], 32-bit words
-SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1 equ 5 ; Right shift of W[M], 32-bit words
-SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1 equ 0f7fefffdH ;first DWORD of AND mask
-%define SFMT_MASK 0f7fefffdH,07fefcfffH,0aff3ef3fH,0b5ffff7fH
-%define SFMT_PARITY 1,0,0,020000000H
-
-%elif MEXP == 607
-SFMT_N equ 5 ; Size of state vector
-SFMT_M equ 2 ; Position of intermediate feedback
-SFMT_SL1 equ 15 ; Left shift of W[N-1], 32-bit words
-SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1 equ 13 ; Right shift of W[M], 32-bit words
-SFMT_SR2 equ 3 ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1 equ 0fdff37ffH ;first DWORD of AND mask
-%define SFMT_MASK 0fdff37ffH,0ef7f3f7dH,0ff777b7dH,07ff7fb2fH
-%define SFMT_PARITY 1,0,0,05986f054H
-
-%ELSE
-%error MEXP must have one of the predefined values
-%ENDIF
-
-STRUC CRandomSFMTA
-.Fill3 RESD 4 ; Alignment filler (16 bytes)
-
-; Parameters for Mother-Of-All generator:
-.M3: RESD 1 ; x[n-3] (aligned) (offset 16)
- RESD 1 ; unused filler to fit the pmuludq instruction
-.M2: RESD 1 ; x[n-2]
- RESD 1 ; unused filler to fit the pmuludq instruction
-.M1: RESD 1 ; x[n-1]
- RESD 1 ; unused filler to fit the pmuludq instruction
-.M0: RESD 1 ; x[n]
-.MC: RESD 1 ; Carry (zero-extends into one)
-.one: RESQ 1 ; 1.0 (low dword = zero-extension of carry) (aligned) (offset 48)
-.TempRan: RESQ 1 ; Temporary random number
-.MF3: RESD 1 ; 2111111111 (aligned) (offset 64)
-.Instset: RESD 1 ; Instruction set
-.MF2: RESD 1 ; 1492 (MF3,MF2,MF1,MF0 interleaved with other variables to fit the pmuludq instruction)
- RESD 1 ; Filler (may be used for read-only parameter, but not for read/write parameter)
-.MF1: RESD 1 ; 1776
- RESD 1 ; Filler (may be used for read-only parameter, but not for read/write parameter)
-.MF0: RESD 1 ; 5115
- RESD 1 ; Filler (may be used for read-only parameter, but not for read/write parameter)
-
-; Parameters for IRandomX:
-.LASTINTERVAL: RESD 1 ; Last interval length for IRandomX
-.RLIMIT: RESD 1 ; Rejection limit used by IRandomX
-
-; Parameters for SFMT generator:
-.USEMOTHER: RESD 1 ; 1 if combine with Mother-Of-All generator
-.IX: RESD 1 ; Index into state buffer for SFMT
-
-.AMASK: RESD 4 ; AND mask (aligned) (offset 112)
-.STATE: RESD SFMT_N*4 ; State vector (aligned) (offset 128; SFMT_N groups of 4 dwords)
-endstruc ; CRandomSFMTA
-
-
-; Load offset of TARGET (macro argument %1) into ecx. Use position-independent method if necessary
-%macro LOADOFFSET2ECX 1
-%IFNDEF POSITIONINDEPENDENT
- mov ecx, %1
-%ELSE
- ; get position-independent address of TARGET
- call get_thunk_ecx ; get_thunk_ecx is not defined in this file; presumably returns its return address in ecx - TODO confirm
- add ecx, %1 - $
-%ENDIF
-%endmacro
-
-; Load offset of TARGET (macro argument %1) into edi. Use position-independent method if necessary
-%macro LOADOFFSET2EDI 1
-%IFNDEF POSITIONINDEPENDENT
- mov edi, %1
-%ELSE
- ; get position-independent address of TARGET
- call get_thunk_edi ; get_thunk_edi is not defined in this file; presumably returns its return address in edi - TODO confirm
- add edi, %1 - $
-%ENDIF
-%endmacro
-
-
-; ***************************************************************************
-; Define registers used for function parameters, used in 64-bit mode only
-; ***************************************************************************
-
-%IFDEF WINDOWS
- %define par1 rcx
- %define par2 rdx
- %define par3 r8
- %define par4 r9
- %define par5 qword [rsp+32+8] ; stack offset including shadow space
- %define par1d ecx
- %define par2d edx
- %define par3d r8d
- %define par4d r9d
- %define par5d dword [rsp+32+8]
-%ENDIF
-
-%IFDEF UNIX
- %define par1 rdi
- %define par2 rsi
- %define par3 rdx
- %define par4 rcx
- %define par5 r8
- %define par1d edi
- %define par2d esi
- %define par3d edx
- %define par4d ecx
- %define par5d r8d
-%ENDIF
diff --git a/contrib/libs/asmlib/rdtsc64.asm b/contrib/libs/asmlib/rdtsc64.asm
deleted file mode 100644
index 42a0e23203..0000000000
--- a/contrib/libs/asmlib/rdtsc64.asm
+++ /dev/null
@@ -1,53 +0,0 @@
-%include "defs.asm"
-
-; RDTSC64.ASM
-;
-; Author: Agner Fog
-; Date created: 2003
-; Last modified: 2008-10-16
-; Description:
-;
-; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-global ReadTSC: function
-
-SECTION .text align=16
-
-; ********** ReadTSC function **********
-; C++ prototype:
-; extern "C" __int64 ReadTSC (void);
-
-; This function returns the value of the time stamp counter, which counts
-; clock cycles. To count how many clock cycles a piece of code takes, call
-; ReadTSC before and after the code to measure and calculate the difference.
-
-; The number of clock cycles taken by the ReadTSC function itself is approximately:
-; Core 2: 730
-; Pentium 4: 700
-; Pentium II and Pentium III: 225
-; AMD Athlon 64, Opteron: 126
-; Does not work on 80386 and 80486.
-
-; Note that clock counts may not be fully reproducible on Intel Core and
-; Core 2 processors because the clock frequency can change. More reliable
-; instruction timings are obtained with the performance monitor counter
-; for "core clock cycles". This requires a kernel mode driver as the one
-; included with www.agner.org/optimize/testp.zip.
-
-; ReadTSC: returns the 64-bit time stamp counter in rax.
-; Takes no parameters. rbx is saved and restored; rcx and rdx are overwritten
-; (cpuid writes eax, ebx, ecx and edx). A cpuid is executed both before and
-; after rdtsc so that the counter read is fenced by serializing instructions.
-ReadTSC:
- push rbx ; ebx is modified by cpuid
- sub eax, eax ; 0
- cpuid ; serialize
- rdtsc ; read time stamp counter into edx:eax
- shl rdx, 32
- or rax, rdx ; combine into 64 bit register
- push rax ; save result, the second cpuid overwrites eax and edx
- sub eax, eax
- cpuid ; serialize
- pop rax ; return value
- pop rbx
- ret
-;ReadTSC ENDP
diff --git a/contrib/libs/asmlib/round64.asm b/contrib/libs/asmlib/round64.asm
deleted file mode 100644
index 5ed55c53c6..0000000000
--- a/contrib/libs/asmlib/round64.asm
+++ /dev/null
@@ -1,40 +0,0 @@
-%include "defs.asm"
-
-; ROUND64.ASM
-
-; Author: Agner Fog
-; Date created: 2007-06-15
-; Last modified: 2008-10-16
-; Description:
-; Round function
-
-; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-global RoundD: function
-global RoundF: function
-
-
-SECTION .text align=16
-
-; ********** round function **********
-; C++ prototype:
-; extern "C" int RoundD (double x);
-; extern "C" int RoundF (float x);
-
-; This function converts a single or double precision floating point number
-; to an integer, rounding to nearest or even. Does not check for overflow.
-; This function is much faster than the default conversion method in C++
-; which uses truncation.
-
-; RoundD: convert the double in xmm0 to a 32-bit integer returned in eax.
-; cvtsd2si rounds according to the current MXCSR rounding mode, so the
-; "nearest or even" behaviour described above holds only while MXCSR is left
-; at its default setting. No overflow check is made.
-RoundD:
- cvtsd2si eax, xmm0 ; Round xmm0 to eax
- ret
-;RoundD ENDP
-
-; RoundF: convert the float in xmm0 to a 32-bit integer returned in eax.
-; cvtss2si rounds according to the current MXCSR rounding mode, so the
-; "nearest or even" behaviour described above holds only while MXCSR is left
-; at its default setting. No overflow check is made.
-RoundF:
- cvtss2si eax, xmm0 ; Round xmm0 to eax
- ret
-;RoundF ENDP
diff --git a/contrib/libs/asmlib/sfmt64.asm b/contrib/libs/asmlib/sfmt64.asm
deleted file mode 100644
index 3ca3cedca0..0000000000
--- a/contrib/libs/asmlib/sfmt64.asm
+++ /dev/null
@@ -1,889 +0,0 @@
-%include "defs.asm"
-
-; ----------------------------- SFMT64.ASM ---------------------------
-; Author: Agner Fog
-; Date created: 2008-11-01
-; Last modified: 2013-09-13
-; Project: randoma library of random number generators
-; Source URL: www.agner.org/random
-; Description:
-; Random number generator of type SIMD-oriented Fast Mersenne Twister (SFMT)
-; (Mutsuo Saito and Makoto Matsumoto: "SIMD-oriented Fast Mersenne Twister:
-; a 128-bit Pseudorandom Number Generator", Monte Carlo and Quasi-Monte
-; Carlo Methods 2006, Springer, 2008, pp. 607-622).
-;
-; 64-bit mode version for x86-64 compatible microprocessors.
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-; ----------------------------------------------------------------------
-
-default rel
-
-global SFMTRandomInit, SFMTRandomInitByArray, SFMTBRandom, SFMTRandom
-global SFMTRandomL, SFMTIRandom, SFMTIRandomX, SFMTgenRandomInit
-global SFMTgenRandomInitByArray, SFMTgenRandom, SFMTgenRandomL, SFMTgenIRandom
-global SFMTgenIRandomX, SFMTgenBRandom
-
-extern InstructionSet
-
-; structure definition and constants:
-%INCLUDE "randomah.asi"
-
-
-section .data
-align 16
-; Data for single instance of random number generator
-; (used by the single-threaded SFMTgen* entry points further down)
-SFMTInstance: ISTRUC CRandomSFMTA
-; Size of structure
-IEND
-; SFMTSize is also the minimum ThisSize accepted by SFMTRandomInit and
-; SFMTRandomInitByArray for caller-supplied buffers
-SFMTSize equ $-SFMTInstance
-
-
-align 16
-; Initialization constants for Mother-Of-All:
-; InitAndPeriod copies these 32 bytes as two 16-byte blocks, to the structure
-; members starting at MF3 and at MF1; every second dword is a zero filler.
-InitMother DD 2111111111, 0, 1492, 0, 1776, 0, 5115, 0
-; Initialization Mask for SFMT:
-; (read as one 16-byte block by InitAndPeriod and stored into AMASK)
-InitMask DD SFMT_MASK
-; Period certification vector for SFMT:
-; (read as one 16-byte block by InitAndPeriod)
-InitParity DD SFMT_PARITY
-
-
-SECTION .CODE align=16 ; code segment
-
-
-; ---------------------------------------------------------------
-; Thread-safe static link versions for SFMT
-; ---------------------------------------------------------------
-
-; extern "C" void SFMTRandomInit(void * Pthis, int ThisSize, int seed, int IncludeMother = 0);
-; Parameters:
-; par1 = Pthis
-; par2d = ThisSize
-; par3d = seed
-; par4d = IncludeMother
-
-; SFMTRandomInit: seed a caller-supplied CRandomSFMTA structure from one
-; 32-bit seed. Fills STATE and the five Mother-Of-All words M0, M1, M2, M3, MC
-; using InitSubf0, then calls InitAndPeriod for the remaining setup.
-; Jumps to Error (which divides by zero) if ThisSize < SFMTSize.
-SFMTRandomInit:
- cmp par2d, SFMTSize
- jb Error ; Error exit if buffer too small
- push rbx
-
- ; Align by 16. Will overlap part of Fill if Pthis unaligned
- and par1, -16
-
- xor eax, eax
- test par4d, par4d ; IncludeMother
- setnz al ; convert any nonzero value to 1
- ; Store USEMOTHER
- mov [par1+CRandomSFMTA.USEMOTHER], eax
-
- mov eax, par3d ; seed
- xor ebx, ebx ; loop counter i
- jmp L002 ; go into seeding loop
-
-L001: ; seeding loop for SFMT
- ; y = factor * (y ^ (y >> 30)) + (++i);
- call InitSubf0 ; randomization subfunction
-L002: mov [par1+rbx*4+CRandomSFMTA.STATE],eax ; initialize state
- cmp ebx, SFMT_N*4 - 1
- jb L001 ; InitSubf0 has already incremented ebx
-
- ; Put 5 more values into Mother-Of-All generator
- call InitSubf0
- mov [par1+CRandomSFMTA.M0], eax
- call InitSubf0
- mov [par1+CRandomSFMTA.M1], eax
- call InitSubf0
- mov [par1+CRandomSFMTA.M2], eax
- call InitSubf0
- mov [par1+CRandomSFMTA.M3], eax
- call InitSubf0
- mov [par1+CRandomSFMTA.MC], eax
-
- ; more initialization and period certification
- call InitAndPeriod
-
- pop rbx
- ret
-;SFMTRandomInit ENDP
-
-; Shared error exit, reached by jumps from SFMTRandomInit,
-; SFMTRandomInitByArray (buffer too small) and SFMT_Generate (not
-; initialized). It deliberately executes a division by zero to raise a
-; CPU exception; the ret is only reached if execution is resumed after it.
-Error: ; Error exit
- xor eax, eax
- div eax ; Divide by 0
- ret
-
-; Subfunction used by SFMTRandomInit
-; One step of the seeding recurrence; all arithmetic is 32-bit and wraps.
-InitSubf0: ; private
-; y = 1812433253 * (y ^ (y >> 30)) + (++i);
-; input parameters:
-; eax = y
-; ebx = i
-; output:
-; eax = new y
-; ebx = i+1
-; edx modified
- mov edx, eax ; keep a copy of y
- shr eax, 30
- xor eax, edx ; y ^ (y >> 30)
- imul eax, 1812433253
- inc ebx ; ++i
- add eax, ebx
- ret
-;InitSubf0 endp
-
-; Subfunction used by SFMTRandomInitByArray
-; Mixing function "func1" of the second seeding loop; 32-bit wrapping math.
-InitSubf1: ; private
-; r = 1664525U * (r ^ (r >> 27));
-; input parameters:
-; eax = r
-; output:
-; eax = new r
-; r10 modified
- mov r10d, eax ; keep a copy of r
- shr eax, 27
- xor eax, r10d ; r ^ (r >> 27)
- imul eax, 1664525
- ret
-;InitSubf1 endp
-
-; Subfunction used by SFMTRandomInitByArray
-; Mixing function "func2" of the third seeding loop and of the Mother state
-; initialization; 32-bit wrapping math.
-InitSubf2: ; private
-; r = 1566083941U * (r ^ (r >> 27));
-; input parameters:
-; eax = r
-; output:
-; eax = new r
-; r10 modified
- mov r10d, eax ; keep a copy of r
- shr eax, 27
- xor eax, r10d ; r ^ (r >> 27)
- imul eax, 1566083941
- ret
-;InitSubf2 endp
-
-
-; Subfunction for initialization and period certification, except seeding
-; par1 = aligned pointer to CRandomSFMTA
-; Called after STATE, the Mother words and USEMOTHER have been filled in.
-; Copies the constant tables into the structure, records the instruction set,
-; applies the period certification to STATE, optionally advances the
-; Mother-Of-All generator once, and generates the first block of output.
-; rbx is preserved; eax, edx and xmm0-xmm3 are modified.
-InitAndPeriod: ; private
- push rbx
-
- ; initialize constants for Mother-Of-All
- movaps xmm0, oword [InitMother]
- movaps oword [par1+CRandomSFMTA.MF3], xmm0
- movaps xmm0, oword [InitMother+16]
- movaps oword [par1+CRandomSFMTA.MF1], xmm0
-
- ; initialize constants for SFMT
- movaps xmm0, oword [InitMask]
- movaps oword [par1+CRandomSFMTA.AMASK], xmm0
-
- ; initialize various variables
- xor eax, eax
- ; low dword 0 and high dword 3FF00000H form the double constant 1.0
- mov dword [par1+CRandomSFMTA.one], eax
- mov dword [par1+4+CRandomSFMTA.one], 3FF00000H
- mov dword [par1+CRandomSFMTA.LASTINTERVAL], eax
-
- ; get instruction set
- ; par1 is saved around the call because it is a caller-saved register
- push par1
- call InstructionSet
- pop par1
- mov [par1+CRandomSFMTA.Instset], eax
-
- ; Period certification
- ; Compute parity of STATE[0-3] & InitParity (the first 16 bytes of STATE)
- movaps xmm1, oword [par1+CRandomSFMTA.STATE]
- andps xmm1, oword [InitParity]
- movhlps xmm2, xmm1 ; high qword
- xorps xmm1, xmm2 ; xor two qwords
- pshufd xmm2, xmm1, 1 ; high dword
- xorps xmm1, xmm2 ; xor two dwords
- movd eax, xmm1 ; do rest of xor in eax
- mov edx, eax
- shr eax, 16
- xor eax, edx ; xor two words
- xor al, ah ; xor two bytes
- ; the parity flag now reflects the parity of all 128 masked bits
- jpo L008 ; parity odd: period OK
-
- ; parity even: period not OK
- ; Find a nonzero dword in period certification vector
- xor ebx, ebx ; loop counter
- lea rdx, [InitParity]
-L005: mov eax, [rdx+rbx*4] ; InitParity[i]
- test eax, eax
- jnz L006
- inc ebx
- ; assume that there is a nonzero dword in InitParity
- ; (there is no upper bound on this search if that assumption is false)
- jmp L005 ; loop until nonzero found
-
-L006: ; find first nonzero bit in eax
- bsf edx, eax
- ; flip the corresponding bit in STATE
- btc [par1+rbx*4+CRandomSFMTA.STATE], edx
-
-L008: cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
- je L009
- call Mother_Next ; Make first random number ready
-
-L009: ; Generate first random numbers and set IX = 0
- call SFMT_Generate
- pop rbx
- ret
-;InitAndPeriod endp
-
-
-; extern "C" void SFMTRandomInitByArray
-; (void * Pthis, int ThisSize, int const seeds[], int NumSeeds, int IncludeMother = 0);
-; // Seed by more than 32 bits
-; Seeds a caller-supplied CRandomSFMTA structure from an array of 32-bit
-; seeds in four passes over the state (see the numbered loops below), then
-; calls InitAndPeriod. Jumps to Error (which divides by zero) if
-; ThisSize < SFMTSize. rbx and rbp are preserved.
-SFMTRandomInitByArray:
-; Parameters
-; par1 = Pthis
-; par2d = ThisSize
-; par3 = seeds
-; par4d = NumSeeds
-; par5d = IncludeMother
-
-; define constants:
-SFMT_SIZE equ SFMT_N*4 ; number of 32-bit integers in state
-
-%IF SFMT_SIZE >= 623
- SFMT_LAG equ 11
-%ELIF SFMT_SIZE >= 68
- SFMT_LAG equ 7
-%ELIF SFMT_SIZE >= 39
- SFMT_LAG equ 5
-%ELSE
- SFMT_LAG equ 3
-%ENDIF
-
-SFMT_MID equ ((SFMT_SIZE - SFMT_LAG) / 2)
-
- ; par5d must be read before the pushes below: under WINDOWS it is an
- ; rsp-relative stack operand that is only valid at function entry
- xor eax, eax
- cmp par5d, eax ; IncludeMother (parameter is on stack if windows)
- setnz al ; convert any nonzero value to 1
-
- push rbx
- push rbp
-
- cmp par2d, SFMTSize ; ThisSize
- ; rbx/rbp are still on the stack here; Error raises a divide exception
- jb Error ; Error exit if buffer too small
-
- ; Align by 16. Will overlap part of Fill if Pthis unaligned
- and par1, -16
-
- ; Store USEMOTHER
- mov [par1+CRandomSFMTA.USEMOTHER], eax
-
-; 1. loop: Fill state vector with random numbers from NumSeeds
-; r = NumSeeds;
-; for (i = 0; i < SFMT_N*4; i++) {
-; r = factor * (r ^ (r >> 30)) + i;
-; sta[i] = r;}
-
- mov eax, par4d ; r = NumSeeds
- xor ebx, ebx ; i
- ; par2d (ThisSize) is no longer needed and is reused as scratch
-L100: mov par2d, eax
- shr eax, 30
- xor eax, par2d
- imul eax, 1812433253
- add eax, ebx
- mov [par1+rbx*4+CRandomSFMTA.STATE], eax
- inc ebx
- cmp ebx, SFMT_SIZE
- jb L100
-
- ; count = max(NumSeeds,size-1)
- mov eax, SFMT_SIZE - 1
- mov r11d, par4d ; NumSeeds
- cmp r11d, eax
- cmovb r11d, eax ; unsigned compare
-
-; 2. loop: Fill state vector with random numbers from seeds[]
-; for (i = 1, j = 0; j < count; j++) {
-; r = func1(sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size]);
-; sta[(i + mid) % size] += r;
-; if (j < NumSeeds) r += seeds[j]
-; r += i;
-; sta[(i + mid + lag) % size] += r;
-; sta[i] = r;
-; i = (i + 1) % size;
-; }
- ; register use:
- ; par1 = Pthis
- ; par2 = j
- ; par3 = seeds
- ; par4 = NumSeeds
- ; eax = r
- ; ebx = i
- ; ebp = (i + mid) % size, (i + mid + lag) % size
- ; r10 = (i + size - 1) % size
- ; r11 = count
-
- xor par2d, par2d ; j = 0
- lea ebx, [par2+1] ; i = 1
-
- ; each "% size" below is done with one compare and conditional subtract,
- ; which is sufficient because the operand is always below 2*size
-L101: ; r = sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size];
- mov eax, [par1+rbx*4+CRandomSFMTA.STATE] ; sta[i]
- lea ebp, [rbx+SFMT_MID]
- cmp ebp, SFMT_SIZE
- jb L102
- sub ebp, SFMT_SIZE
-L102: xor eax, [par1+rbp*4+CRandomSFMTA.STATE] ; sta[(i + mid) % size]
- lea r10d, [rbx+SFMT_SIZE-1]
- cmp r10d, SFMT_SIZE
- jb L103
- sub r10d, SFMT_SIZE
-L103: xor eax, [par1+r10*4+CRandomSFMTA.STATE] ; sta[(i + size - 1) % size]
-
- ; r = func1(r) = (r ^ (r >> 27)) * 1664525U;
- call InitSubf1
-
- ; sta[(i + mid) % size] += r;
- add [par1+rbp*4+CRandomSFMTA.STATE], eax
-
- ; if (j < NumSeeds) r += seeds[j]
- cmp par2d, par4d
- jnb L104
- add eax, [par3+par2*4]
-L104:
- ; r += i;
- add eax, ebx
-
- ; sta[(i + mid + lag) % size] += r;
- lea ebp, [rbx+SFMT_MID+SFMT_LAG]
- cmp ebp, SFMT_SIZE
- jb L105
- sub ebp, SFMT_SIZE
-L105: add [par1+rbp*4+CRandomSFMTA.STATE], eax
-
- ;sta[i] = r;
- mov [par1+rbx*4+CRandomSFMTA.STATE], eax
-
- ; i = (i + 1) % size;
- inc ebx
- cmp ebx, SFMT_SIZE
- jb L106
- sub ebx, SFMT_SIZE
-L106:
- ; j++, loop while j < count
- inc par2d
- cmp par2d, r11d
- jb L101
-
-; 3. loop: Randomize some more
-; for (j = 0; j < size; j++) {
-; r = func2(sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]);
-; sta[(i + mid) % size] ^= r;
-; r -= i;
-; sta[(i + mid + lag) % size] ^= r;
-; sta[i] = r;
-; i = (i + 1) % size;
-; }
- ; j = 0
- ; (i in ebx carries over from the end of the 2. loop)
- xor par2d, par2d
-
-L110: ; r = sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]
- mov eax, [par1+rbx*4+CRandomSFMTA.STATE] ; sta[i]
- lea ebp, [rbx+SFMT_MID]
- cmp ebp, SFMT_SIZE
- jb L111
- sub ebp, SFMT_SIZE
-L111: add eax, [par1+rbp*4+CRandomSFMTA.STATE] ; sta[(i + mid) % size]
- lea r10d, [rbx+SFMT_SIZE-1]
- cmp r10d, SFMT_SIZE
- jb L112
- sub r10d, SFMT_SIZE
-L112: add eax, [par1+r10*4+CRandomSFMTA.STATE] ; sta[(i + size - 1) % size]
-
- ; r = func2(r) = (x ^ (x >> 27)) * 1566083941U;
- call InitSubf2
-
- ; sta[(i + mid) % size] ^= r;
- xor [par1+rbp*4+CRandomSFMTA.STATE], eax
-
- ; r -= i;
- sub eax, ebx
-
- ; sta[(i + mid + lag) % size] ^= r;
- lea ebp, [rbx+SFMT_MID+SFMT_LAG]
- cmp ebp, SFMT_SIZE
- jb L113
- sub ebp, SFMT_SIZE
-L113: xor [par1+rbp*4+CRandomSFMTA.STATE], eax
-
- ; sta[i] = r;
- mov [par1+rbx*4+CRandomSFMTA.STATE], eax
-
- ; i = (i + 1) % size;
- inc ebx
- cmp ebx, SFMT_SIZE
- jb L114
- sub ebx, SFMT_SIZE
-L114:
- ; j++, loop while j < size
- inc par2d
- cmp par2d, SFMT_SIZE
- jb L110
-
- ; if (UseMother) {
- cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
- jz L120
-
-; 4. loop: Initialize MotherState
-; for (j = 0; j < 5; j++) {
-; r = func2(r) + j;
-; MotherState[j] = r + sta[2*j];
-; }
- ; the loop is fully unrolled; eax still holds r from the 3. loop and
- ; par2d is used as scratch for each MotherState value
- call InitSubf2
- mov par2d, [par1+CRandomSFMTA.STATE]
- add par2d, eax
- mov [par1+CRandomSFMTA.M0], par2d
- call InitSubf2
- inc eax
- mov par2d, [par1+8+CRandomSFMTA.STATE]
- add par2d, eax
- mov [par1+CRandomSFMTA.M1], par2d
- call InitSubf2
- add eax, 2
- mov par2d, [par1+16+CRandomSFMTA.STATE]
- add par2d, eax
- mov [par1+CRandomSFMTA.M2], par2d
- call InitSubf2
- add eax, 3
- mov par2d, [par1+24+CRandomSFMTA.STATE]
- add par2d, eax
- mov [par1+CRandomSFMTA.M3], par2d
- call InitSubf2
- add eax, 4
- mov par2d, [par1+32+CRandomSFMTA.STATE]
- add par2d, eax
- mov [par1+CRandomSFMTA.MC], par2d
-
-L120: ; More initialization and period certification
- call InitAndPeriod
-
- pop rbp
- pop rbx
- ret
-;SFMTRandomInitByArray ENDP
-
-
-Mother_Next: ; private
-; Internal procedure: advance Mother-Of-All generator
-; The random value is in M0
-; par1 = aligned pointer to structure CRandomSFMTA
-; eax, par1, xmm0 unchanged
-; xmm1, xmm2, xmm3 are modified
-; The old M3..M0 are first shifted one position up (M3=M2, M2=M1, M1=M0);
-; the sum of the four 32x32->64 bit products plus the old carry MC is then
-; stored as one qword at M0, which rewrites M0 and the dword following it.
-
- movdqa xmm1, oword [par1+CRandomSFMTA.M3] ; load M3,M2
- movdqa xmm2, oword [par1+CRandomSFMTA.M1] ; load M1,M0
- movhps qword [par1+CRandomSFMTA.M3], xmm1 ; M3=M2
- movq qword [par1+CRandomSFMTA.M2], xmm2 ; M2=M1
- movhps qword [par1+CRandomSFMTA.M1], xmm2 ; M1=M0
- pmuludq xmm1, oword [par1+CRandomSFMTA.MF3] ; M3*MF3, M2*MF2
- pmuludq xmm2, oword [par1+CRandomSFMTA.MF1] ; M1*MF1, M0*MF0
- paddq xmm1, xmm2 ; P3+P1, P2+P0
- movhlps xmm2, xmm1 ; Get high qword
- movq xmm3, qword [par1+CRandomSFMTA.MC] ; +carry
- paddq xmm1, xmm3
- paddq xmm1, xmm2 ; P0+P1+P2+P3
- movq qword [par1+CRandomSFMTA.M0], xmm1 ; Store new M0 and carry
- ret
-;Mother_Next endp
-
-
-align 16
-SFMT_Generate: ; private
-; void CRandomSFMT::Generate() {
-; Fill state array with new random numbers
-; Recomputes all SFMT_N 128-bit words of STATE in place and resets IX to 0.
-; Jumps to Error if the first dword of AMASK is not SFMT_MASK1, i.e. if the
-; structure was never initialized. rbx is preserved; eax, edx and xmm0-xmm3
-; are modified.
-
- push rbx
-
- ; register use
- ; par1 = Pthis (rcx or rdi)
- ; edx = i*16 + offset state
- ; eax, ebx = loop end
- ; xmm1 = r1
- ; xmm2 = r2 = r
- ; xmm0, xmm3 = scratch
-
- ; r1 = state[SFMT_N - 2]; (byte offset (SFMT_N-2)*16)
- ; r2 = state[SFMT_N - 1]; (byte offset (SFMT_N-1)*16)
- movdqa xmm1, oword [par1+(SFMT_N-2)*16+CRandomSFMTA.STATE]
- movdqa xmm2, oword [par1+(SFMT_N-1)*16+CRandomSFMTA.STATE]
- mov edx, CRandomSFMTA.STATE
-
-;static inline __m128i sfmt_recursion(__m128i const &a, __m128i const &b,
-;__m128i const &c, __m128i const &d, __m128i const &mask) {
-; __m128i a1, b1, c1, d1, z1, z2;
-; b1 = _mm_srli_epi32(b, SFMT_SR1);
-; a1 = _mm_slli_si128(a, SFMT_SL2);
-; c1 = _mm_srli_si128(c, SFMT_SR2);
-; d1 = _mm_slli_epi32(d, SFMT_SL1);
-; b1 = _mm_and_si128(b1, mask);
-; z1 = _mm_xor_si128(a, a1);
-; z2 = _mm_xor_si128(b1, d1);
-; z1 = _mm_xor_si128(z1, c1);
-; z2 = _mm_xor_si128(z1, z2);
-; return z2;}
-
-; for (i = 0; i < SFMT_N - SFMT_M; i++) {
-; r = sfmt_recursion(state[i], state[i + SFMT_M], r1, r2, mask);
-; state[i] = r;
-; r1 = r2;
-; r2 = r;
-; }
-
- mov eax, (SFMT_N-SFMT_M)*16 + CRandomSFMTA.STATE ; first loop end
- mov ebx, SFMT_N*16 + CRandomSFMTA.STATE ; second loop end
-
-; first i loop from 0 to SFMT_N - SFMT_M
-; (both loops test their condition at the bottom, so each body runs at
-; least once)
-align 8
-L201: movdqa xmm0, oword [par1+rdx+SFMT_M*16] ; b
- psrld xmm0, SFMT_SR1 ; b1
- pand xmm0, oword [par1+CRandomSFMTA.AMASK] ; b1
- movdqa xmm3, oword [par1+rdx] ; a
- pxor xmm0, xmm3
- pslldq xmm3, SFMT_SL2 ; a1
- psrldq xmm1, SFMT_SR2 ; c1, c = r1
- pxor xmm0, xmm3
- pxor xmm0, xmm1
- movdqa xmm1, xmm2 ; r1 = r2
- pslld xmm2, SFMT_SL1 ; d1, d = r2
- pxor xmm2, xmm0 ; r2 = r
- ; state[i] = r;
- movdqa oword [par1+rdx], xmm2
-
- ; i++ while i < SFMT_N - SFMT_M
- add edx, 16
- cmp edx, eax
- jb L201
-
-;align 16
-L202: ; second i loop from SFMT_N - SFMT_M to SFMT_N - 1
- ; b = state[i + SFMT_M - SFMT_N], which wraps around to the words
- ; already regenerated by the first loop
- movdqa xmm0, oword [par1+rdx+(SFMT_M-SFMT_N)*16]; b
- psrld xmm0, SFMT_SR1 ; b1
- pand xmm0, oword [par1+CRandomSFMTA.AMASK] ; b1
- movdqa xmm3, oword [par1+rdx] ; a
- pxor xmm0, xmm3
- pslldq xmm3, SFMT_SL2 ; a1
- psrldq xmm1, SFMT_SR2 ; c1, c = r1
- pxor xmm0, xmm3
- pxor xmm0, xmm1
- movdqa xmm1, xmm2 ; r1 = r2
- pslld xmm2, SFMT_SL1 ; d1, d = r2
- pxor xmm2, xmm0 ; r2 = r
- ; state[i] = r;
- movdqa oword [par1+rdx], xmm2
-
- ; i++ while i < SFMT_N
- add edx, 16
- cmp edx, ebx
- jb L202
-
- ; Check if initialized
- ; (this check runs after the state has already been rewritten above)
-L208: cmp dword [par1+CRandomSFMTA.AMASK], SFMT_MASK1
- jne Error ; Make error if not initialized
-
- ; ix = 0;
- mov dword [par1+CRandomSFMTA.IX], 0 ; point to start of STATE buffer
- pop rbx
- ret
-;SFMT_Generate endp
-
-
-; extern "C" unsigned int SFMTBRandom(void * Pthis); // Output random bits
-; Returns 32 random bits in eax. par1 is preserved apart from the alignment
-; below, which SFMTIRandomX relies on when calling SFMTBRandom_reg in a loop.
-; edx is modified, and xmm1-xmm3 as well when the Mother-Of-All generator is
-; enabled.
-
-SFMTBRandom: ; generate random bits
- ; Align Pthis by 16. Will overlap part of Fill1 if Pthis unaligned
- and par1, -16
-
-SFMTBRandom_reg: ; Entry for register parameters, used internally
-
-; if (ix >= SFMT_N*4) Generate();
-; (IX is kept as a byte offset into STATE, hence the limit SFMT_N*16 and
-; the increment of 4 below)
- mov edx, [par1+CRandomSFMTA.IX]
- cmp edx, SFMT_N*16
- jnb NeedGenerate
-
-; y = ((uint32_t*)state)[ix++];
- mov eax, dword [par1+rdx+CRandomSFMTA.STATE]
- add edx, 4
- mov [par1+CRandomSFMTA.IX], edx
-
-AfterGenerate:
-; if (UseMother) y += MotherBits();
- cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
- jz NoMother
-
- ; add mother bits
- add eax, [par1+CRandomSFMTA.M0] ; Add Mother random number
- call Mother_Next ; Make next Mother random number ready
-
-NoMother: ; return y;
- ret
-
-NeedGenerate:
- call SFMT_Generate ; generate SFMT_N*4 random dwords
- ; take the first dword of the fresh buffer and point IX at the second
- mov eax, [par1+CRandomSFMTA.STATE]
- mov dword [par1+CRandomSFMTA.IX], 4
- jmp AfterGenerate
-
-;SFMTBRandom ENDP
-
-
-; extern "C" double SFMTRandom (void * Pthis); // Output random float
-; Returns a double in xmm0 with 0.0 <= x < 1.0, built from 64 random bits of
-; which the upper 52 become the mantissa. edx and xmm1 are modified, and
-; xmm2-xmm3 as well when the Mother-Of-All generator is enabled.
-SFMTRandom: ; generate random float with 52 bits resolution
- ; Align Pthis by 16. Will overlap part of Fill1 if Pthis unaligned
- and par1, -16
-
-SFMTRandom_reg: ; internal entry point
-
-; check if there are at least 64 random bits in state buffer
-; if (ix >= SFMT_N*4-1) Generate();
-; (IX is a byte offset, so the limit is SFMT_N*16-4; a single unread dword
-; left at the end of the buffer is discarded)
- mov edx, [par1+CRandomSFMTA.IX]
- cmp edx, SFMT_N*16-4
- jnb L303
-
-L301: ; read 64 random bits
- movq xmm0, qword [par1+rdx+CRandomSFMTA.STATE]
- add edx, 8
- mov [par1+CRandomSFMTA.IX], edx
-
- ; combine with Mother-Of-All generator?
- cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
- jz L302 ; ConvertToFloat
-
- ; add mother bits
- movq xmm1, qword [par1+CRandomSFMTA.M0] ; Mother random number MC and M0
- pshuflw xmm1, xmm1, 01001011B ; Put M0 before MC, and swap the words in MC
- paddq xmm0, xmm1 ; Add SFMT and Mother outputs
- call Mother_Next ; Make next Mother random number ready
-
-L302: ; ConvertToFloat
- psrlq xmm0, 12 ; align with mantissa field of double precision float
- movsd xmm1, [par1+CRandomSFMTA.one] ; 1.0 double precision
- por xmm0, xmm1 ; insert exponent to get 1.0 <= x < 2.0
- subsd xmm0, xmm1 ; subtract 1.0 to get 0.0 <= x < 1.0
- ret ; return value
-
-L303: ; NeedGenerateR
- call SFMT_Generate ; generate SFMT_N*4 random dwords
- xor edx, edx ; read from the start of the fresh buffer
- jmp L301
-
-;SFMTRandom ENDP
-
-
-; extern "C" long double SFMTRandomL (void * Pthis);
-; Returns an 80-bit long double in st(0) with 0.0 <= x < 1.0, built from 64
-; random bits of which the upper 63 are used. xmm0 is additionally set to
-; all ones (see the comment at the end). edx and xmm1 are modified, and
-; xmm2-xmm3 as well when the Mother-Of-All generator is enabled.
-SFMTRandomL: ; generate random float with 63 bits resolution
- ; Align Pthis by 16.
- and par1, -16
-
-SFMTRandomL_reg: ; internal entry point
-
-; check if there are at least 64 random bits in state buffer
-; if (ix >= SFMT_N*4-1) Generate();
-; (IX is a byte offset, so the limit is SFMT_N*16-4)
- mov edx, [par1+CRandomSFMTA.IX]
- cmp edx, SFMT_N*16-4
- jnb L403
-
-L401: ; read 64 random bits
- movq xmm0, qword [par1+rdx+CRandomSFMTA.STATE]
- add edx, 8
- mov [par1+CRandomSFMTA.IX], edx
-
- ; combine with Mother-Of-All generator?
- cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
- jz L402
-
- ; add mother bits
- movq xmm1, qword [par1+CRandomSFMTA.M0] ; Mother random number MC and M0
- pshuflw xmm1, xmm1, 01001011B ; Put M0 before MC, and swap the words in MC
- paddq xmm0, xmm1 ; Add SFMT and Mother outputs
- call Mother_Next ; Make next Mother random number ready
-
-L402: ;ConvertToFloat
- ; build the 10-byte long double in a temporary stack slot:
- ; bytes 0-7 = mantissa with explicit integer bit, bytes 8-9 = exponent
- sub rsp, 16 ; make space for long double
- psrlq xmm0, 1 ; align with mantissa field of long double
- pcmpeqw xmm1, xmm1 ; all 1's
- psllq xmm1, 63 ; create a 1 in bit 63
- por xmm0, xmm1 ; bit 63 is always 1 in long double
- movq qword [rsp], xmm0 ; store mantissa
- mov dword [rsp+8], 3FFFH ; exponent
- fld tword [rsp] ; load long double
- fsub qword [par1+CRandomSFMTA.one] ; subtract 1.0 to get 0.0 <= x < 1.0
- pcmpeqw xmm0, xmm0 ; make a NAN for compilers that don't support long double
- add rsp, 16
- ret ; return value in st(0)
-
-L403: ;NeedGenerateR
- call SFMT_Generate ; generate SFMT_N*4 random dwords
- xor edx, edx ; read from the start of the fresh buffer
- jmp L401
-;SFMTRandomL ENDP
-
-
-; extern "C" int SFMTIRandom (void * Pthis, int min, int max); // Output random integer
-; Returns in eax an integer in the interval min <= x <= max, computed as
-; min + high dword of (random 32 bits * (max - min + 1)).
-; Returns 80000000H if max < min. Random bits are drawn before the
-; interval is checked, so the generator advances even on the error path.
-
-SFMTIRandom:
-; par1 = Pthis
-; par2d = min
-; par3d = max
-
- ; Align Pthis by 16.
- and par1, -16
- push par2 ; save min, max
- push par3
- call SFMTBRandom_reg ; random bits
- pop rdx ; max
- pop rcx ; min
- sub edx, ecx
- ; NOTE(review): signed test; behaviour when max - min overflows 32 bits
- ; has not been analysed here
- jl short WrongInterval ; max < min
- inc edx ; max - min + 1
- mul edx ; multiply random number by interval and truncate
- lea eax, [rdx+rcx] ; add min to high dword of product
- ret
-WrongInterval:
- mov eax, 80000000H ; error exit
- ret
-;SFMTIRandom ENDP
-
-
-; extern "C" int SFMTIRandomX (void * Pthis, int min, int max); // Output random integer
-; Like SFMTIRandom, but uses a rejection loop: random values whose low
-; product dword exceeds RLIMIT are discarded and redrawn. The rejection limit
-; is cached in the structure and recomputed only when the interval length
-; differs from LASTINTERVAL. Returns min if max == min and 80000000H if
-; max < min. rbx is preserved.
-
-SFMTIRandomX:
-; par1 = Pthis
-; par2d = min
-; par3d = max
-
- push rbx
- ; Align Pthis by 16.
- and par1, -16
-
- mov ebx, par3d
- sub ebx, par2d ; max - min
- jle short M30 ; max <= min (signed)
- inc ebx ; interval = max - min + 1
-
- ; if (interval != LastInterval) {
- cmp ebx, [par1+CRandomSFMTA.LASTINTERVAL]
- je M10
- ; need to calculate new rejection limit
- ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
- xor eax, eax ; 0
- lea edx, [eax+1] ; 1
- ; edx:eax = 2^32. interval = 1 cannot reach this point because
- ; max == min was already diverted to M30 by the jle above.
- div ebx ; (would give overflow if interval = 1)
- mul ebx
- dec eax
- mov [par1+CRandomSFMTA.RLIMIT], eax
- mov [par1+CRandomSFMTA.LASTINTERVAL], ebx
-M10: mov ebx, par2d ; save min
-
-M20: ; do { // Rejection loop
- call SFMTBRandom_reg ; random bits (par1 is preserved)
- ; longran = (uint64)BRandom() * interval;
- mul dword [par1+CRandomSFMTA.LASTINTERVAL]
- ; } while (remainder > RLimit);
- cmp eax, [par1+CRandomSFMTA.RLIMIT]
- ja M20
-
- ; return (int32)iran + min
- lea eax, [rbx+rdx]
- pop rbx
- ret
-
-M30: jl M40 ; flags are still those of the sub above
- ; max = min. Return min
- mov eax, par2d
- pop rbx
- ret ; max = min exit
-
-M40: ; max < min: error
- mov eax, 80000000H ; error exit
- pop rbx
- ret
-;SFMTIRandomX ENDP
-
-
-
-; -------------------------------------------------------------------------
-; Single-threaded static link versions for SFMT generator
-; -------------------------------------------------------------------------
-
-; extern "C" void SFMTgenRandomInit(int seed, int IncludeMother = 0);
-; Seeds the static instance SFMTInstance by tail-calling SFMTRandomInit.
-SFMTgenRandomInit:
-; par1d = seed
-; par2d = IncludeMother
-
- ; set up parameters for call SFMTRandomInit
- ; (moved in this order so that no source register is overwritten before
- ; it has been read)
- mov par4d, par2d ; IncludeMother
- mov par3d, par1d ; seed
- mov par2d, SFMTSize ; ThisSize
- lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
- jmp SFMTRandomInit
-;SFMTgenRandomInit ENDP
-
-
-; extern "C" void SFMTgenRandomInitByArray(int const seeds[], int NumSeeds, int IncludeMother = 0);
-; Seeds the static instance SFMTInstance via SFMTRandomInitByArray.
-SFMTgenRandomInitByArray:
-; par1 = seeds
-; par2d = NumSeeds
-; par3d = IncludeMother
-
- ; set up parameters for call SFMTRandomInitByArray
-%IFDEF WINDOWS
- ; the fifth parameter is passed on the stack above 32 bytes of shadow
- ; space, so a real call is needed here instead of a tail jump
- push par3 ; IncludeMother on stack
- sub rsp, 32 ; empty shadow space
- mov par4d, par2d ; NumSeeds
- mov par3, par1 ; seeds
- mov par2d, SFMTSize ; ThisSize
- lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
- call SFMTRandomInitByArray
- add rsp, 40 ; remove shadow space and the pushed parameter
- ret
-%ELSE ; UNIX
- mov par5d, par3d ; IncludeMother in register
- mov par4d, par2d ; NumSeeds
- mov par3, par1 ; seeds
- mov par2d, SFMTSize ; ThisSize
- lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
- jmp SFMTRandomInitByArray
-%ENDIF
-;SFMTgenRandomInitByArray ENDP
-
-
-; extern "C" double SFMTgenRandom();
-; Same as SFMTRandom, applied to the static instance SFMTInstance.
-SFMTgenRandom: ; generate random float with 52 bits resolution
- lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
- jmp SFMTRandom_reg ; continue in SFMTRandom, skipping the pointer alignment
-;SFMTgenRandom ENDP
-
-
-; extern "C" long double SFMTgenRandomL();
-; Same as SFMTRandomL, applied to the static instance SFMTInstance.
-SFMTgenRandomL: ; generate random float with 63 bits resolution
- lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
- jmp SFMTRandomL_reg ; continue in SFMTRandomL, skipping the pointer alignment
-;SFMTgenRandomL ENDP
-
-
-; extern "C" int SFMTgenIRandom (int min, int max);
-; Same as SFMTIRandom, applied to the static instance SFMTInstance.
-SFMTgenIRandom:
- ; shift (min, max) up by one parameter position to make room for Pthis
- mov par3d, par2d
- mov par2d, par1d
- lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
- jmp SFMTIRandom ; continue in SFMTIRandom
-;SFMTgenIRandom ENDP
-
-
-; extern "C" int SFMTgenIRandomX (int min, int max);
-; Same as SFMTIRandomX, applied to the static instance SFMTInstance.
-SFMTgenIRandomX:
- ; shift (min, max) up by one parameter position to make room for Pthis
- mov par3d, par2d
- mov par2d, par1d
- lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
- jmp SFMTIRandomX ; continue in SFMTIRandomX
-;SFMTgenIRandomX ENDP
-
-
-; extern "C" uint32_t SFMTgenBRandom();
-; Same as SFMTBRandom, applied to the static instance SFMTInstance.
-SFMTgenBRandom: ; generate 32 random bits
- lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
- jmp SFMTBRandom_reg ; continue in SFMTBRandom, skipping the pointer alignment
-;SFMTgenBRandom ENDP
-
-;END
diff --git a/contrib/libs/asmlib/strcat64.asm b/contrib/libs/asmlib/strcat64.asm
deleted file mode 100644
index 3c8a247e3e..0000000000
--- a/contrib/libs/asmlib/strcat64.asm
+++ /dev/null
@@ -1,70 +0,0 @@
-%include "defs.asm"
-
-;************************* strcat64.asm ************************************
-; Author: Agner Fog
-; Date created: 2008-07-19
-; Last modified: 2008-10-16
-; Description:
-; Faster version of the standard strcat function:
-; char * strcat(char *dest, const char * src);
-; Copies zero-terminated string from src to end of dest.
-;
-; Overriding standard function strcat:
-; The alias ?OVR_strcat is changed to _strcat in the object file if
-; it is desired to override the standard library function strcat.
-;
-; Optimization:
-; Uses optimized functions A_strlen and A_memcpy.
-;
-; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global A_strcat: function ; Function A_strcat
-global EXP(strcat): function ; ?OVR removed if standard function strcat overridden
-
-; Imported from strlen64.asm
-extern A_strlen
-
-; Imported from memcpy64.asm
-extern A_memcpy
-
-
-SECTION .text align=16
-
-; extern "C" char * A_strcat(char * dest, const char * src) {
-; memcpy(dest+strlen(dest), src, strlen(src)+1);
-; return dest
-; }
-
-; Function entry:
-; A_strcat / strcat: append the zero-terminated string src to dest and
-; return dest in rax. Implemented with two A_strlen calls and one A_memcpy
-; call; dest and src are kept on the stack across the calls.
-A_strcat:
-EXP(strcat):
-
-%IFDEF WINDOWS
-%define Rpar1 rcx ; function parameter 1
-%define Rpar2 rdx ; function parameter 2
-%define Rpar3 r8 ; function parameter 3
-%ENDIF
-%IFDEF UNIX
-%define Rpar1 rdi ; function parameter 1
-%define Rpar2 rsi ; function parameter 2
-%define Rpar3 rdx ; function parameter 3
-%ENDIF
-
- push Rpar1 ; dest
- push Rpar2 ; src
- call A_strlen ; length of dest
- push rax ; strlen(dest)
- ; stack now: [rsp] = strlen(dest), [rsp+8] = src, [rsp+16] = dest
- mov Rpar1, [rsp+8] ; src
- call A_strlen ; length of src
- pop Rpar1 ; strlen(dest)
- pop Rpar2 ; src
- add Rpar1, [rsp] ; dest + strlen(dest)
- lea Rpar3, [rax+1] ; strlen(src)+1
- call A_memcpy ; copy
- pop rax ; return dest
- ret
-
-;A_strcat ENDP
diff --git a/contrib/libs/asmlib/strcpy64.asm b/contrib/libs/asmlib/strcpy64.asm
deleted file mode 100644
index c505c48be7..0000000000
--- a/contrib/libs/asmlib/strcpy64.asm
+++ /dev/null
@@ -1,66 +0,0 @@
-%include "defs.asm"
-
-;************************* strcpy64.asm ************************************
-; Author: Agner Fog
-; Date created: 2008-07-19
-; Last modified: 2011-07-01
-; Description:
-; Faster version of the standard strcpy function:
-; char * A_strcpy(char * dest, const char * src);
-; Copies zero-terminated string from src to dest, including terminating zero.
-;
-; Overriding standard function strcpy:
-; The alias ?OVR_strcpy is changed to _strcpy in the object file if
-; it is desired to override the standard library function strcpy.
-;
-; Optimization:
-; Uses optimized functions A_strlen and A_memcpy. These functions allow
-; calling without proper stack alignment.
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global A_strcpy: function ; Function A_strcpy
-global EXP(strcpy): function ; ?OVR removed if standard function memcpy overridden
-
-; Imported from strlen64.asm
-extern A_strlen
-
-; Imported from memcpy64.asm
-extern A_memcpy
-
-
-SECTION .text align=16
-
-; extern "C" char * A_strcpy(char * dest, const char * src) {
-; return memcpy(dest, src, strlen(src)+1);
-; }
-
-; Function entry:
-; A_strcpy / strcpy: copy the zero-terminated string src, including its
-; terminating zero, to dest. Computes the length with A_strlen and then
-; tail-jumps to A_memcpy, so the value returned is whatever A_memcpy returns.
-A_strcpy:
-EXP(strcpy):
-
-%IFDEF WINDOWS
-%define Rpar1 rcx ; function parameter 1
-%define Rpar2 rdx ; function parameter 2
-%define Rpar3 r8 ; function parameter 3
-%ENDIF
-%IFDEF UNIX
-%define Rpar1 rdi ; function parameter 1
-%define Rpar2 rsi ; function parameter 2
-%define Rpar3 rdx ; function parameter 3
-%ENDIF
-
- push Rpar1 ; dest
- push Rpar2 ; src
- mov Rpar1, Rpar2 ; A_strlen takes the string in parameter 1
- ; (A_strlen does not require stack alignment)
- call A_strlen ; length of src
- lea Rpar3,[rax+1] ; include terminating zero in length
- pop Rpar2 ; src
- pop Rpar1 ; dest
- jmp A_memcpy ; copy and return
-
-;A_strcpy ENDP
diff --git a/contrib/libs/asmlib/stricmp64.asm b/contrib/libs/asmlib/stricmp64.asm
deleted file mode 100644
index c568832b27..0000000000
--- a/contrib/libs/asmlib/stricmp64.asm
+++ /dev/null
@@ -1,86 +0,0 @@
-%include "defs.asm"
-
-;************************* stricmpaz64.asm **********************************
-; Author: Agner Fog
-; Date created: 2008-12-05
-; Last modified: 2011-07-01
-; Source URL: www.agner.org/optimize
-; Project: asmlib.zip
-; Description:
-; Faster version of the standard stricmp or strcasecmp function:
-; int A_stricmp(const char *string1, const char *string2);
-; Compares two zero-terminated strings without case sensitivity.
-; Does not recognize locale-specific characters. A-Z are changed
-; to a-z before comparing, while other upper-case letters are not
-; converted but considered unique.
-;
-; Optimization:
-; SSE4.2 version not implemented because the gain is small.
-;
-; Copyright (c) 2008-2011 GNU General Public License www.gnu.org/licenses/gpl.html
-;******************************************************************************
-
-default rel
-
-global A_stricmp: function ; Function A_stricmp
-
-; ***************************************************************************
-; Define registers used for function parameters, used in 64-bit mode only
-; ***************************************************************************
-
-%IFDEF WINDOWS
- %define par1 rcx ; first parameter
- %define par2 rdx ; second parameter
-%ENDIF
-
-%IFDEF UNIX
- %define par1 rdi ; first parameter
- %define par2 rsi ; second parameter
-%ENDIF
-
-SECTION .text align=16
-
-; extern "C" int A_stricmp(const char *string1, const char *string2);
-
-; A_stricmp: compare two zero-terminated strings, treating A-Z as equal to
-; a-z. Returns in eax zero if the strings are equal, otherwise the difference
-; between the first pair of differing bytes after converting A-Z to lower
-; case (string1 byte minus string2 byte). al/eax, edx, par1 and par2 are
-; modified.
-A_stricmp:
- ; keep only the distance between the strings in par2, so that par1 alone
- ; indexes both: string2[i] is at [par1+par2]
- sub par2, par1
-
-L10: mov al, [par1] ; string1
- cmp al, [par1+par2] ; string2
- jne L20
- inc par1
- test al, al
- jnz L10 ; continue with next byte
-
- ; terminating zero found. Strings are equal
- xor eax, eax
- ret
-
-L20: ; bytes are different. check case
- xor al, 20H ; toggle case
- cmp al, [par1+par2]
- jne L30
- ; possibly differing only by case. Check if a-z
- or al, 20H ; lower case
- sub al, 'a'
- cmp al, 'z'-'a' ; unsigned range check for a-z
- ja L30 ; not a-z
- ; a-z and differing only by case
- inc par1
- jmp L10 ; continue with next byte
-
-L30: ; bytes are different, even after changing case
- ; both bytes are reduced by 'A' below, so the subtraction at L50 still
- ; gives the difference between the two (lower-cased) characters
- movzx eax, byte [par1] ; get original value again
- sub eax, 'A'
- cmp eax, 'Z' - 'A'
- ja L40
- add eax, 20H ; A-Z, make lower case
-L40: movzx edx, byte [par1+par2]
- sub edx, 'A'
- cmp edx, 'Z' - 'A'
- ja L50
- add edx, 20H ; A-Z, make lower case
-L50: sub eax, edx ; subtract to get result
- ret
-
-;A_stricmp END
diff --git a/contrib/libs/asmlib/strlen64.asm b/contrib/libs/asmlib/strlen64.asm
deleted file mode 100644
index ff65c10127..0000000000
--- a/contrib/libs/asmlib/strlen64.asm
+++ /dev/null
@@ -1,86 +0,0 @@
-%include "defs.asm"
-
-;************************** strlen64.asm **********************************
-; Author: Agner Fog
-; Date created: 2008-07-19
-; Last modified: 2008-10-16
-; Description:
-; Faster version of the standard strlen function:
-; size_t strlen(const char * str);
-; Finds the length of a zero-terminated string of bytes, optimized for speed.
-;
-; Overriding standard function strlen:
-; The alias ?OVR_strlen is changed to _strlen in the object file if
-; it is desired to override the standard library function strlen.
-;
-; Calling conventions:
-; Stack alignment is not required. No shadow space or red zone used.
-; Called internally from strcpy and strcat without stack aligned.
-;
-; Optimization:
-; Uses XMM registers to read 16 bytes at a time, aligned.
-; Misaligned parts of the string are read from the nearest 16-bytes boundary
-; and the irrelevant part masked out. It may read both before the begin of
-; the string and after the end, but will never load any unnecessary cache
-; line and never trigger a page fault for reading from non-existing memory
-; pages because it never reads past the nearest following 16-bytes boundary.
-; It may, though, trigger any debug watch within the same 16-bytes boundary.
-;
-; The latest version of this file is available at:
-; www.agner.org/optimize/asmexamples.zip
-; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global A_strlen: function ; Function A_strlen
-global EXP(strlen): function ; ?OVR removed if standard function strlen overridden
-
-
-SECTION .text align=16
-
-; extern "C" size_t strlen (const char * s);
-
-; 64-bit version (Windows and Unix calling conventions):
-A_strlen:
-EXP(strlen):
-
-%IFDEF WINDOWS
- mov rax, rcx ; get pointer to string from rcx
- mov r8, rcx ; copy pointer
-%define Rscopy r8 ; Copy of s
-
-%ELSE ; Unix
- mov rax, rdi ; get pointer to string from rdi
- mov ecx, edi ; copy pointer (lower 32 bits)
-%define Rscopy rdi ; Copy of s
-%ENDIF
-
- ; rax = s, ecx = 32 bits of s
- pxor xmm0, xmm0 ; set to zero
- and ecx, 0FH ; lower 4 bits indicate misalignment
- and rax, -10H ; align pointer by 16
- movdqa xmm1, [rax] ; read from nearest preceding boundary
- pcmpeqb xmm1, xmm0 ; compare 16 bytes with zero
- pmovmskb edx, xmm1 ; get one bit for each byte result
- shr edx, cl ; shift out false bits
- shl edx, cl ; shift back again
- bsf edx, edx ; find first 1-bit
- jnz L2 ; found
-
- ; Main loop, search 16 bytes at a time
-L1: add rax, 10H ; increment pointer by 16
- movdqa xmm1, [rax] ; read 16 bytes aligned
- pcmpeqb xmm1, xmm0 ; compare 16 bytes with zero
- pmovmskb edx, xmm1 ; get one bit for each byte result
- bsf edx, edx ; find first 1-bit
- ; (moving the bsf out of the loop and using test here would be faster for long strings on old processors,
- ; but we are assuming that most strings are short, and newer processors have higher priority)
- jz L1 ; loop if not found
-
-L2: ; Zero-byte found. Compute string length
- sub rax, Rscopy ; subtract start address
- add rax, rdx ; add byte index
- ret
-
-;A_strlen ENDP
diff --git a/contrib/libs/asmlib/substring64.asm b/contrib/libs/asmlib/substring64.asm
deleted file mode 100644
index 235b19a5f5..0000000000
--- a/contrib/libs/asmlib/substring64.asm
+++ /dev/null
@@ -1,75 +0,0 @@
-%include "defs.asm"
-
-;************************* substring64.asm **********************************
-; Author: Agner Fog
-; Date created: 2011-07-18
-; Last modified: 2011-07-18
-; Source URL: www.agner.org/optimize
-; Project: asmlib.zip
-; Description:
-; Makes a substring of a zero-terminated ASCII string
-;
-; C++ prototype:
-; extern "C"
-; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
-; Makes a substring from source, starting at position pos (zero-based) and length
-; len and stores it in the array dest. It is the responsibility of the programmer
-; that the size of the dest array is at least len + 1.
-; The return value is the actual length of the substring. This may be less than
-; len if the length of source is less than pos + len.
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses/gpl.html
-;******************************************************************************
-
-global A_substring: function ; Function _A_substring
-
-extern A_strlen
-extern A_memcpy
-
-SECTION .text
-
-; extern "C"
-; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
-
-%ifdef WINDOWS
-%define par1 rcx ; dest
-%define par2 rdx ; source
-%define par3 r8 ; pos
-%define par4 r9 ; len
-%else ; UNIX
-%define par1 rdi
-%define par2 rsi
-%define par3 rdx
-%define par4 rcx
-%endif
-
-A_substring:
- push par1
- push par2
- push par3
- push par4
- mov par1, par2
- call A_strlen ; rax = strlen(source)
- pop par4
- pop par3
- pop par2
- pop par1
- sub rax, par3 ; max length = strlen(source) - pos
- jbe empty ; strlen(source) <= pos. Return empty string
- cmp rax, par4
- cmova rax, par4 ; min(len, maxlen)
- add par2, par3 ; source + pos = source for memcpy
- mov par3, rax ; length for memcpy
- push rax ; new length
- call A_memcpy
- pop rcx ; new length = return value, rax = dest
- mov byte [rcx+rax], 0 ; terminating zero
- mov rax, rcx ; return new length
- ret
-
-empty: ; return empty string
- xor eax, eax ; return 0
- mov byte [par1], al
- ret
-
-;A_substring END
diff --git a/contrib/libs/asmlib/unalignedisfaster64.asm b/contrib/libs/asmlib/unalignedisfaster64.asm
deleted file mode 100644
index eed68a1398..0000000000
--- a/contrib/libs/asmlib/unalignedisfaster64.asm
+++ /dev/null
@@ -1,188 +0,0 @@
-%include "defs.asm"
-
-;************************* unalignedisfaster64.asm ******************************
-; Author: Agner Fog
-; Date created: 2011-07-09
-; Last modified: 2013-08-30
-; Source URL: www.agner.org/optimize
-; Project: asmlib.zip
-; Language: assembly, NASM/YASM syntax, 64 bit
-;
-; C++ prototype:
-; extern "C" int UnalignedIsFaster(void);
-;
-; Description:
-; This function finds out if unaligned 16-bytes memory read is
-; faster than aligned read followed by an alignment shift (PALIGNR) on the
-; current CPU.
-;
-; Return value:
-; 0: Unaligned read is probably slower than alignment shift
-; 1: Unknown
-; 2: Unaligned read is probably faster than alignment shift
-;
-;
-; C++ prototype:
-; extern "C" int Store256BitIsFaster(void);
-;
-; Description:
-; This function finds out if a 32-bytes memory write is
-; faster than two 16-bytes writes on the current CPU.
-;
-; Return value:
-; 0: 32-bytes memory write is slower or AVX not supported
-; 1: Unknown
-; 2: 32-bytes memory write is faster
-;
-; Copyright (c) 2011 - 2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-;
-; C++ prototype:
-; extern "C" int UnalignedIsFaster(void);
-
-global UnalignedIsFaster: function
-global Store256BitIsFaster: function
-extern CpuType
-extern InstructionSet
-
-
-SECTION .text
-
-UnalignedIsFaster:
-
-%ifdef UNIX
- push 0 ; vendor
- mov rdi, rsp
- push 0 ; family
- mov rsi, rsp
- push 0 ; model
- mov rdx, rsp
-%else ; WINDOWS
- push 0 ; vendor
- mov rcx, rsp
- push 0 ; family
- mov rdx, rsp
- push 0 ; model
- mov r8, rsp
-%endif
- call CpuType ; get vendor, family, model
- pop rdx ; model
- pop rcx ; family
- pop r8 ; vendor
- xor eax, eax ; return value
- dec r8d
- jz Intel
- dec r8d
- jz AMD
- dec r8d
- jz VIA
- ; unknown vendor
- inc eax
- jmp Uend
-
-Intel: ; Unaligned read is faster on Intel Nehalem and later, but not Atom
- ; Nehalem = family 6, model 1AH
- ; Atom = family 6, model 1CH
- ; Netburst = family 0FH
- ; Future models are likely to be family 6, maybe > 6, model > 1C
- cmp ecx, 6
- jb Uend ; old Pentium 1, etc
- cmp ecx, 0FH
- je Uend ; old Netburst architecture
- cmp edx, 1AH
- jb Uend ; earlier than Nehalem
- cmp edx, 1CH
- je Uend ; Intel Atom
- or eax, 2 ; Intel Nehalem and later, except Atom
- jmp Uend
-
-AMD: ; AMD processors:
- ; The PALIGNR instruction is slow on AMD Bobcat but fast on Jaguar
- ; K10/Opteron = family 10H ; Use unaligned
- ; Bobcat = family 14H ; PALIGNR is very slow. Use unaligned
- ; Piledriver = family 15H ; Use unaligned
- ; Jaguar = family 16H ; PALIGNR is fast. Use aligned (aligned is faster in most cases, but not all)
- cmp ecx, 10H ; AMD K8 or earlier: use aligned
- jb Uend
- cmp ecx, 16H ; Jaguar: use aligned
- je Uend
- or eax, 2 ; AMD K10 or later: use unaligned
- jmp Uend
-
-VIA: ; Unaligned read is not faster than PALIGNR on VIA Nano 2000 and 3000
- cmp ecx, 0FH
- jna Uend ; VIA Nano
- inc eax ; Future versions: unknown
- ;jmp Uend
-
-Uend: ret
-
-;UnalignedIsFaster ENDP
-
-
-Store256BitIsFaster:
- call InstructionSet
- cmp eax, 11 ; AVX supported
- jb S90
-%ifdef UNIX
- push 0 ; vendor
- mov rdi, rsp
- push 0 ; family
- mov rsi, rsp
- push 0 ; model
- mov rdx, rsp
-%else ; WINDOWS
- push 0 ; vendor
- mov rcx, rsp
- push 0 ; family
- mov rdx, rsp
- push 0 ; model
- mov r8, rsp
-%endif
- call CpuType ; get vendor, family, model
- pop rdx ; model
- pop rcx ; family
- pop rax ; vendor
-
- cmp eax, 1 ; Intel
- je S_Intel
- cmp eax, 2 ; AMD
- je S_AMD
- cmp eax, 3
- je S_VIA
- jmp S91 ; other vendor, not known
-
-S_Intel:cmp ecx, 6
- jne S92 ; unknown family. possibly future model
- ; model 2AH Sandy Bridge
- ; model 3AH Ivy Bridge
- ; model 3CH Haswell
- ; Sandy Bridge and Ivy Bridge are slightly faster with 128 than with 256 bit moves on large data blocks
- ; Haswell is much faster with 256 bit moves
- cmp edx, 3AH
- jbe S90
- jmp S92
-
-S_AMD: ; AMD
- cmp ecx, 15H ; family 15h = Bulldozer, Piledriver
- ja S92 ; assume future AMD families are faster
- ; family 16H = Jaguar. 256 bit write is slightly faster
- ; model 1 = Bulldozer is a little slower on 256 bit write
- ; model 2 = Piledriver is terribly slow on 256 bit write
- ; assume future models 3-4 are like Bulldozer
- cmp edx, 4
- jbe S90
- jmp S91 ; later models: don't know
-
-S_VIA: jmp S91 ; don't know
-
-S90: xor eax, eax ; return 0
- ret
-
-S91: mov eax, 1 ; return 1
- ret
-
-S92: mov eax, 2 ; return 2
- ret
-
-; Store256BitIsFaster ENDP
diff --git a/contrib/libs/asmlib/ya.make b/contrib/libs/asmlib/ya.make
deleted file mode 100644
index 35baa5a7a2..0000000000
--- a/contrib/libs/asmlib/ya.make
+++ /dev/null
@@ -1,110 +0,0 @@
-LIBRARY()
-
-LICENSE(
- GPL-1.0-or-later AND
- GPL-2.0-only AND
- GPL-3.0-or-later AND
- LGPL-2.0-or-later AND
- LGPL-3.0-only
-)
-
-VERSION(2016-11-16)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
-ORIGINAL_SOURCE(https://www.agner.org/optimize/)
-
-NO_PLATFORM()
-
-SET(_YASM_PREDEFINED_FLAGS_VALUE "")
-
-IF (ARCH_X86_64)
- IF (OS_DARWIN)
- PEERDIR(
- contrib/libs/asmglibc
- )
- ENDIF()
- IF (NOT OS_DARWIN)
- SRCS(
- sfmt64.asm
- mother64.asm
- mersenne64.asm
- )
- ENDIF()
- SRCS(
- debugbreak64.asm
- cachesize64.asm
- divfixedi64.asm
- rdtsc64.asm
- strcat64.asm
- unalignedisfaster64.asm
- strcpy64.asm
- substring64.asm
- strlen64.asm
- cputype64.asm
- memcmp64.asm
- memmove64.asm
- stricmp64.asm
- divfixedv64.asm
- physseed64.asm
- cpuid64.asm
- round64.asm
- memcpy64.asm
- popcount64.asm
- dispatchpatch64.asm
- #instrset64.asm
- procname64.asm
- memset64.asm
- #disabled because of protection violation
- #strcountutf864.asm
- #strcountset64.asm
- #strtouplow64.asm
- #strcmp64.asm
- #strspn64.asm
- #strstr64.asm
- )
-ENDIF()
-
-IF (ARCH_I386)
- SRCS(
- debugbreak32.asm
- cachesize32.asm
- divfixedi32.asm
- rdtsc32.asm
- strcat32.asm
- unalignedisfaster32.asm
- strcpy32.asm
- substring32.asm
- strlen32.asm
- cputype32.asm
- memcmp32.asm
- memmove32.asm
- sfmt32.asm
- stricmp32.asm
- divfixedv32.asm
- physseed32.asm
- cpuid32.asm
- mother32.asm
- round32.asm
- mersenne32.asm
- memcpy32.asm
- popcount32.asm
- dispatchpatch32.asm
- #instrset32.asm
- procname32.asm
- memset32.asm
- #disabled because of protection violation
- #strcountutf832.asm
- #strcountset32.asm
- #strtouplow32.asm
- #strcmp32.asm
- #strspn32.asm
- #strstr32.asm
- )
-ENDIF()
-
-SRCS(
- dummy.c
-)
-
-END()
diff --git a/ydb/apps/ydb/CMakeLists.darwin-x86_64.txt b/ydb/apps/ydb/CMakeLists.darwin-x86_64.txt
index 832d38005c..87e7b9d72a 100644
--- a/ydb/apps/ydb/CMakeLists.darwin-x86_64.txt
+++ b/ydb/apps/ydb/CMakeLists.darwin-x86_64.txt
@@ -19,7 +19,6 @@ target_link_libraries(ydb PUBLIC
contrib-libs-cxxsupp
yutil
library-cpp-cpuid_check
- contrib-libs-asmlib
commands
library-cpp-resource
)
diff --git a/ydb/apps/ydb/CMakeLists.linux-aarch64.txt b/ydb/apps/ydb/CMakeLists.linux-aarch64.txt
index b4b252b29b..619d67ad35 100644
--- a/ydb/apps/ydb/CMakeLists.linux-aarch64.txt
+++ b/ydb/apps/ydb/CMakeLists.linux-aarch64.txt
@@ -20,7 +20,6 @@ target_link_libraries(ydb PUBLIC
contrib-libs-linux-headers
contrib-libs-cxxsupp
yutil
- contrib-libs-asmlib
commands
library-cpp-resource
)
diff --git a/ydb/apps/ydb/CMakeLists.linux-x86_64.txt b/ydb/apps/ydb/CMakeLists.linux-x86_64.txt
index c6722e7f54..ab7446e042 100644
--- a/ydb/apps/ydb/CMakeLists.linux-x86_64.txt
+++ b/ydb/apps/ydb/CMakeLists.linux-x86_64.txt
@@ -21,7 +21,6 @@ target_link_libraries(ydb PUBLIC
contrib-libs-cxxsupp
yutil
library-cpp-cpuid_check
- contrib-libs-asmlib
commands
library-cpp-resource
)
diff --git a/ydb/apps/ydb/CMakeLists.windows-x86_64.txt b/ydb/apps/ydb/CMakeLists.windows-x86_64.txt
index 1919d40b1d..9bba9d0f4b 100644
--- a/ydb/apps/ydb/CMakeLists.windows-x86_64.txt
+++ b/ydb/apps/ydb/CMakeLists.windows-x86_64.txt
@@ -19,7 +19,6 @@ target_link_libraries(ydb PUBLIC
contrib-libs-cxxsupp
yutil
library-cpp-cpuid_check
- contrib-libs-asmlib
commands
library-cpp-resource
)
diff --git a/ydb/apps/ydb/ya.make b/ydb/apps/ydb/ya.make
index b971d714c3..dc11518ba5 100644
--- a/ydb/apps/ydb/ya.make
+++ b/ydb/apps/ydb/ya.make
@@ -6,10 +6,7 @@ SRCS(
main.cpp
)
-DISABLE(USE_ASMLIB)
-
PEERDIR(
- contrib/libs/asmlib
ydb/apps/ydb/commands
)
@@ -17,6 +14,15 @@ RESOURCE(
ydb/apps/ydb/version.txt version.txt
)
+IF (NOT USE_SSE4 AND NOT OPENSOURCE)
+ # contrib/libs/asmglibc cannot be built without SSE4
+ # Replace it with contrib/libs/asmlib which can be built this way.
+ DISABLE(USE_ASMLIB)
+ PEERDIR(
+ contrib/libs/asmlib
+ )
+ENDIF()
+
#
# DON'T ALLOW NEW DEPENDENCIES WITHOUT EXPLICIT APPROVE FROM kikimr-dev@ or fomichev@
#