diff options
author | pnv1 <pnv@ydb.tech> | 2023-04-27 19:15:07 +0300 |
---|---|---|
committer | pnv1 <pnv@ydb.tech> | 2023-04-27 19:15:07 +0300 |
commit | a66c59109292f9e0fb44ede41adfdebe569e4df3 (patch) | |
tree | 906b3d10274afd16e8e70c61ff416bff9075422e | |
parent | 9ca91b40d6f45546e20a646d15590c0cc6cc9778 (diff) | |
download | ydb-a66c59109292f9e0fb44ede41adfdebe569e4df3.tar.gz |
Switch to old asmlib to be able to build ydb cli without sse4
47 files changed, 9203 insertions, 0 deletions
diff --git a/cmake/global_vars.cmake b/cmake/global_vars.cmake index 1a4bd6b922..b03aa1fdc5 100644 --- a/cmake/global_vars.cmake +++ b/cmake/global_vars.cmake @@ -24,6 +24,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_ endif() if(WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + set(YASM_FLAGS -f win64 -D WIN64 -D _x86_64_ -D_YASM_) set(BISON_FLAGS -v) set(RAGEL_FLAGS -L -I ${CMAKE_SOURCE_DIR}/) endif() diff --git a/contrib/libs/CMakeLists.darwin-x86_64.txt b/contrib/libs/CMakeLists.darwin-x86_64.txt index 8f27501754..dbdaed7276 100644 --- a/contrib/libs/CMakeLists.darwin-x86_64.txt +++ b/contrib/libs/CMakeLists.darwin-x86_64.txt @@ -8,6 +8,8 @@ add_subdirectory(antlr3_cpp_runtime) add_subdirectory(apache) +add_subdirectory(asmglibc) +add_subdirectory(asmlib) add_subdirectory(aws-sdk-cpp) add_subdirectory(base64) add_subdirectory(brotli) diff --git a/contrib/libs/CMakeLists.linux-aarch64.txt b/contrib/libs/CMakeLists.linux-aarch64.txt index 80cc88b2df..c67d278e53 100644 --- a/contrib/libs/CMakeLists.linux-aarch64.txt +++ b/contrib/libs/CMakeLists.linux-aarch64.txt @@ -8,6 +8,7 @@ add_subdirectory(antlr3_cpp_runtime) add_subdirectory(apache) +add_subdirectory(asmlib) add_subdirectory(aws-sdk-cpp) add_subdirectory(base64) add_subdirectory(brotli) diff --git a/contrib/libs/CMakeLists.linux-x86_64.txt b/contrib/libs/CMakeLists.linux-x86_64.txt index 797ac4fd05..185d96e891 100644 --- a/contrib/libs/CMakeLists.linux-x86_64.txt +++ b/contrib/libs/CMakeLists.linux-x86_64.txt @@ -8,6 +8,7 @@ add_subdirectory(antlr3_cpp_runtime) add_subdirectory(apache) +add_subdirectory(asmlib) add_subdirectory(aws-sdk-cpp) add_subdirectory(base64) add_subdirectory(brotli) diff --git a/contrib/libs/CMakeLists.windows-x86_64.txt b/contrib/libs/CMakeLists.windows-x86_64.txt index cd2534dc93..96f605c258 100644 --- a/contrib/libs/CMakeLists.windows-x86_64.txt +++ b/contrib/libs/CMakeLists.windows-x86_64.txt @@ -8,6 +8,7 @@ 
add_subdirectory(antlr3_cpp_runtime) add_subdirectory(apache) +add_subdirectory(asmlib) add_subdirectory(aws-sdk-cpp) add_subdirectory(base64) add_subdirectory(brotli) diff --git a/contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt b/contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..e2b4e37fbb --- /dev/null +++ b/contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,13 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(contrib-libs-asmglibc) +target_sources(contrib-libs-asmglibc PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmglibc/memchr.S +) diff --git a/contrib/libs/asmglibc/CMakeLists.txt b/contrib/libs/asmglibc/CMakeLists.txt new file mode 100644 index 0000000000..661b6431cc --- /dev/null +++ b/contrib/libs/asmglibc/CMakeLists.txt @@ -0,0 +1,11 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +endif() diff --git a/contrib/libs/asmglibc/memchr.S b/contrib/libs/asmglibc/memchr.S new file mode 100644 index 0000000000..b0a51115c4 --- /dev/null +++ b/contrib/libs/asmglibc/memchr.S @@ -0,0 +1,330 @@ +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. 
+ Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" + +#ifdef USE_AS_WMEMCHR +# define MEMCHR wmemchr +# define PCMPEQ pcmpeqd +#else +# define MEMCHR memchr +# define PCMPEQ pcmpeqb +#endif + +/* fast SSE2 version with using pmaxub and 64 byte loop */ + + .text +ENTRY(MEMCHR) + movd %esi, %xmm1 + mov %edi, %ecx + +#ifdef USE_AS_WMEMCHR + test %rdx, %rdx + jz L(return_null) + shl $2, %rdx +#else + punpcklbw %xmm1, %xmm1 + test %rdx, %rdx + jz L(return_null) + punpcklbw %xmm1, %xmm1 +#endif + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %ecx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches_1) + sub $16, %rdx + jbe L(return_null) + add $16, %rdi + and $15, %ecx + and $-16, %rdi + add %rcx, %rdx + sub $64, %rdx + jbe L(exit_loop) + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + PCMPEQ %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. 
*/ + bsf %eax, %eax + + sub %rax, %rdx + jbe L(return_null) + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void + possible addition overflow. */ + neg %rcx + add $16, %rcx + sub %rcx, %rdx + jbe L(return_null) + add $16, %rdi + sub $64, %rdx + jbe L(exit_loop) + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + PCMPEQ %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + sub $64, %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + mov %rdi, %rcx + and $-64, %rdi + and $63, %ecx + add %rcx, %rdx + + .p2align 4 +L(align64_loop): + sub $64, %rdx + jbe L(exit_loop) + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + PCMPEQ %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + 
test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + + PCMPEQ 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(exit_loop): + add $32, %edx + jle L(exit_loop_32) + + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) + sub $16, %edx + jle L(return_null) + + PCMPEQ 48(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + add $32, %edx + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + sub $16, %edx + jbe L(return_null) + + PCMPEQ 16(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + add %rdi, %rax + ret + + .p2align 4 +L(matches16_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 16(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches32_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 32(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches48_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret +END(MEMCHR) + +#ifndef USE_AS_WMEMCHR +strong_alias (memchr, __memchr) 
+libc_hidden_builtin_def(memchr) +#endif
\ No newline at end of file diff --git a/contrib/libs/asmglibc/sysdep.h b/contrib/libs/asmglibc/sysdep.h new file mode 100644 index 0000000000..1cfb71673e --- /dev/null +++ b/contrib/libs/asmglibc/sysdep.h @@ -0,0 +1,12 @@ +#if defined(__APPLE__) + #define ENTRY(X) .globl _## X; .align 1<<3; _ ## X: + #define END(X) + #define L(X) L ## X +#else + #define ENTRY(X) .globl X; .type X,@function; .align 1<<4; X: .cfi_startproc; + #define END(X) .cfi_endproc; .size X,.-X; + #define L(X) .L ## X +#endif + +#define libc_hidden_builtin_def(X) +#define strong_alias(X, Y) diff --git a/contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt b/contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..56e892f3a2 --- /dev/null +++ b/contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,192 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(contrib-libs-asmlib) +target_link_libraries(contrib-libs-asmlib PUBLIC + contrib-libs-asmglibc +) +target_sources(contrib-libs-asmlib PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/debugbreak64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cachesize64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedi64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/rdtsc64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcat64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/unalignedisfaster64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcpy64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/substring64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strlen64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cputype64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcmp64.asm + -I + ${CMAKE_BINARY_DIR} + -I + 
${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memmove64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/stricmp64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedv64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/physseed64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cpuid64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/round64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcpy64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/popcount64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dispatchpatch64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/procname64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memset64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) diff --git a/contrib/libs/asmlib/CMakeLists.linux-aarch64.txt b/contrib/libs/asmlib/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..d29b43c90a --- /dev/null 
+++ b/contrib/libs/asmlib/CMakeLists.linux-aarch64.txt @@ -0,0 +1,16 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(contrib-libs-asmlib) +target_link_libraries(contrib-libs-asmlib PUBLIC + contrib-libs-linux-headers +) +target_sources(contrib-libs-asmlib PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c +) diff --git a/contrib/libs/asmlib/CMakeLists.linux-x86_64.txt b/contrib/libs/asmlib/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..e4b9975e9f --- /dev/null +++ b/contrib/libs/asmlib/CMakeLists.linux-x86_64.txt @@ -0,0 +1,216 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(contrib-libs-asmlib) +target_link_libraries(contrib-libs-asmlib PUBLIC + contrib-libs-linux-headers +) +target_sources(contrib-libs-asmlib PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/sfmt64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mother64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mersenne64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/debugbreak64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cachesize64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedi64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/rdtsc64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcat64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/unalignedisfaster64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcpy64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/substring64.asm + -I + ${CMAKE_BINARY_DIR} + -I 
+ ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strlen64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cputype64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcmp64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memmove64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/stricmp64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedv64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/physseed64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cpuid64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/round64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcpy64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/popcount64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dispatchpatch64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) 
+target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/procname64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memset64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) diff --git a/contrib/libs/asmlib/CMakeLists.txt b/contrib/libs/asmlib/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/contrib/libs/asmlib/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/contrib/libs/asmlib/CMakeLists.windows-x86_64.txt b/contrib/libs/asmlib/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..6e1a2adde6 --- /dev/null +++ b/contrib/libs/asmlib/CMakeLists.windows-x86_64.txt @@ -0,0 +1,213 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). 
These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(contrib-libs-asmlib) +target_sources(contrib-libs-asmlib PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/sfmt64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mother64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mersenne64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/debugbreak64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cachesize64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedi64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/rdtsc64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcat64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/unalignedisfaster64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcpy64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) 
+target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/substring64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strlen64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cputype64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcmp64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memmove64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/stricmp64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedv64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/physseed64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cpuid64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/round64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcpy64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/popcount64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + 
PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dispatchpatch64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/procname64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memset64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) diff --git a/contrib/libs/asmlib/cachesize64.asm b/contrib/libs/asmlib/cachesize64.asm new file mode 100644 index 0000000000..c0bce8cf74 --- /dev/null +++ b/contrib/libs/asmlib/cachesize64.asm @@ -0,0 +1,335 @@ +%include "defs.asm" + +;************************* cachesize64.asm *************************************
+; Author: Agner Fog
+; Date created: 2011-07-11
+; Last modified: 2013-08-14
+; Description:
+; Determines the size of the data caches
+;
+; extern "C" size_t DataCacheSize(int level);
+; Input:
+; level: n = 1 - 4: level n data cache
+; 0 = largest level data cache
+; Return value: size in bytes of data cache
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2011-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global DataCacheSize: function
+
+; Imported from cputype64.asm
+extern CpuType ; near. Determine CPU vendor
+
+struc data_layout ; field offsets that the code adds to the address of dataref; must mirror the .data block starting at dataref
+ok: resd 2 ; offset 0: flag dword tested/set by DataCacheSize, plus a second dword so that level1 lands at offset 8
+level1: resq 1 ; offset 8: level 1 data cache size (DataCacheSize reads level n at [dataref + n*8])
+level2: resq 1 ; offset 16: level 2 data cache size
+level3: resq 1 ; offset 24: level 3 data cache size
+level4: resq 1 ; offset 32: level 4 data cache size
+descriptortable: resd 60 ; offset 40: start of the descriptor table. NOTE(review): reserves 60 dwords but descriptortable_ holds 61 four-byte records; harmless if only this offset (not the struc size) is used - confirm
+endstruc
+
+struc descriptor_record ; 4-byte record for table of cache descriptors; one "db key, level, sizem, 2pow" row of descriptortable_
+d_key: resb 1 ; key from cpuid instruction (first byte of each table row)
+d_level: resb 1 ; cache level (1, 2 or 3 in the rows of descriptortable_)
+d_sizem: resb 1 ; size multiplier
+d_2pow: resb 1 ; power of 2. size in bytes = d_sizem << d_2pow, e.g. 3 << 16 = 192 kb
+endstruc
+
+SECTION .data
+
+dataref: ; reference point: base address to which the data_layout offsets are added
+ok_: DD 0, 0 ; first dword is set to 1 by DataCacheSize after a lookup with a valid level (whether detection succeeded or not); second dword pads level1_ to offset 8
+level1_: DQ 0 ; level 1 data cache size in bytes, initially 0
+level2_: DQ 0 ; level 2 data cache size in bytes, initially 0
+level3_: DQ 0 ; level 3 data cache size in bytes, initially 0
+level4_: DQ 0 ; level 4 data cache size in bytes, initially 0
+numlevels equ 4 ; max level; DataCacheSize returns 0 for any level argument above this
+
+; From "Intel Processor Identification and the CPUID Instruction", Application note 485
+descriptortable_: ; table of Intel cache descriptors
+db 0Ah, 1, 1, 13 ; 8 kb L1 data cache
+db 0Ch, 1, 1, 14 ; 16 kb L1 data cache
+db 0Dh, 1, 1, 14 ; 16 kb L1 data cache
+db 21h, 2, 1, 18 ; 256 kb L2 data cache
+db 22h, 3, 1, 19 ; 512 kb L3 data cache
+db 23h, 3, 1, 20 ; 1 Mb L3 data cache
+db 25h, 3, 1, 21 ; 2 Mb L3 data cache
+db 29h, 3, 1, 22 ; 4 Mb L3 data cache
+db 2Ch, 1, 1, 15 ; 32 kb L1 data cache
+db 39h, 2, 1, 17 ; 128 kb L2 data cache
+db 3Ah, 2, 3, 16 ; 192 kb L2 data cache
+db 3Bh, 2, 1, 17 ; 128 kb L2 data cache
+db 3Ch, 2, 1, 18 ; 256 kb L2 data cache
+db 3Dh, 2, 3, 17 ; 384 kb L2 data cache
+db 3Eh, 2, 1, 19 ; 512 kb L2 data cache
+db 41h, 2, 1, 17 ; 128 kb L2 data cache
+db 42h, 2, 1, 18 ; 256 kb L2 data cache
+db 43h, 2, 1, 19 ; 512 kb L2 data cache
+db 44h, 2, 1, 20 ; 1 Mb L2 data cache
+db 45h, 2, 1, 21 ; 2 Mb L2 data cache
+db 46h, 3, 1, 22 ; 4 Mb L3 data cache
+db 47h, 3, 1, 23 ; 8 Mb L3 data cache
+db 48h, 2, 3, 20 ; 3 Mb L2 data cache
+db 49h, 2, 1, 22 ; 4 Mb L2 or 3 data cache
+db 4Ah, 3, 3, 21 ; 6 Mb L3 data cache
+db 4Bh, 3, 1, 23 ; 8 Mb L3 data cache
+db 4Ch, 3, 3, 22 ; 12 Mb L3 data cache
+db 4Dh, 3, 1, 24 ; 16 Mb L3 data cache
+db 4Eh, 2, 3, 21 ; 6 Mb L2 data cache
+db 60h, 1, 1, 14 ; 16 kb L1 data cache
+db 66h, 1, 1, 13 ; 8 kb L1 data cache
+db 67h, 1, 1, 14 ; 16 kb L1 data cache
+db 68h, 1, 1, 15 ; 32 kb L1 data cache
+db 78h, 2, 1, 20 ; 1 Mb L2 data cache
+db 79h, 2, 1, 17 ; 128 kb L2 data cache
+db 7Ah, 2, 1, 18 ; 256 kb L2 data cache
+db 7Bh, 2, 1, 19 ; 512 kb L2 data cache
+db 7Ch, 2, 1, 20 ; 1 Mb L2 data cache
+db 7Dh, 2, 1, 21 ; 2 Mb L2 data cache
+db 7Fh, 2, 1, 19 ; 512 kb L2 data cache
+db 82h, 2, 1, 18 ; 256 kb L2 data cache
+db 83h, 2, 1, 19 ; 512 kb L2 data cache
+db 84h, 2, 1, 20 ; 1 Mb L2 data cache
+db 85h, 2, 1, 21 ; 2 Mb L2 data cache
+db 86h, 2, 1, 19 ; 512 kb L2 data cache
+db 87h, 2, 1, 20 ; 1 Mb L2 data cache
+db 0D0h, 3, 1, 19 ; 512 kb L3 data cache
+db 0D1h, 3, 1, 20 ; 1 Mb L3 data cache
+db 0D2h, 3, 1, 21 ; 2 Mb L3 data cache
+db 0D6h, 3, 1, 20 ; 1 Mb L3 data cache
+db 0D7h, 3, 1, 21 ; 2 Mb L3 data cache
+db 0D8h, 3, 1, 22 ; 4 Mb L3 data cache
+db 0DCh, 3, 3, 19 ; 1.5 Mb L3 data cache
+db 0DDh, 3, 3, 20 ; 3 Mb L3 data cache
+db 0DEh, 3, 3, 21 ; 6 Mb L3 data cache
+db 0E2h, 3, 1, 21 ; 2 Mb L3 data cache
+db 0E3h, 3, 1, 22 ; 4 Mb L3 data cache
+db 0E4h, 3, 1, 23 ; 8 Mb L3 data cache
+db 0EAh, 3, 3, 22 ; 12 Mb L3 data cache
+db 0EBh, 3, 9, 21 ; 18 Mb L3 data cache
+db 0ECh, 3, 3, 23 ; 24 Mb L3 data cache
+descriptortablelength equ ($ - descriptortable_) / descriptor_record_size
+
+
+SECTION .text
+
+; extern "C" size_t DataCacheSize(int level);
+; Returns in rax the size in bytes of data cache <level> (1..numlevels); for level = 0 the first nonzero size among level3, level2, level1; 0 when level > numlevels (unsigned compare, so negative levels as well).
+; Function entry: level arrives in ecx (WINDOWS) or edi (otherwise) and is kept in r14d across the helper calls.
+DataCacheSize:
+ push rbx ; saved: cpuid in the detection helpers overwrites ebx
+ push r14 ; saved: r14d carries the level argument through the calls below
+%ifdef WINDOWS
+ push rsi ; saved on Windows only; IntelNewMethod uses esi as its loop counter
+ push rdi ; saved on Windows only; presumably used by one of the helpers - not visible here
+ mov r14d, ecx ; level (32-bit move also clears the upper half of r14)
+%else ; UNIX
+ mov r14d, edi ; level (32-bit move also clears the upper half of r14)
+%endif
+ ; check if called before
+ lea r9, [dataref] ; r9 = base for all data_layout offsets
+ cmp dword [r9+ok], 1 ; ok flag already set?
+ je D800 ; yes: skip detection and use the stored sizes
+
+ ; find cpu vendor
+ push 0 ; stack slot, initialised to 0, that receives the vendor code
+%ifdef WINDOWS
+ mov rcx, rsp ; 1st argument = address of the vendor slot
+ xor edx, edx ; 2nd argument = 0
+ xor r8d, r8d ; 3rd argument = 0
+%else ; UNIX
+ mov rdi, rsp ; 1st argument = address of the vendor slot
+ xor esi, esi ; 2nd argument = 0
+ xor edx, edx ; 3rd argument = 0
+%endif
+ call CpuType
+ lea r9, [dataref] ; reload base: r9 is a caller-saved register and may be changed by the call
+ pop rax ; eax = vendor
+ dec eax
+ jz Intel ; vendor = 1
+ dec eax
+ jz AMD ; vendor = 2
+ dec eax
+ jz VIA ; vendor = 3
+ ; unknown vendor, try all methods
+ call IntelNewMethod
+ jnc D800 ; not carry = success
+ call AMDMethod
+ jnc D800 ; not carry = success
+ call IntelOldMethod
+ jmp D800 ; return whether success or not
+
+Intel: call IntelNewMethod
+ jnc D800 ; not carry = success
+ call IntelOldMethod
+ jmp D800 ; return whether success or not
+
+AMD: ; AMD and VIA use same method
+VIA: call AMDMethod ; falls through to D800 whether success or not
+
+D800: ; cache data known, get desired return value
+ xor eax, eax ; default return value 0
+ cmp r14d, numlevels
+ ja D900 ; level out of range: return 0 without setting the ok flag
+ cmp r14d, 0
+ je D820
+ ; level = 1 .. numlevels
+ mov rax, [r9 + r14*8] ; size of selected cache (level1 is at offset 8, so level n is at n*8)
+ jmp D850
+D820: ; level = 0. Get size of largest level cache. NOTE(review): level4 is not consulted here although numlevels = 4 - confirm intended
+ mov rax, [r9 + level3] ; level3
+ test rax, rax
+ jnz D850 ; nonzero: return level 3 size
+ mov rax, [r9 + level2] ; level2
+ test rax, rax
+ jnz D850 ; nonzero: return level 2 size
+ mov eax, [r9 + level1] ; level1. NOTE(review): 32-bit load of the low dword only (upper half of rax is zeroed), unlike the 64-bit loads above
+D850: mov dword [r9 + ok], 1 ; remember called, whether success or not
+D900:
+%ifdef WINDOWS
+ pop rdi ; restore in reverse order of the pushes at entry
+ pop rsi
+%endif
+ pop r14
+ pop rbx
+ ret ; result in rax
+
+
+; Determine cache sizes by CPUID function 4
+; input: r9 = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
+; modifies: eax, ebx, ecx, edx, esi
+IntelNewMethod:
+ xor eax, eax
+ cpuid ; get number of CPUID functions
+ cmp eax, 4
+ jb I900 ; fail (jb is taken only with carry set, so the caller sees carry = 1)
+ xor esi, esi ; loop counter
+I100: mov eax, 4
+ mov ecx, esi
+ cpuid ; get cache parameters
+ mov edx, eax
+ and edx, 11111b ; cache type
+ jz I500 ; no more caches
+ cmp edx, 2
+ je I200 ; code cache, ignore
+ ; size = sets * ways * partitions * line size; each field is stored minus 1
+ inc ecx ; sets
+ mov edx, ebx
+ shr edx, 22
+ inc edx ; ways
+ imul ecx, edx
+ mov edx, ebx
+ shr edx, 12
+ and edx, 1111111111b
+ inc edx ; partitions
+ imul ecx, edx
+ and ebx, 111111111111b
+ inc ebx ; line size
+ imul rcx, rbx ; calculated cache size (64 bit)
+ shr eax, 5
+ and eax, 111b ; cache level
+ cmp eax, numlevels
+ jna I180
+ mov eax, numlevels ; limit higher levels
+I180: mov [r9+rax*8], rcx ; store size of data cache level eax
+I200: inc esi
+ cmp esi, 100h ; avoid infinite loop
+ jb I100 ; next cache
+I500: ; loop finished
+ ; check if OK
+ mov eax, [r9+level1] ; level1
+ cmp eax, 1024 ; carry = 1 if the recorded level 1 size is below 1024
+I900: ret ; carry flag set if fail
+
+; Determine cache sizes by CPUID function 2
+; input: r9 = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
+; modifies: eax, ebx, ecx, edx
+IntelOldMethod:
+ xor eax, eax
+ cpuid ; get number of CPUID functions
+ cmp eax, 2
+ jb J900 ; fail (jb is taken only with carry set, so the caller sees carry = 1)
+ mov eax, 2
+ xor ecx, ecx
+ cpuid ; get 16 descriptor bytes in eax, ebx, ecx, edx
+ mov al, 0 ; al does not contain a descriptor
+ sub rsp, 16 ; 16 byte scratch buffer on the stack
+ mov [rsp], eax ; save all descriptors
+ mov [rsp+4], ebx
+ mov [rsp+8], ecx
+ mov [rsp+12], edx
+ mov edx, 15 ; loop counter
+ ; loop to read 16 descriptor bytes
+J100: mov al, byte [rsp+rdx]
+ ; find in table
+ mov ebx, descriptortablelength-1 ; loop counter
+ ; loop to search in descriptortable
+J200: cmp al, [r9 + descriptortable + rbx*4 + d_key]
+ jne J300
+ ; descriptor found
+ ; NOTE(review): the movzx below overwrites al, and the scan is not left after
+ ; a match, so the remaining table entries are compared against the low byte
+ ; of the computed size rather than against the descriptor.
+ movzx eax, byte [r9 + descriptortable + rbx*4 + d_sizem]
+ mov cl, [r9 + descriptortable + rbx*4 + d_2pow]
+ shl eax, cl ; compute size
+ movzx ecx, byte [r9 + descriptortable + rbx*4 + d_level]
+ ; check that level = 1-3
+ cmp ecx, 3
+ ja J300
+ mov [r9+rcx*8], rax ; store size eax of data cache level ecx
+J300: dec ebx
+ jns J200 ; inner loop
+ dec edx
+ jns J100 ; outer loop
+ add rsp, 16 ; remove from stack
+ ; check if OK
+ mov eax, [r9 + level1]
+ cmp eax, 1024 ; carry = 1 if the recorded level 1 size is below 1024
+J900: ret ; carry flag set if fail
+
+
+; Determine cache sizes by CPUID function 80000005H - 80000006H
+; input: r9 = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
+; modifies: eax, ebx, ecx, edx
+AMDMethod:
+ mov eax, 80000000H
+ cpuid ; eax = highest supported extended CPUID function
+ ; The value is reported as a full function number (800000xxH), so it has to
+ ; be compared against 80000006H. A comparison against 6 would never fail on a
+ ; CPU that reports extended functions but stops below 80000006H.
+ cmp eax, 80000006H
+ jb K900 ; fail (jb is taken only with carry set, so the caller sees carry = 1)
+ mov eax, 80000005H
+ cpuid ; get L1 cache size
+ shr ecx, 24 ; L1 data cache size in kbytes
+ shl ecx, 10 ; L1 data cache size in bytes
+ mov [r9 + level1], ecx ; store L1 data cache size
+ mov eax, 80000006H
+ cpuid ; get L2 and L3 cache sizes
+ shr ecx, 16 ; L2 data cache size in kbytes
+ shl ecx, 10 ; L2 data cache size in bytes
+ mov [r9 + level2], ecx ; store L2 data cache size
+ mov ecx, edx
+ shr ecx, 18 ; L3 data cache size / 512 kbytes
+ shl rcx, 19 ; L3 data cache size in bytes
+%if 0 ; AMD manual is unclear:
+ ; do we have to increase the value if the number of ways is not a power of 2?
+ shr edx, 12
+ and edx, 1111b ; L3 associativity
+ cmp edx, 3
+ jb K100
+ test edx, 1
+ jz K100
+ ; number of ways is not a power of 2, multiply by 1.5 ?
+ mov rax, rcx
+ shr rax, 1
+ add rcx, rax
+%endif
+K100: mov [r9 + level3], rcx ; store L3 data cache size
+ ; check if OK
+ mov eax, [r9 + level1]
+ cmp eax, 1024 ; carry = 1 if the recorded level 1 size is below 1024
+K900: ret ; carry flag set if fail
diff --git a/contrib/libs/asmlib/cpuid64.asm b/contrib/libs/asmlib/cpuid64.asm new file mode 100644 index 0000000000..95f1b5a22d --- /dev/null +++ b/contrib/libs/asmlib/cpuid64.asm @@ -0,0 +1,55 @@ +%include "defs.asm" + +;************************* cpuid64.asm *********************************
+; Author: Agner Fog
+; Date created: 2008-12-14
+; Last modified: 2011-07-01
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Description:
+; This function calls the CPUID instruction.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global cpuid_ex: function
+
+SECTION .text align=16
+
+; ********** cpuid_ex function **********
+; C++ prototype:
+; extern "C" void cpuid_ex (int abcd[4], int a, int c);
+; Input: a = eax, c = ecx
+; Output: abcd[0] = eax, abcd[1] = ebx, abcd[2] = ecx, abcd[3] = edx
+
+
+cpuid_ex:
+; Runs cpuid with eax = a and ecx = c and stores the four result registers
+; into abcd[0..3]. The save/restore of rbx is shared by both calling
+; conventions; only the argument shuffling differs.
+ push rbx ; cpuid overwrites ebx
+
+%IFDEF WINDOWS
+; arguments arrive as: rcx = abcd, edx = a, r8d = c
+ xchg rcx, r8 ; ecx = c, r8 = abcd
+ mov eax, edx
+ cpuid ; input eax, ecx. output eax, ebx, ecx, edx
+ mov [r8], eax
+ mov [r8+4], ebx
+ mov [r8+8], ecx
+ mov [r8+12], edx
+%ENDIF
+%IFDEF UNIX
+; arguments arrive as: rdi = abcd, esi = a, edx = c
+ mov eax, esi
+ mov ecx, edx
+ cpuid ; input eax, ecx. output eax, ebx, ecx, edx
+ mov [rdi], eax
+ mov [rdi+4], ebx
+ mov [rdi+8], ecx
+ mov [rdi+12], edx
+%ENDIF
+
+ pop rbx
+ ret
+;cpuid_ex END
diff --git a/contrib/libs/asmlib/cputype64.asm b/contrib/libs/asmlib/cputype64.asm new file mode 100644 index 0000000000..633ebee86a --- /dev/null +++ b/contrib/libs/asmlib/cputype64.asm @@ -0,0 +1,127 @@ +%include "defs.asm" + +;************************* cputype64.asm **********************************
+; Author: Agner Fog
+; Date created: 2011-07-09
+; Last modified: 2011-07-09
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" void CpuType(int * vendor, int * family, int * model);
+;
+; Description:
+; This function finds the vendor, family and model number of the CPU
+; and returns the values through the pointers. If a pointer is zero
+; then the value is not returned.
+;
+; Vendor:
+; 0 = unknown
+; 1 = Intel
+; 2 = AMD
+; 3 = VIA/Centaur
+; 4 = Cyrix
+; 5 = NexGen
+;
+; Family: This is the sum of the family and extended family fields of the cpuid
+; Model: This is the model + (extended model << 4)
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" void CpuType(int * vendor, int * family, int * model);
+
+global CpuType: function
+
+
+SECTION .text
+
+; Identifies the CPU vendor from the CPUID function 0 vendor string and the
+; family and model from CPUID function 1, and stores each value through the
+; corresponding pointer argument unless that pointer is zero. Returns eax = 0.
+; Uses r9, r10 and r11 as scratch registers.
+CpuType:
+ push rbx ; cpuid overwrites ebx
+%ifdef UNIX
+ mov r8, rdx ; model pointer; vendor and family are already in rdi and rsi
+%endif
+%ifdef WINDOWS
+ push rsi
+ push rdi
+ mov rdi, rcx ; vendor pointer
+ mov rsi, rdx ; family pointer; model is already in r8
+%endif
+
+; parameters
+; vendor rdi
+; family rsi
+; model r8
+
+ xor r9d, r9d ; vendor
+ xor r10d, r10d ; family
+ xor r11d, r11d ; model
+
+ xor eax, eax
+ cpuid ; get vendor
+ ; ecx = last 4 characters of vendor string
+ ; ebx = first 4 characters of vendor string
+ cmp ecx, 'ntel' ; 'GenuineIntel'
+ je C110
+ cmp ecx, 'cAMD' ; 'AuthenticAMD'
+ je C120
+ cmp ebx, 'Cent' ; 'CentaurHauls'
+ je C130
+ cmp ebx, 'VIA ' ; 'VIA VIA VIA '
+ je C130
+ cmp ebx, 'Cyri' ; 'CyrixInstead'
+ je C140
+ cmp ebx, 'NexG' ; 'NexGenDriven'
+ je C150
+ jmp C200 ; other
+ ; r9d is zero here, so each "or" below simply sets the vendor code
+C110: or r9d, 1
+ jmp C200
+C120: or r9d, 2
+ jmp C200
+C130: or r9d, 3
+ jmp C200
+C140: or r9d, 4
+ jmp C200
+C150: or r9d, 5
+ ;jmp C200
+C200:
+
+ ; Get family and model
+ mov eax, 1
+ cpuid
+ mov ebx, eax
+ mov r10d, eax
+ shr ebx, 8
+ and ebx, 0FH ; Family
+ shr r10d, 20
+ and r10d, 0FFH ; Extended family
+ add r10d, ebx ; Family + extended family
+
+ mov r11d, eax
+ shr r11d, 4
+ and r11d, 0FH ; Model
+ shr eax, 12 ; moves the extended model field (bits 16-19) to bits 4-7
+ and eax, 0F0H ; Extended model
+ or r11d, eax ; extended model | Model
+
+C300: ; return r9d = vendor, r10d = family, r11d = model
+ test rdi, rdi
+ jz C310 ; skip the store if the vendor pointer is zero
+ mov [rdi], r9d
+C310: test rsi, rsi
+ jz C320 ; skip the store if the family pointer is zero
+ mov [rsi], r10d
+C320: test r8, r8
+ jz C330 ; skip the store if the model pointer is zero
+ mov [r8], r11d
+C330: xor eax, eax
+ ; return
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop rbx
+ ret
+;CpuType ENDP
diff --git a/contrib/libs/asmlib/debugbreak64.asm b/contrib/libs/asmlib/debugbreak64.asm new file mode 100644 index 0000000000..ed2971cd24 --- /dev/null +++ b/contrib/libs/asmlib/debugbreak64.asm @@ -0,0 +1,33 @@ +%include "defs.asm" + +;************************* debugbreak64.asm **********************************
+; Author: Agner Fog
+; Date created: 2011-07-09
+; Last modified: 2011-07-09
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" void A_DebugBreak(void);
+;
+; Description:
+; Makes a debug breakpoint. Works only when running under a debugger
+;
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" void A_DebugBreak(void);
+
+global A_DebugBreak: function
+
+
+SECTION .text
+
+; Executes a breakpoint instruction and returns. Takes no arguments and
+; modifies no registers.
+A_DebugBreak:
+ int3 ; breakpoint
+ nop
+ ret
+;A_DebugBreak ENDP
diff --git a/contrib/libs/asmlib/defs.asm b/contrib/libs/asmlib/defs.asm new file mode 100644 index 0000000000..db313e6cf1 --- /dev/null +++ b/contrib/libs/asmlib/defs.asm @@ -0,0 +1,22 @@ +%ifdef UNIX + %ifdef DARWIN + %define EXP(x) _ %+ x + %else + %define EXP(x) x + %endif +%else + %define EXP(x) _ %+ x + %define WINDOWS +%endif + +%define ALLOW_OVERRIDE 1 + +%ifdef WINDOWS + %define WEAK_SYM(x) global x +%else + %ifdef DARWIN + %define WEAK_SYM(x) global x + %else + %define WEAK_SYM(x) weak x + %endif +%endif diff --git a/contrib/libs/asmlib/dispatchpatch64.asm b/contrib/libs/asmlib/dispatchpatch64.asm new file mode 100644 index 0000000000..205fac543d --- /dev/null +++ b/contrib/libs/asmlib/dispatchpatch64.asm @@ -0,0 +1,303 @@ +%include "defs.asm" + +;*********************** dispatchpatch64.asm ********************************
+; Author: Agner Fog
+; Date created: 2007-07-20
+; Last modified: 2013-08-21
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" int __intel_cpu_indicator = 0;
+; extern "C" void __intel_cpu_indicator_init()
+;
+; Description:
+; Example of how to replace Intel CPU dispatcher in order to improve
+; compatibility of Intel function libraries with non-Intel processors.
+; Only works with static link libraries (*.lib, *.a), not dynamic libraries
+; (*.dll, *.so). Linking in this as an object file will override the functions
+; with the same name in the library.
+;
+; Copyright (c) 2007-2013 GNU LGPL License v. 3.0 www.gnu.org/licenses/lgpl.html
+;******************************************************************************
+
+; extern InstructionSet: function
+%include "instrset64.asm" ; include code for InstructionSet function
+
+; InstructionSet function return value:
+; 4 or above = SSE2 supported
+; 5 or above = SSE3 supported
+; 6 or above = Supplementary SSE3
+; 8 or above = SSE4.1 supported
+; 9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+; 13 or above = AVX2 supported
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+; 15 or above = HLE + RTM supported
+
+
+global __intel_cpu_indicator
+global __intel_cpu_indicator_init
+
+
+SECTION .data
+intel_cpu_indicator@: ; local name
+__intel_cpu_indicator: dd 0
+
+; table of indicator values
+; The table is indexed by the value returned from InstructionSet.
+itable DD 1 ; 0: generic version, 80386 instruction set
+ DD 8, 8 ; 1, 2: MMX
+ DD 0x80 ; 3: SSE
+ DD 0x200 ; 4: SSE2
+ DD 0x800 ; 5: SSE3
+ DD 0x1000, 0x1000 ; 6, 7: SSSE3
+ DD 0x2000, 0x2000 ; 8, 9: SSE4.1
+ DD 0x8000, 0x8000 ; 10, 11: SSE4.2 and popcnt
+ DD 0x20000, 0x20000 ; 12, 13: AVX, pclmul, aes
+ DD 0x400000 ; 14: AVX2, F16C, BMI1, BMI2, LZCNT, FMA3
+ DD 0x800000 ; 15: HLE, RTM
+itablelen equ ($ - itable) / 4 ; length of table
+
+SECTION .text
+
+; Stores itable[min(InstructionSet(), itablelen - 1)] in __intel_cpu_indicator.
+; Every register pushed below is restored before returning, including rax,
+; so the function returns no value.
+__intel_cpu_indicator_init:
+ push rax ; registers must be pushed
+ push rcx
+ push rdx
+ push r8
+ push r9
+ push r10
+ push r11
+ push rsi
+ push rdi
+ call InstructionSet
+ cmp eax, itablelen
+ jb L100
+ mov eax, itablelen - 1 ; limit to table length
+L100: lea rdx, [rel itable]
+ mov eax, [rdx + 4*rax] ; look up in table
+ mov [rel intel_cpu_indicator@], eax ; store in __intel_cpu_indicator
+ pop rdi
+ pop rsi
+ pop r11
+ pop r10
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+ pop rax
+ ret
+
+;__intel_cpu_indicator_init ENDP
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for Math Kernel Library (MKL),
+; version 10.2 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; mkl_serv_cpu_detect returns eax = mkltab[min(InstructionSet(), mkltablen - 1)].
+; The symbol is exported through the WEAK_SYM macro.
+WEAK_SYM(mkl_serv_cpu_detect)
+
+SECTION .data
+; table of indicator values
+; Note: the table is different in 32 bit and 64 bit mode
+; The table is indexed by the value returned from InstructionSet.
+
+mkltab DD 0, 0, 0, 0 ; 0-3: generic version, 80386 instruction set
+ DD 0 ; 4: SSE2
+ DD 1 ; 5: SSE3
+ DD 2, 2, 2, 2 ; 6-9: SSSE3
+ DD 3 ; 10: SSE4.2
+ DD 4, 4, 4 ; 11-13: AVX
+ DD 5 ; 14: AVX2, FMA3, BMI1, BMI2, LZCNT, PCLMUL
+mkltablen equ ($ - mkltab) / 4 ; length of table
+
+SECTION .text
+
+mkl_serv_cpu_detect:
+ push rcx ; Perhaps not needed
+ push rdx
+ push r8
+ push r9
+%ifdef WINDOWS
+ push rsi
+ push rdi
+%endif
+ call InstructionSet
+ cmp eax, mkltablen
+ jb M100
+ mov eax, mkltablen - 1 ; limit to table length
+M100:
+ lea rdx, [rel mkltab]
+ mov eax, [rdx + 4*rax] ; look up in table; this is the return value
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+ ret
+; end mkl_serv_cpu_detect
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for Vector Math Library (VML)
+; version 10.0 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; mkl_vml_serv_cpu_detect returns
+; eax = vmltab[min(InstructionSet(), vmltablen - 1)].
+; The symbol is exported through the WEAK_SYM macro.
+WEAK_SYM(mkl_vml_serv_cpu_detect)
+
+SECTION .data
+; table of indicator values
+; Note: the table is different in 32 bit and 64 bit mode
+; The table is indexed by the value returned from InstructionSet.
+
+vmltab DD 0, 0, 0, 0 ; 0-3: generic version, 80386 instruction set
+ DD 1, 1 ; 4-5: SSE2
+ DD 2, 2 ; 6-7: SSSE3
+ DD 3, 3 ; 8-9: SSE4.1
+ DD 4 ; 10: SSE4.2
+ DD 5, 5, 5 ; 11-13: AVX
+; DD 6 ??
+vmltablen equ ($ - vmltab) / 4 ; length of table
+
+SECTION .text
+
+mkl_vml_serv_cpu_detect:
+ push rcx ; Perhaps not needed
+ push rdx
+ push r8
+ push r9
+%ifdef WINDOWS
+ push rsi
+ push rdi
+%endif
+ call InstructionSet
+ cmp eax, vmltablen
+ jb V100
+ mov eax, vmltablen - 1 ; limit to table length
+V100:
+ lea rdx, [rel vmltab]
+ mov eax, [rdx + 4*rax] ; look up in table; this is the return value
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+ ret
+; end mkl_vml_serv_cpu_detect
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for __intel_cpu_feature_indicator
+; version 13 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global __intel_cpu_features_init
+global __intel_cpu_feature_indicator
+global __intel_cpu_fms_indicator
+global __intel_cpu_features_init_x
+global __intel_cpu_feature_indicator_x
+global __intel_cpu_fms_indicator_x
+
+SECTION .data
+; table of indicator values
+
+intel_cpu_feature_indicator@:
+__intel_cpu_feature_indicator:
+__intel_cpu_feature_indicator_x DD 0, 0
+intel_cpu_fms_indicator@:
+__intel_cpu_fms_indicator:
+__intel_cpu_fms_indicator_x: DD 0, 0
+
+
+; The table is indexed by the value returned from InstructionSet.
+feattab DD 1 ; 0 default
+ DD 0BH ; 1 MMX
+ DD 0FH ; 2 conditional move and FCOMI supported
+ DD 3FH ; 3 SSE
+ DD 7FH ; 4 SSE2
+ DD 0FFH ; 5 SSE3
+ DD 1FFH, 1FFH ; 6-7 Supplementary SSE3
+ DD 3FFH ; 8 SSE4.1
+ DD 0BFFH ; 9 POPCNT
+ DD 0FFFH ; 10 SSE4.2
+ DD 10FFFH ; 11 AVX
+ DD 16FFFH ; 12 PCLMUL and AES
+ DD 816FFFH ; 13 AVX2
+ DD 9DEFFFH ; 14 FMA3, F16C, BMI1, BMI2, LZCNT
+ DD 0FDEFFFH ; 15 HLE, RTM
+
+feattablen equ ($ - feattab) / 4 ; length of table
+
+SECTION .text
+
+; Fills in the two indicator variables defined above:
+; __intel_cpu_feature_indicator = feattab[min(InstructionSet(), feattablen - 1)],
+; with bit 1000H added when CPUID function 1 reports MOVBE
+; __intel_cpu_fms_indicator = stepping (bits 0-3), full model (bits 8-15)
+; and full family (bits 16-23) from CPUID function 1
+; All registers pushed below are restored; eax is left modified.
+__intel_cpu_features_init:
+__intel_cpu_features_init_x:
+ push rbx
+ push rcx ; Perhaps not needed
+ push rdx
+ push r8
+ push r9
+%ifdef WINDOWS
+ push rsi
+ push rdi
+%endif
+ call InstructionSet
+ cmp eax, feattablen
+ jb F100
+ mov eax, feattablen - 1 ; limit to the length of feattab, the table indexed below
+F100:
+ lea rdx, [rel feattab]
+ mov ebx, [rdx + 4*rax] ; look up in table
+ push rbx ; keep the table value, cpuid overwrites ebx
+ mov eax, 1
+ cpuid
+ pop rbx
+ bt ecx, 22 ; MOVBE
+ jnc F200
+ or ebx, 1000H
+F200: mov [rel intel_cpu_feature_indicator@], rbx ; upper half of rbx is zero
+
+ ; get family and model
+ mov edx, eax
+ and eax, 0FH ; stepping bit 0-3
+ mov ecx, edx
+ shr ecx, 4
+ and ecx, 0FH ; model
+ mov ebx, edx
+ shr ebx, 12
+ and ebx, 0F0H ; x model
+ or ecx, ebx ; full model
+ mov ah, cl ; model bit 8 - 15
+ mov ecx, edx
+ shr ecx, 8
+ and ecx, 0FH ; family
+ mov ebx, edx
+ shr ebx, 20
+ and ebx, 0FFH ; x family
+ add ecx, ebx ; full family
+ shl ecx, 16
+ or eax, ecx ; full family bit 16 - 23
+ mov [rel intel_cpu_fms_indicator@], eax
+
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+ pop rbx
+ ret
+; end __intel_cpu_features_init
+
+
+
+
diff --git a/contrib/libs/asmlib/divfixedi64.asm b/contrib/libs/asmlib/divfixedi64.asm new file mode 100644 index 0000000000..bf8ab137a9 --- /dev/null +++ b/contrib/libs/asmlib/divfixedi64.asm @@ -0,0 +1,173 @@ +%include "defs.asm" + +;************************* divfixedi64.asm *********************************
+; Author: Agner Fog
+; Date created: 2011-07-22
+; Last modified: 2011-07-22
+;
+; Function prototypes:
+; void setdivisori32(int buffer[2], int d);
+; int dividefixedi32(const int buffer[2], int x);
+; void setdivisoru32(uint32_t buffer[2], uint32_t d);
+; uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
+;
+; Description:
+; Functions for fast repeated integer division by the same divisor, signed
+; and unsigned 32-bit integer versions. The divisor must be positive.
+;
+; The setdivisor functions calculate the reciprocal divisor and shift counts,
+; the dividefixed functions do the division by multiplication and shift.
+;
+; The methods used are described by:
+; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
+;
+; Mathematical formula, unsigned division:
+; x = dividend
+; d = divisor
+; n = integer size, bits
+; L = ceil(log2(d))
+; m = 1 + 2^n * (2^L-d) / d [2^L should overflow to 0 if L = n]
+; sh1 = min(L,1)
+; sh2 = max(L-1,0)
+; t = m*x >> n [high part of unsigned multiplication]
+; x/d = (((x-t) >> sh1) + t) >> sh2
+;
+; Mathematical formula, signed division:
+; x = dividend
+; d = abs(divisor)
+; n = integer size, bits
+; L = ceil(log2(d))
+; L = max(L,1)
+; m = 1 + 2^(n+L-1)/d - 2^n [division should overflow to 0 if d = 1]
+; sh1 = L-1
+; q = x + (m*x >> n) [high part of signed multiplication]
+; q = (q >> sh1) - (x<0 ? -1 : 0)
+; if (divisor < 0) q = -q [negative divisor not supported in present implementation]
+; x/d = q
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+; Register aliases for the two calling conventions. The Windows entry points
+; below copy their arguments into buf and rxd first; on UNIX buf and rxd are
+; the argument registers themselves.
+%IFDEF WINDOWS
+%define par1 rcx ; function parameter 1
+%define par2 edx ; function parameter 2
+%define buf r9 ; copy of function parameter 1: buffer
+%define rx r8
+%define rxd r8d ; d or x
+%ELSE ; UNIX
+%define par1 rdi ; function parameter 1
+%define par2 esi ; function parameter 2
+%define buf rdi ; function parameter 1: buffer
+%define rx rsi
+%define rxd esi ; d or x
+%ENDIF
+
+
+section .text
+
+; extern "C" void setdivisori32(int buffer[2], int d);
+; 32 bit signed
+; Precomputes the constants for signed division by d (see the signed formula
+; in the file header): buffer[0] = multiplier m, buffer[1] = shift count L-1.
+; d = 0 faults in the div instruction below; d < 0 jumps to H120, which
+; forces a division overflow.
+
+global setdivisori32: function
+setdivisori32:
+%IFDEF WINDOWS
+ mov rxd, edx ; d
+ mov buf, rcx ; buffer
+%ENDIF
+ dec rxd ; rxd = r8d or esi
+ mov ecx, -1 ; value for bsr if rxd = 0 (assuming bsr leaves dest unchanged if src = 0, this works on both Intel, AMD and VIA processors)
+ bsr ecx, rxd ; floor(log2(d-1))
+ inc rxd ; restore d; the sign flag now reflects d
+ js H120 ; d < 0. Generate error
+ inc ecx ; L = ceil(log2(d))
+ sub ecx, 1 ; shift count = L - 1
+ adc ecx, 0 ; avoid negative shift count
+ xor eax, eax
+ mov edx, 1
+ cmp rxd, edx
+ je H110 ; avoid overflow when d = 1
+ shl edx, cl ; edx:eax = 2^(32+L-1)
+ div rxd
+H110: inc eax
+ mov [buf], eax ; multiplier
+ mov [buf+4], ecx ; shift count
+ ret
+
+H120: ; d <= 0 not supported. Generate error
+ mov edx, 1
+ div edx ; will overflow
+ ud2
+
+
+; extern "C" int dividefixedi32(int buffer[2], int x);
+; Returns x divided by the divisor that setdivisori32 stored in buffer:
+; q = x + high dword of (m * x), then q = (q >> shift) - (x < 0 ? -1 : 0).
+global dividefixedi32: function
+dividefixedi32:
+%IFDEF WINDOWS
+ mov eax, edx
+ mov rxd, edx ; x
+ mov buf, rcx ; buffer
+%ELSE
+ mov eax, esi
+%ENDIF
+ imul dword [buf] ; m
+ lea eax, [rdx+rx] ; rx = r8 or rsi
+ mov ecx, [buf+4] ; shift count
+ sar eax, cl
+ sar rxd, 31 ; sign(x)
+ sub eax, rxd ; subtracting -1 adds 1 when x is negative
+ ret
+
+
+;extern "C" void setdivisoru32(int buffer[2], int d);
+; 32 bit unsigned
+; Precomputes the constants for unsigned division by d (see the unsigned
+; formula in the file header): buffer[0] = multiplier m,
+; buffer[1] = shift1 in bits 0-7 and shift2 in bits 8-15.
+
+global setdivisoru32: function
+setdivisoru32:
+%IFDEF WINDOWS
+ mov rxd, edx ; d
+ mov buf, rcx ; buffer
+%ENDIF
+ dec rxd ; rxd = r8d or esi
+ mov ecx, -1 ; value for bsr if r8d = 0
+ bsr ecx, rxd ; floor(log2(d-1))
+ inc rxd
+ inc ecx ; L = ceil(log2(d))
+ mov edx, 1
+ shl rdx, cl ; 2^L (64 bit shift because cl may be 32)
+ sub edx, rxd ; 2^L - d
+ xor eax, eax ; edx:eax = 2^32 * (2^L - d)
+ div rxd
+ inc eax
+ mov [buf], eax ; multiplier
+ sub ecx, 1 ; L - 1; the flags are used by both set instructions below
+ setae dl ; 1 unless L = 0
+ movzx edx, dl ; shift1
+ seta al ; 1 if L - 1 > 0
+ neg al ; 0 or FFH mask
+ and al,cl
+ movzx eax, al ; shift 2
+ shl eax, 8
+ or eax, edx
+ mov [buf+4], eax ; shift 1 and shift 2
+ ret
+
+;extern "C" int dividefixedu32(int buffer[2], int x);
+; Returns x divided by the divisor that setdivisoru32 stored in buffer:
+; t = high dword of (m * x), result = (((x - t) >> shift1) + t) >> shift2.
+global dividefixedu32: function ; unsigned
+dividefixedu32:
+%IFDEF WINDOWS
+ mov eax, edx
+ mov rxd, edx ; x
+ mov buf, rcx ; buffer
+%ELSE
+ mov eax, esi
+%ENDIF
+ mul dword [buf] ; m
+ sub rxd, edx ; x-t
+ mov ecx, [buf+4] ; shift 1 and shift 2
+ shr rxd, cl ; cl = shift 1
+ lea eax, [rx+rdx]
+ shr ecx, 8 ; cl = shift 2
+ shr eax, cl
+ ret
diff --git a/contrib/libs/asmlib/divfixedv64.asm b/contrib/libs/asmlib/divfixedv64.asm new file mode 100644 index 0000000000..a4f0e177ec --- /dev/null +++ b/contrib/libs/asmlib/divfixedv64.asm @@ -0,0 +1,498 @@ +%include "defs.asm" + +;************************* divfixedv64.asm *********************************
+; Author: Agner Fog
+; Date created: 2011-07-25
+; Last modified: 2012-03-10
+;
+; Function prototypes:
+; void setdivisorV8i16(__m128i buf[2], int16_t d);
+; void setdivisorV8u16(__m128i buf[2], uint16_t d);
+; void setdivisorV4i32(__m128i buf[2], int32_t d);
+; void setdivisorV4u32(__m128i buf[2], uint32_t d);
+;
+; __m128i dividefixedV8i16(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV4i32(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
+;
+; Alternative versions for VectorClass.h:
+; (These versions pack all parameters into a single register)
+; __m128i setdivisor8s(int16_t d);
+; __m128i setdivisor8us(uint16_t d);
+; __m128i setdivisor4i(int32_t d);
+; __m128i setdivisor4ui(uint32_t d);
+;
+; Description:
+; Functions for integer vector division by the same divisor, signed
+; and unsigned 16-bit and 32-bit integer versions.
+;
+; The setdivisor functions calculate the reciprocal divisor and shift counts,
+; the dividefixed functions do the division by multiplication and shift of the
+; vector elements of packed 16-bit or 32-bit signed or unsigned integers.
+;
+; The divisor must be positive. A zero divisor generates a divide by zero error.
+; A negative divisor generates a division overflow error. To divide by a negative
+; divisor, change the sign of the divisor and the result.
+;
+; The methods used are described in this article:
+; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
+;
+; Mathematical formula, unsigned division:
+; x = dividend
+; d = divisor
+; n = integer size, bits
+; L = ceil(log2(d))
+; m = 1 + 2^n * (2^L-d) / d [2^L should overflow to 0 if L = n]
+; sh1 = min(L,1)
+; sh2 = max(L-1,0)
+; t = m*x >> n [high part of unsigned multiplication]
+; x/d = (((x-t) >> sh1) + t) >> sh2
+;
+; Mathematical formula, signed division:
+; x = dividend
+; d = abs(divisor)
+; n = integer size, bits
+; L = ceil(log2(d))
+; L = max(L,1)
+; m = 1 + 2^(n+L-1)/d - 2^n [division should overflow to 0 if d = 1]
+; sh1 = L-1
+; q = x + (m*x >> n) [high part of signed multiplication]
+; q = (q >> sh1) - (x<0 ? -1 : 0)
+; if (divisor < 0) q = -q [negative divisor not supported in present implementation]
+; x/d = q
+;
+; Copyright (c) 2011 - 2012 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+; Register aliases for the two calling conventions: the first two integer
+; argument registers in 64, 32 and 16 bit width, and the register used as
+; buffer pointer (on UNIX this is parameter 1 itself).
+%IFDEF WINDOWS
+%define par1 rcx ; function parameter 1
+%define par1d ecx
+%define par1w cx
+%define par2 rdx ; function parameter 2
+%define par2d edx
+%define par2w dx
+%define buf r8 ; pointer to buffer
+%ENDIF
+%IFDEF UNIX
+%define par1 rdi ; function parameter 1
+%define par1d edi
+%define par1w di
+%define par2 rsi ; function parameter 2
+%define par2d esi
+%define par2w si
+%define buf rdi ; pointer to buffer
+%ENDIF
+
+
+; Imported from instrset64.asm:
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+section .text align = 16
+
+;******************************************************************************
+; 16 bit signed integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor8s(int16_t d);
+; vector of 8 x 16 bit signed integers
+; Returns xmm0 with the multiplier in the lower 4 words and the shift count
+; in the upper half. The shift count is also left in xmm1.
+
+global setdivisor8s: function
+setdivisor8s:
+ push rbx
+ movsx ebx, par1w ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx ; restore d; the sign flag now reflects d
+ js H120 ; Generate error if d < 0. (error for d=0 will come in the div instruction)
+ inc ecx ; L = ceil(log2(d))
+ sub ecx, 1 ; shift count = L - 1
+ adc ecx, 0 ; avoid negative shift count
+ xor eax, eax
+ mov edx, 1
+ cmp ebx, edx
+ je H110 ; avoid division overflow when d = 1
+ shl edx, cl
+ div bx ; 2^(16+L-1)/d
+H110: inc eax
+ movd xmm0, eax ; multiplier
+ pshuflw xmm0, xmm0, 0 ; broadcast into lower 4 words
+ movd xmm1, ecx ; shift count
+ punpcklqdq xmm0, xmm1 ; insert shift count into upper half
+ pop rbx
+ ret
+H120: ; d < 0 not supported. Generate error
+ mov edx, 1
+ div edx ; edx:eax / 1 does not fit in eax, so this overflows
+ ud2
+; setdivisor8s end
+
+
+; extern "C" void setdivisorV8i16(__m128i buf[2], int16_t d);
+; vector of 8 x 16 bit signed integers
+; Wrapper around setdivisor8s: buf[0] = multiplier in all 8 words,
+; buf[1] = shift count. The stores use movdqa, so buf must be 16-byte aligned.
+
+global setdivisorV8i16: function
+setdivisorV8i16:
+ push par1 ; buf
+ mov par1d, par2d ; d
+ call setdivisor8s
+ pop rax ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [rax], xmm0 ; multiplier
+ movdqa [rax+16], xmm1 ; shift count is still in xmm1
+ ret
+; setdivisorV8i16 end
+
+
+; extern "C" int dividefixedV8i16(const __m128i buf[2], __m128i x);
+; Divides the 8 signed words of x by the divisor prepared in buf and returns
+; the quotients in xmm0. Per element: q = x + high word of (m * x), then
+; q = (q >> shift) - (x < 0 ? -1 : 0).
+global dividefixedV8i16: function
+
+dividefixedV8i16:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF WINDOWS
+ movdqa xmm0, [par2] ; x
+%ENDIF
+ movdqa xmm1, xmm0 ; x
+ pmulhw xmm0, [par1] ; multiply high signed words
+ paddw xmm0, xmm1
+ movd xmm2, [par1+16] ; shift count
+ psraw xmm0, xmm2 ; shift right arithmetic
+ psraw xmm1, 15 ; sign of x
+ psubw xmm0, xmm1 ; subtracting -1 adds 1 where x is negative
+ ret
+;dividefixedV8i16 end
+
+
+
+;******************************************************************************
+; 16 bit unsigned integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor8us(uint16_t d);
+; vector of 8 x 16 bit unsigned integers
+; Returns xmm0 with the multiplier in the lower 4 words, shift1 in dword 2
+; and shift2 in dword 3. The two shift counts are also left in the lower two
+; dwords of xmm1.
+
+align 16
+global setdivisor8us: function
+setdivisor8us:
+ push rbx
+ movzx ebx, par1w ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ inc ecx ; L = ceil(log2(d))
+ mov edx, 1
+ shl edx, cl ; 2^L [32-bit shift to allow overflow]
+ sub edx, ebx ; 2^L - d
+ xor eax, eax
+ div bx ; dx:ax / d
+ inc eax
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 0 ; broadcast into lower 4 words
+ sub ecx, 1 ; L - 1; the flags are used by both set instructions below
+ setae dl ; 1 unless L = 0
+ movzx edx, dl ; shift 1
+ seta al ; 1 if L - 1 > 0
+ neg al ; 0 or FFH mask
+ and al,cl
+ movzx eax, al ; shift 2
+ movd xmm1, edx ; shift 1
+ movd xmm2, eax ; shift 2
+ punpckldq xmm1, xmm2 ; combine into two dwords
+ punpcklqdq xmm0, xmm1 ; multipliers, shift1, shift2
+ pop rbx
+ ret
+; setdivisor8us end
+
+
+;extern "C" void setdivisorV8u16(__m128i buf[2], uint16_t d);
+; 8 x 16 bit unsigned
+; Wrapper around setdivisor8us: buf[0] = multiplier in all 8 words,
+; buf[1] = shift1 in dword 0 and shift2 in dword 1. The stores use movdqa,
+; so buf must be 16-byte aligned.
+
+global setdivisorV8u16: function
+setdivisorV8u16:
+ push par1 ; buf
+ mov par1d, par2d ; d
+ call setdivisor8us
+ pop rax ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [rax], xmm0 ; multiplier
+ movdqa [rax+16], xmm1 ; shift counts are still in xmm1
+ ret
+; setdivisorV8u16 end
+
+
+;extern "C" __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
+; Divides the 8 unsigned words of x by the divisor prepared in buf and
+; returns the quotients in xmm0. Per element: t = high word of (m * x),
+; result = (((x - t) >> shift1) + t) >> shift2.
+global dividefixedV8u16: function
+
+align 16
+dividefixedV8u16:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF WINDOWS
+ movdqa xmm0, [par2] ; x
+%ENDIF
+ movdqa xmm1, xmm0 ; x
+ pmulhuw xmm0, [par1] ; multiply high unsigned words
+ psubw xmm1, xmm0 ; x - t
+ movd xmm2, [par1+16] ; shift1
+ psrlw xmm1, xmm2
+ paddw xmm0, xmm1
+ movd xmm2, [par1+20] ; shift2
+ psrlw xmm0, xmm2
+ ret
+;dividefixedV8u16 end
+
+
+
+;******************************************************************************
+; 32 bit signed integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor4i(int32_t d);
+; vector of 4 x 32 bit signed integers
+; Returns xmm0 with the multiplier in the lower 2 dwords and the shift count
+; in the upper half. The shift count is also left in xmm1.
+
+align 16
+global setdivisor4i: function
+setdivisor4i:
+ push rbx
+ mov ebx, par1d ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx ; restore d; the sign flag now reflects d
+ js K120 ; Generate error if d < 0. (error for d=0 will come in the div instruction)
+ inc ecx ; L = ceil(log2(d))
+ sub ecx, 1 ; shift count = L - 1
+ adc ecx, 0 ; avoid negative shift count
+ xor eax, eax
+ mov edx, 1
+ cmp ebx, edx
+ je K110 ; avoid division overflow when d = 1
+ shl edx, cl
+ div ebx ; 2^(32+L-1)/d
+K110: inc eax
+ movd xmm0, eax ; multiplier
+ pshufd xmm0, xmm0, 0 ; broadcast into 4 dwords
+ movd xmm1, ecx ; shift count
+ punpcklqdq xmm0, xmm1 ; insert shift count into upper half
+ pop rbx
+ ret
+
+K120: ; d < 0 not supported. Generate error
+ mov edx, 1
+ div edx ; edx:eax / 1 does not fit in eax, so this overflows
+ ud2
+; setdivisor4i end
+
+
+; extern "C" void setdivisorV4i32(__m128i buf[2], int32_t d);
+; vector of 4 x 32 bit signed integers
+; Wrapper around setdivisor4i: buf[0] = multiplier in all 4 dwords,
+; buf[1] = shift count. The stores use movdqa, so buf must be 16-byte aligned.
+
+global setdivisorV4i32: function
+setdivisorV4i32:
+ push par1 ; buf
+ mov par1d, par2d ; d
+ call setdivisor4i
+ pop rax ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 2 dwords
+ movdqa [rax], xmm0 ; multiplier
+ movdqa [rax+16], xmm1 ; shift count is still in xmm1
+ ret
+; setdivisorV4i32 end
+
+
+; extern "C" int dividefixedV4i32(const __m128i buf[2], __m128i x);
+global dividefixedV4i32: function
+
+; Direct entries to CPU-specific versions
+global dividefixedV4i32SSE2: function
+global dividefixedV4i32SSE41: function
+
+align 8
+; Public entry point. It jumps through the code pointer stored at
+; dividefixedV4i32Dispatch, so the arguments reach the selected version unchanged.
+dividefixedV4i32: ; function dispatching
+ jmp near [dividefixedV4i32Dispatch] ; Go to appropriate version, depending on instruction set
+
+align 16
+; Version using pmuldq. Divides the 4 signed dwords of x by the divisor
+; prepared in buf and returns the quotients in xmm0. pmuldq multiplies only
+; dwords 0 and 2, so elements 1 and 3 are handled by a second multiplication
+; and the high halves are merged afterwards.
+dividefixedV4i32SSE41:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF WINDOWS
+ movdqa xmm0,[par2] ; x
+%ENDIF
+ movdqa xmm1, xmm0 ; x
+ movdqa xmm2, xmm0 ; x
+ movdqa xmm3, [par1] ; multiplier
+ pmuldq xmm0, xmm3 ; 32 x 32 -> 64 bit signed multiplication of x[0] and x[2]
+ psrlq xmm0, 32 ; high dword of result 0 and 2
+ psrlq xmm1, 32 ; get x[1] and x[3] into position for multiplication
+ pmuldq xmm1, xmm3 ; 32 x 32 -> 64 bit signed multiplication of x[1] and x[3]
+ pcmpeqd xmm3, xmm3 ; all bits set
+ psllq xmm3, 32 ; generate mask of dword 1 and 3
+ pand xmm1, xmm3 ; high dword of result 1 and 3
+ por xmm0, xmm1 ; combine all four results into one vector
+ paddd xmm0, xmm2 ; q = x + high dword of (m * x)
+ movd xmm3, [par1+16] ; shift count
+ psrad xmm0, xmm3 ; shift right arithmetic
+ psrad xmm2, 31 ; sign of x
+ psubd xmm0, xmm2 ; subtracting -1 adds 1 where x is negative
+ ret
+;dividefixedV4i32SSE41 end
+
+; SSE2 version of dividefixedV4i32. Same calculation as the SSE4.1 version, but
+; the four signed high multiplications are done one at a time with imul on a
+; copy of x stored on the stack. Result is returned in xmm0.
+; NOTE(review): buf is defined outside this part of the file. It is loaded from
+; par1 only in the Windows branch, so it presumably is the same register as
+; par1 in the Unix build - confirm against the definition.
+dividefixedV4i32SSE2:
+; I have tried to change sign and use pmuludq, but get rounding error (gives 9/10 = 1).
+; This solution, with 4 separate multiplications, is probably faster anyway despite store forwarding stall
+ push rbp
+ mov rbp, rsp ; remember stack pointer so it can be restored after aligning
+%IFDEF WINDOWS
+ movdqa xmm0,[par2] ; x
+ mov buf, par1
+%ENDIF
+ sub rsp, 16 ; allocate stack space
+ and rsp, -16 ; stack should be aligned already. align anyway to be safe
+ movdqa [rsp], xmm0 ; store x
+ movdqa xmm2, xmm0 ; x
+ mov ecx, [buf] ; multiplier
+ ; do four signed high multiplications
+ ; each imul leaves the high dword of eax*ecx in edx, which replaces x[i] on the stack
+ mov eax, [rsp]
+ imul ecx
+ mov [rsp], edx
+ mov eax, [rsp+4]
+ imul ecx
+ mov [rsp+4], edx
+ mov eax, [rsp+8]
+ imul ecx
+ mov [rsp+8], edx
+ mov eax, [rsp+12]
+ imul ecx
+ mov [rsp+12], edx
+ movdqa xmm0, [rsp] ; x*m vector
+ paddd xmm0, xmm2 ; add x
+ movd xmm3, [buf+16] ; shift count
+ psrad xmm0, xmm3 ; shift right arithmetic
+ psrad xmm2, 31 ; sign of x
+ psubd xmm0, xmm2 ; subtract sign of x (adds 1 if x is negative)
+ mov rsp, rbp ; release stack space
+ pop rbp
+ ret
+;dividefixedV4i32SSE2 end
+
+
+; ********************************************************************************
+; CPU dispatching for dividefixedV4i32. This is executed only once
+; ********************************************************************************
+
+; Reached through dividefixedV4i32Dispatch on the first call of dividefixedV4i32.
+; Selects the SSE2 or SSE4.1 version from the value returned by InstructionSet,
+; stores its address in dividefixedV4i32Dispatch and continues in that version
+; with par1 and par2 restored.
+dividefixedV4i32CPUDispatch:
+ ; get supported instruction set
+ push par1
+ push par2
+ call InstructionSet
+ pop par2
+ pop par1
+ ; Point to generic version
+ lea r8, [dividefixedV4i32SSE2]
+ cmp eax, 8 ; check if PMULDQ supported
+ jb Q100
+ ; SSE4.1 supported
+ ; Point to SSE4.1 version of dividefixedV4i32
+ lea r8, [dividefixedV4i32SSE41]
+Q100: mov [dividefixedV4i32Dispatch], r8
+ ; Continue in appropriate version
+ jmp r8
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+dividefixedV4i32Dispatch Dq dividefixedV4i32CPUDispatch
+
+section .text
+
+
+;******************************************************************************
+; 32 bit unsigned integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor4ui(uint32_t d);
+; vector of 4 x 32 bit unsigned integers
+
+; setdivisor4ui: compute the multiplier and the two shift counts used for
+; dividing 32 bit unsigned integers by the fixed divisor d.
+; In: par1d = d
+; Out: xmm0 = multiplier in dwords 0 and 1, shift1 in dword 2, shift2 in dword 3
+; xmm1 = shift1 in dword 0, shift2 in dword 1, zero elsewhere
+; d = 0 raises a divide error in the div instruction.
+align 16
+global setdivisor4ui: function
+setdivisor4ui:
+ push rbx
+ mov ebx, par1d ; d
+ dec ebx ; d - 1
+ mov eax, -1 ; value to use for floor(log2(d-1)) if d - 1 = 0
+ bsr ecx, ebx ; floor(log2(d-1)). ZF = 1 and ecx is architecturally undefined if ebx = 0
+ cmovz ecx, eax ; force ecx = -1 when ebx = 0 instead of relying on bsr leaving ecx unchanged
+ inc ebx ; restore d
+ inc ecx ; L = ceil(log2(d))
+ mov edx, 1
+ shl rdx, cl ; 2^L [64 bit shift to allow overflow]
+ sub edx, ebx ; 2^L - d (low 32 bits)
+ xor eax, eax
+ div ebx ; 2^32 * (2^L - d) / d
+ inc eax ; multiplier
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 ; broadcast into 4 dwords
+ sub ecx, 1 ; L - 1. Carry is set if L = 0
+ setae dl ; 1 if L >= 1, else 0
+ movzx edx, dl ; shift1
+ seta al ; 1 if L - 1 > 0 (flags are still those of the sub above)
+ neg al ; 0FFH if L - 1 > 0, else 0
+ and al,cl ; L - 1 if positive, else 0
+ movzx eax, al
+ movd xmm1, edx ; shift 1
+ movd xmm2, eax ; shift 2
+ punpckldq xmm1, xmm2 ; combine into two dwords
+ punpcklqdq xmm0, xmm1 ; multipliers, shift1, shift2
+ pop rbx
+ ret
+; setdivisor4ui end
+
+;extern "C" void setdivisorV4u32(__m128i buf[2], uint32_t d);
+; 4 x 32 bit unsigned
+
+; setdivisorV4u32: fill buf with the parameters needed by dividefixedV4u32.
+; In: par1 = buf (written with aligned 16-byte stores), par2d = d
+; Out: [buf] = multiplier in all four dwords
+; [buf+16] = shift1, [buf+20] = shift2, remaining dwords zero
+global setdivisorV4u32: function
+setdivisorV4u32:
+ push par1 ; buf, saved across the call
+ mov par1d, par2d ; d
+ call setdivisor4ui ; returns multiplier in low half of xmm0, shift counts in xmm1
+ pop rax ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 2 dwords
+ movdqa [rax], xmm0 ; multiplier
+ movdqa [rax+16], xmm1 ; shift counts are still in xmm1
+ ret
+; setdivisorV4u32 end
+
+;extern "C" __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
+global dividefixedV4u32: function
+
+; dividefixedV4u32: divide four unsigned dwords by the divisor described by buf.
+; With t = high dword of x*multiplier, each result dword is
+; (((x - t) >> shift1) + t) >> shift2
+; using logical shifts. Result is returned in xmm0.
+align 16
+dividefixedV4u32:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF WINDOWS
+ movdqa xmm0,[par2] ; x
+%ENDIF
+ movdqa xmm1, xmm0 ; x
+ movdqa xmm2, xmm0 ; x
+ movdqa xmm3, [par1] ; multiplier
+ pmuludq xmm0, xmm3 ; 32 x 32 -> 64 bit unsigned multiplication of x[0] and x[2]
+ psrlq xmm0, 32 ; high dword of result 0 and 2
+ psrlq xmm1, 32 ; get x[1] and x[3] into position for multiplication
+ pmuludq xmm1, xmm3 ; 32 x 32 -> 64 bit unsigned multiplication of x[1] and x[3]
+ pcmpeqd xmm3, xmm3 ; all bits 1
+ psllq xmm3, 32 ; generate mask of dword 1 and 3
+ pand xmm1, xmm3 ; high dword of result 1 and 3
+ por xmm0, xmm1 ; combine all four results into one vector
+ psubd xmm2, xmm0 ; x - t
+ movd xmm3, [par1+16] ; shift1
+ psrld xmm2, xmm3 ; (x - t) >> shift1
+ paddd xmm0, xmm2 ; t + ((x - t) >> shift1)
+ movd xmm3, [par1+20] ; shift2
+ psrld xmm0, xmm3
+ ret
+;dividefixedV4u32 end
diff --git a/contrib/libs/asmlib/dummy.c b/contrib/libs/asmlib/dummy.c new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/contrib/libs/asmlib/dummy.c diff --git a/contrib/libs/asmlib/instrset64.asm b/contrib/libs/asmlib/instrset64.asm new file mode 100644 index 0000000000..c8cdd34a19 --- /dev/null +++ b/contrib/libs/asmlib/instrset64.asm @@ -0,0 +1,184 @@ +%include "defs.asm" + +;************************* instrset64.asm **********************************
+; Author: Agner Fog
+; Date created: 2003-12-12
+; Last modified: 2013-09-11
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+;
+; Description:
+; This function returns an integer indicating which instruction set is
+; supported by the microprocessor and operating system. A program can
+; call this function to determine if a particular set of instructions can
+; be used.
+;
+; The method used here for detecting whether XMM instructions are enabled by
+; the operating system is different from the method recommended by Intel.
+; The method used here has the advantage that it is independent of the
+; ability of the operating system to catch invalid opcode exceptions. The
+; method used here has been thoroughly tested on many different versions of
+; Intel and AMD microprocessors, and is believed to work reliably. For further
+; discussion of this method, see my manual "Optimizing subroutines in assembly
+; language" (www.agner.org/optimize/).
+;
+; Copyright (c) 2003-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; ********** InstructionSet function **********
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+;
+; return value:
+; 0 = 80386 instruction set only
+; 1 or above = MMX instructions supported
+; 2 or above = conditional move and FCOMI supported
+; 3 or above = SSE (XMM) supported by processor and operating system
+; 4 or above = SSE2 supported
+; 5 or above = SSE3 supported
+; 6 or above = Supplementary SSE3
+; 8 or above = SSE4.1 supported
+; 9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+; 13 or above = AVX2 supported
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+; 15 or above = HLE + RTM supported (not tested by the code below, which returns at most 14)
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global InstructionSet: function
+global IInstrSet
+
+
+SECTION .data
+align 16
+
+IInstrSet@: ; local name to avoid problems in shared objects
+IInstrSet: dd -1 ; this global variable is valid after first call
+
+
+SECTION .text align=16
+
+; ********** InstructionSet function **********
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+
+; return value:
+; 4 or above = SSE2 supported
+; 5 or above = SSE3 supported
+; 6 or above = Supplementary SSE3 supported
+; 8 or above = SSE4.1 supported
+; 9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+
+
+; Returns the supported instruction set level in eax (see list above).
+; The level is computed on the first call and cached in IInstrSet@; later calls
+; return the cached value. The tests are cumulative: the first feature that is
+; missing ends the detection with the level reached so far.
+; rbx is saved and restored. ecx and edx are overwritten by cpuid on the first call.
+InstructionSet:
+ ; Check if this function has been called before
+ mov eax, [IInstrSet@]
+ test eax, eax
+ js FirstTime ; Negative means first time
+ ; Early return. Has been called before
+ ret ; Return value is in eax
+
+FirstTime:
+ push rbx ; cpuid overwrites ebx
+
+ mov eax, 1
+ cpuid ; get features into edx and ecx
+
+ mov eax, 4 ; at least SSE2 supported in 64 bit mode
+ test ecx, 1 ; SSE3 support by microprocessor
+ jz ISEND
+ inc eax ; 5
+
+ bt ecx, 9 ; Suppl-SSE3 support by microprocessor
+ jnc ISEND
+ inc eax ; 6
+
+ bt ecx, 19 ; SSE4.1 support by microprocessor
+ jnc ISEND
+ mov al, 8 ; 8 (level 7 is skipped)
+
+ bt ecx, 23 ; POPCNT support by microprocessor
+ jnc ISEND
+ inc eax ; 9
+
+ bt ecx, 20 ; SSE4.2 support by microprocessor
+ jnc ISEND
+ inc eax ; 10
+
+ ; check OS support for YMM registers (AVX)
+ bt ecx, 27 ; OSXSAVE: XGETBV supported
+ jnc ISEND
+ push rax ; save level and feature flags. XGETBV writes eax and edx
+ push rcx
+ push rdx
+ xor ecx, ecx
+ db 0FH, 01H, 0D0H ; XGETBV
+ and eax, 6
+ cmp eax, 6 ; AVX support by OS
+ pop rdx ; pop does not change the flags set by cmp
+ pop rcx
+ pop rax
+ jne ISEND
+
+ bt ecx, 28 ; AVX support by microprocessor
+ jnc ISEND
+ inc eax ; 11
+
+ bt ecx, 1 ; PCLMUL support
+ jnc ISEND
+ bt ecx, 25 ; AES support
+ jnc ISEND
+ inc eax ; 12
+
+ push rax ; save level and leaf 1 feature flags
+ push rcx
+ mov eax, 7
+ xor ecx, ecx
+ cpuid ; check for AVX2
+ bt ebx, 5
+ pop rcx ; ecx = leaf 1 flags again. ebx keeps the leaf 7 flags
+ pop rax
+ jnc ISEND
+ inc eax ; 13
+
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+ bt ecx, 12 ; FMA3
+ jnc ISEND
+ bt ecx, 29 ; F16C
+ jnc ISEND
+ bt ebx, 3 ; BMI1
+ jnc ISEND
+ bt ebx, 8 ; BMI2
+ jnc ISEND
+
+ push rax
+ push rbx
+ push rcx
+ mov eax, 80000001H
+ cpuid
+ bt ecx, 5 ; LZCNT
+ pop rcx
+ pop rbx
+ pop rax
+ jnc ISEND
+
+ inc eax ; 14
+
+ISEND: mov [IInstrSet@], eax ; save value in global variable
+
+ pop rbx
+ ret ; return value is in eax
+
+;InstructionSet ENDP
diff --git a/contrib/libs/asmlib/memcmp64.asm b/contrib/libs/asmlib/memcmp64.asm new file mode 100644 index 0000000000..b8a8ab5fbc --- /dev/null +++ b/contrib/libs/asmlib/memcmp64.asm @@ -0,0 +1,295 @@ +%include "defs.asm" + +;************************* memcmp64.asm *************************************
+; Author: Agner Fog
+; Date created: 2013-10-03
+; Last modified: 2013-10-03
+; Description:
+; Faster version of the standard memcmp function:
+;
+; int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+;
+; Compares two memory blocks of size count.
+; The return value is zero if the two memory blocks ptr1 and ptr2 are equal
+; The return value is positive if the first differing byte of ptr1 is bigger
+; than ptr2 when compared as unsigned bytes.
+; The return value is negative if the first differing byte of ptr1 is smaller
+; than ptr2 when compared as unsigned bytes.
+;
+; Overriding standard function memcmp:
+; The alias ?OVR_memcmp is changed to _memcmp in the object file if
+; it is desired to override the standard library function memcmp.
+;
+; Optimization:
+; Uses XMM registers if SSE2 is available, uses YMM registers if AVX2.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global A_memcmp: function ; Function memcmp
+global EXP(memcmp): function ; ?OVR_ removed if standard function memcmp overridden
+; Direct entries to CPU-specific versions
+global memcmpSSE2: function ; SSE2 version
+global memcmpAVX2: function ; AVX2 version
+
+; Imported from instrset64.asm
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+default rel
+
+; define registers used for parameters
+%IFDEF WINDOWS
+%define par1 rcx ; function parameter 1
+%define par2 rdx ; function parameter 2
+%define par3 r8 ; function parameter 3
+%define par4 r9 ; scratch register
+%define par4d r9d ; scratch register
+%ENDIF
+%IFDEF UNIX
+%define par1 rdi ; function parameter 1
+%define par2 rsi ; function parameter 2
+%define par3 rdx ; function parameter 3
+%define par4 rcx ; scratch register
+%define par4d ecx ; scratch register
+%ENDIF
+
+
+
+SECTION .text align=16
+
+; extern "C" int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+; Function entry:
+; Both names share this entry. It jumps through the pointer memcmpDispatch, which
+; initially holds the address of memcmpCPUDispatch and is overwritten by it with
+; the address of the selected version.
+A_memcmp:
+EXP(memcmp):
+ jmp qword [memcmpDispatch] ; Go to appropriate version, depending on instruction set
+
+
+; AVX2 version of memcmp.
+; In: par1 = ptr1, par2 = ptr2, par3 = count
+; Out: eax = 0 if equal, else (first differing byte of ptr1) - (same byte of ptr2),
+; both bytes zero-extended
+; par1 and par2 are moved to the end of the blocks and par3 is negated, so that
+; [par1+par3] and [par2+par3] address the current position and par3 counts up
+; towards zero. par4d holds the mask 0FFFFH until it is reused as a temporary.
+align 16
+memcmpAVX2: ; AVX2 version. Use ymm register
+memcmpAVX2@: ; internal reference
+
+ add par1, par3 ; use negative index from end of memory block
+ add par2, par3
+ neg par3
+ jz A900 ; count = 0
+ mov par4d, 0FFFFH
+ cmp par3, -32 ; unsigned compare of negative indexes
+ ja A100 ; less than 32 bytes
+
+A000: ; loop comparing 32 bytes
+ vmovdqu ymm1, [par1+par3]
+ vpcmpeqb ymm0, ymm1, [par2+par3] ; compare 32 bytes
+ vpmovmskb eax, ymm0 ; get byte mask
+ xor eax, -1 ; not eax would not set flags
+ jnz A700 ; difference found
+ add par3, 32
+ jz A900 ; finished, equal
+ cmp par3, -32
+ jna A000 ; next 32 bytes
+ vzeroupper ; end ymm state
+
+A100: ; less than 32 bytes left
+ cmp par3, -16
+ ja A200
+ movdqu xmm1, [par1+par3]
+ movdqu xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 16 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; invert lower 16 bits
+ jnz A701 ; difference found
+ add par3, 16
+ jz A901 ; finished, equal
+
+A200: ; less than 16 bytes left
+ cmp par3, -8
+ ja A300
+ ; compare 8 bytes
+ movq xmm1, [par1+par3]
+ movq xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 8 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; invert. The unused upper bytes are zero in both and compare equal
+ jnz A701 ; difference found
+ add par3, 8
+ jz A901
+
+A300: ; less than 8 bytes left
+ cmp par3, -4
+ ja A400
+ ; compare 4 bytes
+ movd xmm1, [par1+par3]
+ movd xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 4 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; not ax
+ jnz A701 ; difference found
+ add par3, 4
+ jz A901
+
+A400: ; less than 4 bytes left
+ cmp par3, -2
+ ja A500
+ movzx eax, word [par1+par3]
+ movzx par4d, word [par2+par3] ; the mask in par4d is no longer needed
+ sub eax, par4d
+ jnz A800 ; difference in byte 0 or 1
+ add par3, 2
+ jz A901
+
+A500: ; less than 2 bytes left
+ test par3, par3
+ jz A901 ; no bytes left
+
+A600: ; one byte left
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+A700: ; difference found. find position
+ vzeroupper
+A701:
+ bsf eax, eax ; index of first differing byte
+ add par3, rax
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+A800: ; difference in byte 0 or 1
+ neg al ; carry is set if the low bytes differ
+ sbb par3, -1 ; add 1 to par3 if al == 0
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+A900: ; equal
+ vzeroupper
+A901: xor eax, eax
+ ret
+
+
+; SSE2 version of memcmp.
+; In: par1 = ptr1, par2 = ptr2, par3 = count
+; Out: eax = 0 if equal, else (first differing byte of ptr1) - (same byte of ptr2),
+; both bytes zero-extended
+; par1 and par2 are moved to the end of the blocks and par3 is negated, so that
+; [par1+par3] and [par2+par3] address the current position and par3 counts up
+; towards zero. par4d holds the mask 0FFFFH until it is reused as a temporary.
+memcmpSSE2: ; SSE2 version. Use xmm register
+memcmpSSE2@: ; internal reference
+
+ add par1, par3 ; use negative index from end of memory block
+ add par2, par3
+ neg par3
+ jz S900 ; count = 0
+ mov par4d, 0FFFFH
+ cmp par3, -16 ; unsigned compare of negative indexes
+ ja S200 ; less than 16 bytes
+
+S100: ; loop comparing 16 bytes
+ movdqu xmm1, [par1+par3]
+ movdqu xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 16 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; not ax
+ jnz S700 ; difference found
+ add par3, 16
+ jz S900 ; finished, equal
+ cmp par3, -16
+ jna S100 ; next 16 bytes
+
+S200: ; less than 16 bytes left
+ cmp par3, -8
+ ja S300
+ ; compare 8 bytes
+ movq xmm1, [par1+par3]
+ movq xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 8 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; not ax
+ jnz S700 ; difference found
+ add par3, 8
+ jz S900
+
+S300: ; less than 8 bytes left
+ cmp par3, -4
+ ja S400
+ ; compare 4 bytes
+ movd xmm1, [par1+par3]
+ movd xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 4 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; not ax
+ jnz S700 ; difference found
+ add par3, 4
+ jz S900
+
+S400: ; less than 4 bytes left
+ cmp par3, -2
+ ja S500
+ movzx eax, word [par1+par3]
+ movzx par4d, word [par2+par3] ; the mask in par4d is no longer needed
+ sub eax, par4d
+ jnz S800 ; difference in byte 0 or 1
+ add par3, 2
+ jz S900
+
+S500: ; less than 2 bytes left
+ test par3, par3
+ jz S900 ; no bytes left
+
+ ; one byte left
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+S700: ; difference found. find position
+ bsf eax, eax ; index of first differing byte
+ add par3, rax
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+S800: ; difference in byte 0 or 1
+ neg al ; carry is set if the low bytes differ
+ sbb par3, -1 ; add 1 to par3 if al == 0
+S820: movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+S900: ; equal
+ xor eax, eax
+ ret
+
+
+; CPU dispatching for memcmp. This is executed only once
+; The three parameter registers are saved around the call to InstructionSet.
+; par4 is loaded after the call, so it does not need to be saved.
+memcmpCPUDispatch:
+ push par1
+ push par2
+ push par3
+ call InstructionSet ; get supported instruction set
+ ; SSE2 always supported
+ lea par4, [memcmpSSE2@]
+ cmp eax, 13 ; check AVX2
+ jb Q100
+ ; AVX2 supported
+ lea par4, [memcmpAVX2@]
+Q100: ; save pointer
+ mov qword [memcmpDispatch], par4
+; Continue in appropriate version of memcmp
+ pop par3
+ pop par2
+ pop par1
+ jmp par4
+
+
+SECTION .data
+align 16
+
+
+; Pointer to appropriate version.
+; This initially points to memcmpCPUDispatch. memcmpCPUDispatch will
+; change this to the appropriate version of memcmp, so that
+; memcmpCPUDispatch is only executed once:
+memcmpDispatch DQ memcmpCPUDispatch
+
diff --git a/contrib/libs/asmlib/memcpy64.asm b/contrib/libs/asmlib/memcpy64.asm new file mode 100644 index 0000000000..d590990b99 --- /dev/null +++ b/contrib/libs/asmlib/memcpy64.asm @@ -0,0 +1,1332 @@ +%include "defs.asm"
+
+;************************* memcpy64.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2016-11-12 (patched version with AVX512 support removed)
+;
+; Description:
+; Faster version of the standard memcpy function:
+; void * A_memcpy(void *dest, const void *src, size_t count);
+; Copies 'count' bytes from 'src' to 'dest'
+;
+; Overriding standard function memcpy:
+; The alias ?OVR_memcpy is changed to _memcpy in the object file if
+; it is desired to override the standard library function memcpy.
+;
+; The function uses non-temporal writes to bypass the cache when the size is
+; bigger than half the size of the largest_level cache. This limit can be
+; read with GetMemcpyCacheLimit and changed with SetMemcpyCacheLimit
+; C++ prototypes:
+; extern "C" size_t GetMemcpyCacheLimit(); // in memcpy64.asm
+; extern "C" void SetMemcpyCacheLimit(); // in memmove64.asm
+; extern "C" void SetMemcpyCacheLimit1(); // used internally
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included SSE2, Suppl-SSE3 and AVX instruction sets.
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_memcpy: function ; Function A_memcpy
+global EXP(memcpy): function ; ?OVR removed if standard function memcpy overridden
+global memcpySSE2: function ; Version for processors with only SSE2
+global memcpySSSE3: function ; Version for processors with SSSE3
+global memcpyU: function ; Version for processors with fast unaligned read
+global memcpyU256: function ; Version for processors with fast 256-bit read/write
+
+global GetMemcpyCacheLimit: function ; Get the size limit for bypassing cache when copying with memcpy and memmove
+global SetMemcpyCacheLimit1: function ; Set the size limit for bypassing cache when copying with memcpy
+
+
+; Imported from instrset64.asm
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster64.asm:
+extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
+extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Imported from cachesize64.asm:
+extern DataCacheSize ; Gets size of data cache
+
+
+; Define prolog for this function
+; After PROLOGM, in both builds: rdi = dest, rsi = src, rcx = count, r9 = dest
+; (r9 is the value returned by RETURNM). The Windows build saves rsi and rdi on
+; the stack first; RETURNM restores them.
+%MACRO PROLOGM 0
+%IFDEF WINDOWS
+ push rsi
+ push rdi
+ mov rdi, rcx ; dest
+ mov r9, rcx ; dest
+ mov rsi, rdx ; src
+ mov rcx, r8 ; count
+%ELSE ; Unix
+ mov rcx, rdx ; count
+ mov r9, rdi ; dest
+%ENDIF
+%ENDM
+
+; Define return from this function
+; Restores the registers pushed by PROLOGM (Windows build only) and returns
+; the dest pointer that PROLOGM saved in r9.
+%MACRO RETURNM 0
+%IFDEF WINDOWS
+ pop rdi
+ pop rsi
+%ENDIF
+ mov rax, r9 ; Return value = dest
+ ret
+%ENDM
+
+
+SECTION .text align=16
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
+; Function entry:
+; Both names share this entry, which jumps through the pointer memcpyDispatch.
+A_memcpy:
+EXP(memcpy):
+ jmp qword [memcpyDispatch] ; Go to appropriate version, depending on instruction set
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; memcpyU256: copy with unaligned 32-byte reads and aligned 32-byte writes.
+; After PROLOGM: rdi = dest, rsi = src, rcx = count, r9 = dest (return value).
+; Counts below 64 are handled at A1000. Otherwise dest is first aligned by 32,
+; then the main loop copies 32 bytes per iteration, and the remaining 0 - 31
+; bytes are copied at H3120.
+align 16
+memcpyU256: ; global label
+memcpyU256@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 1FH
+ jz B3100 ; Skip if dest aligned by 32
+
+ ; edx = size of first partial block, 1 - 31 bytes
+ ; one move for each bit that is set in edx, smallest first
+ test dl, 3
+ jz B3030
+ test dl, 1
+ jz B3020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B3020: test dl, 2
+ jz B3030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B3030: test dl, 4
+ jz B3040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B3040: test dl, 8
+ jz B3050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B3050: test dl, 16
+ jz B3060
+ ; move 16 bytes
+ movups xmm0, [rsi]
+ movaps [rdi], xmm0
+ add rsi, 16
+ add rdi, 16
+B3060: sub rcx, rdx ; count remaining after the partial block
+
+B3100: ; Now dest is aligned by 32. Any partial block has been moved
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov rdx, rcx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub rdx, rcx ; Remaining data after loop
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja I3100 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+H3100: ; copy -rcx bytes in blocks of 32 bytes.
+
+ ; Check for false memory dependence: The CPU may falsely assume
+ ; a partial overlap between the written destination and the following
+ ; read source if source is unaligned and
+ ; (src-dest) modulo 4096 is close to 4096
+ test sil, 1FH
+ jz H3110 ; aligned
+ mov eax, esi
+ sub eax, edi
+ and eax, 0FFFH ; modulo 4096
+ cmp eax, 1000H - 200H
+ ja J3100
+
+align 16
+H3110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ vmovups ymm0, [rsi+rcx]
+ vmovaps [rdi+rcx], ymm0
+ add rcx, 20H
+ jnz H3110
+ sfence
+ vzeroupper ; end of AVX mode
+
+; Tail of memcpyU256, also reached from I3100 and J3110.
+; In: rsi, rdi = src, dest at the end of the part already copied,
+; rdx = number of remaining bytes (0 - 31), r9 = return value.
+; rsi and rdi are moved to the very end and rdx is negated, so that
+; [rsi+rdx] and [rdi+rdx] address the next bytes and rdx counts up to zero.
+H3120: ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz H3500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg H3200
+ ; move 16 bytes
+ movups xmm0, [rsi+rdx]
+ movaps [rdi+rdx], xmm0
+ add rdx, 10H
+H3200: cmp edx, -8
+ jg H3210
+ ; move 8 bytes
+ movq xmm0, qword [rsi+rdx]
+ movq qword [rdi+rdx], xmm0
+ add rdx, 8
+ jz H3500 ; Early skip if count divisible by 8. Uses this routine's own exit label
+H3210: cmp edx, -4
+ jg H3220
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+H3220: cmp edx, -2
+ jg H3230
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+H3230: cmp edx, -1
+ jg H3500
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+H3500: ; finished
+ RETURNM
+
+; Variant of the main loop of memcpyU256 for counts above CacheBypassLimit.
+; Same register setup as H3110, but the destination is written with
+; non-temporal stores.
+I3100: ; non-temporal move
+ neg rcx ; Negative index from the end
+
+align 16
+I3110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ vmovups ymm0, [rsi+rcx]
+ vmovntps [rdi+rcx], ymm0
+ add rcx, 20H
+ jnz I3110
+ sfence
+ vzeroupper ; end of AVX mode
+ jmp H3120 ; Move the remaining edx bytes (0 - 31)
+
+
+; Reached from H3100 when a false memory dependence is expected.
+; In: rsi, rdi = end of the src, dest parts covered by the main loop,
+; rcx = negative size of that part, rdx = remaining bytes after the loop.
+; Copies backwards if src and dest do not overlap, otherwise goes back to the
+; forward loop H3110. Continues at H3120 after a backward copy.
+align 16
+J3100: ; There is a false memory dependence.
+ ; check if src and dest overlap, if not then it is safe
+ ; to copy backwards to avoid false memory dependence
+%if 1
+ ; Use this version if you want consistent behavior in the case
+ ; where dest > src and overlap. However, this case is undefined
+ ; anyway because part of src is overwritten before copying
+ push rdx ; cqo overwrites rdx
+ mov rax, rsi
+ sub rax, rdi
+ cqo ; rdx = sign of src-dest
+ xor rax, rdx
+ sub rax, rdx ; abs(src-dest)
+ neg rcx ; size
+ pop rdx ; restore rdx
+ cmp rax, rcx
+ jnb J3110 ; distance >= size: no overlap
+ neg rcx ; restore rcx
+ jmp H3110 ; overlap between src and dest. Can't copy backwards
+%else
+ ; save time by not checking the case that is undefined anyway
+ mov rax, rsi
+ sub rax, rdi
+ neg rcx ; size
+ cmp rax, rcx
+ jnb J3110 ; OK to copy backwards
+ ; must copy forwards
+ neg rcx ; restore ecx
+ jmp H3110 ; copy forwards
+
+%endif
+
+J3110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
+ push rsi ; keep the end pointers for H3120
+ push rdi
+ sub rsi, rcx ; start of src
+ sub rdi, rcx ; start of dest
+J3120: ; loop backwards
+ vmovups ymm0, [rsi+rcx-20H]
+ vmovaps [rdi+rcx-20H], ymm0
+ sub rcx, 20H
+ jnz J3120
+ sfence
+ vzeroupper
+ pop rdi
+ pop rsi
+ jmp H3120
+
+; Common code for small counts. memcpyU256, memcpyU and memcpySSSE3 jump here
+; after PROLOGM when count < 64.
+; In: rsi = src, rdi = dest, rcx = count, r9 = return value.
+; rsi and rdi are moved to the end and rcx is negated, so that [rsi+rcx] and
+; [rdi+rcx] address the next bytes and rcx counts up to zero.
+align 16
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ ; multiple CPU versions (SSSE3 and above)
+A1000: add rsi, rcx ; end of src
+ add rdi, rcx ; end of dest
+ neg rcx ; negative index from the end
+ cmp ecx, -20H
+ jg A1100
+ ; move 32 bytes
+ ; movdqu is faster than 64-bit moves on processors with SSSE3
+ movups xmm0, [rsi+rcx]
+ movups xmm1, [rsi+rcx+10H]
+ movups [rdi+rcx], xmm0
+ movups [rdi+rcx+10H], xmm1
+ add rcx, 20H
+A1100: cmp ecx, -10H
+ jg A1200
+ ; move 16 bytes
+ movups xmm0, [rsi+rcx]
+ movups [rdi+rcx], xmm0
+ add rcx, 10H
+A1200: cmp ecx, -8
+ jg A1300
+ ; move 8 bytes
+ mov rax, qword [rsi+rcx]
+ mov qword [rdi+rcx], rax
+ add rcx, 8
+A1300: cmp ecx, -4
+ jg A1400
+ ; move 4 bytes
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ add rcx, 4
+ jz A1900 ; early out if count divisible by 4
+A1400: cmp ecx, -2
+ jg A1500
+ ; move 2 bytes
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+ add rcx, 2
+A1500: cmp ecx, -1
+ jg A1900
+ ; move 1 byte
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+A1900: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with fast unaligned read and fast 16 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; memcpyU: copy with unaligned 16-byte reads and aligned 16-byte writes.
+; After PROLOGM: rdi = dest, rsi = src, rcx = count, r9 = dest (return value).
+; Counts below 64 are handled at A1000. Otherwise dest is first aligned by 16,
+; then the main loop copies 32 bytes per iteration, and the remaining 0 - 31
+; bytes are copied at H120.
+align 16
+memcpyU: ; global label
+memcpyU@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B2100 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ ; one move for each bit that is set in edx, smallest first
+ test dl, 3
+ jz B2030
+ test dl, 1
+ jz B2020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B2020: test dl, 2
+ jz B2030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B2030: test dl, 4
+ jz B2040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B2040: test dl, 8
+ jz B2050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B2050: sub rcx, rdx ; count remaining after the partial block
+B2100: ; Now dest is aligned by 16. Any partial block has been moved
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov rdx, rcx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub rdx, rcx ; Remaining data after loop
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja I100 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+H100: ; copy -rcx bytes in blocks of 32 bytes.
+
+ ; Check for false memory dependence: The CPU may falsely assume
+ ; a partial overlap between the written destination and the following
+ ; read source if source is unaligned and
+ ; (src-dest) modulo 4096 is close to 4096
+ test sil, 0FH
+ jz H110 ; aligned
+ mov eax, esi
+ sub eax, edi
+ and eax, 0FFFH ; modulo 4096
+ cmp eax, 1000H - 200H
+ ja J100
+
+H110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ movups xmm0, [rsi+rcx]
+ movups xmm1, [rsi+rcx+10H]
+ movaps [rdi+rcx], xmm0
+ movaps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz H110
+
+; Tail of memcpyU, also reached from I110 and J120.
+; In: rsi, rdi = src, dest at the end of the part already copied,
+; rdx = number of remaining bytes (0 - 31), r9 = return value.
+; rsi and rdi are moved to the very end and rdx is negated, so that
+; [rsi+rdx] and [rdi+rdx] address the next bytes and rdx counts up to zero.
+; Note: the label H500 is also a jump target from the tail of memcpyU256.
+H120: ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz H500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg H200
+ ; move 16 bytes
+ movups xmm0, [rsi+rdx]
+ movaps [rdi+rdx], xmm0
+ add rdx, 10H
+H200: cmp edx, -8
+ jg H210
+ ; move 8 bytes
+ movq xmm0, qword [rsi+rdx]
+ movq qword [rdi+rdx], xmm0
+ add rdx, 8
+ jz H500 ; Early skip if count divisible by 8
+H210: cmp edx, -4
+ jg H220
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+H220: cmp edx, -2
+ jg H230
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+H230: cmp edx, -1
+ jg H500
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+H500: ; finished
+ RETURNM
+
+; Variant of the main loop of memcpyU for counts above CacheBypassLimit.
+; Same register setup as H110, but the destination is written with
+; non-temporal stores.
+I100: ; non-temporal move
+ neg rcx ; Negative index from the end
+
+align 16
+I110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ movups xmm0, [rsi+rcx]
+ movups xmm1, [rsi+rcx+10H]
+ movntps [rdi+rcx], xmm0
+ movntps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz I110
+ sfence
+ jmp H120 ; Move the remaining edx bytes (0 - 31):
+
+
+; Reached from H100 when a false memory dependence is expected.
+; In: rsi, rdi = end of the src, dest parts covered by the main loop,
+; rcx = negative size of that part, rdx = remaining bytes after the loop.
+; Copies backwards if src and dest do not overlap, otherwise goes back to the
+; forward loop H110. Continues at H120 after a backward copy.
+align 16
+J100: ; There is a false memory dependence.
+ ; check if src and dest overlap, if not then it is safe
+ ; to copy backwards to avoid false memory dependence
+%if 1
+ ; Use this version if you want consistent behavior in the case
+ ; where dest > src and overlap. However, this case is undefined
+ ; anyway because part of src is overwritten before copying
+ push rdx ; cqo overwrites rdx
+ mov rax, rsi
+ sub rax, rdi
+ cqo ; rdx = sign of src-dest
+ xor rax, rdx
+ sub rax, rdx ; abs(src-dest)
+ neg rcx ; size
+ pop rdx ; restore rdx
+ cmp rax, rcx
+ jnb J110 ; distance >= size: no overlap
+ neg rcx ; restore rcx
+ jmp H110 ; overlap between src and dest. Can't copy backwards
+%else
+ ; save time by not checking the case that is undefined anyway
+ mov rax, rsi
+ sub rax, rdi
+ neg rcx ; size
+ cmp rax, rcx
+ jnb J110 ; OK to copy backwards
+ ; must copy forwards
+ neg rcx ; restore ecx
+ jmp H110 ; copy forwards
+
+%endif
+
+J110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
+ push rsi ; keep the end pointers for H120
+ push rdi
+ sub rsi, rcx ; start of src
+ sub rdi, rcx ; start of dest
+J120: ; loop backwards
+ movups xmm1, [rsi+rcx-20H]
+ movups xmm0, [rsi+rcx-10H]
+ movaps [rdi+rcx-20H], xmm1
+ movaps [rdi+rcx-10H], xmm0
+ sub rcx, 20H
+ jnz J120
+ pop rdi
+ pop rsi
+ jmp H120
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+; memcpySSSE3: candidate installed by memcpyCPUDispatch when the SSSE3 feature
+; bit is set and no faster unaligned variant is chosen.
+; Strategy: align dest to 16 bytes, then jump through a 16-entry table indexed
+; by (src mod 16) to a loop specialised for that source misalignment.
+; The code below uses rdi = dest, rsi = src, rcx = count.
+; NOTE(review): PROLOGM and label A1000 are defined outside this chunk; confirm
+; that PROLOGM establishes the register assignment above.
+memcpySSSE3: ; global label
+memcpySSSE3@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH ; edx = (-dest) mod 16 = bytes needed to reach a 16-byte boundary
+ jz B1200 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ ; Each set bit of edx is handled by one move of that size (1, 2, 4, 8)
+ test dl, 3
+ jz B1030 ; bits 0 and 1 both clear: no 1- or 2-byte move needed
+ test dl, 1
+ jz B1020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B1020: test dl, 2
+ jz B1030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B1030: test dl, 4
+ jz B1040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B1040: test dl, 8
+ jz B1050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B1050: sub rcx, rdx ; count -= bytes moved to align dest
+B1200: ; Now dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of src modulo 16 at this point:
+ mov eax, esi
+ and eax, 0FH ; eax = src mod 16, used as the jump table index
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count (lower 32 bits)
+ and rcx, -20H ; Round down count to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub edx, ecx ; Remaining data after loop (0-31)
+ sub rsi, rax ; Nearest preceding aligned block of src
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B1400 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchSSSE3]
+ jmp near [r8+rax*8] ; table entries are 8-byte code pointers
+
+B1400: neg rcx ; Negative index from the end
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchNT]
+ jmp near [r8+rax*8] ; non-temporal variants, same indexing
+
+
+align 16
+; C100 is entry 0 of both AlignmentDispatchSSE2 and AlignmentDispatchSSSE3.
+; On entry: rsi/rdi point to the end of the 32-byte-multiple part of src/dest,
+; rcx = -(that byte count), edx = remaining bytes (0 - 31).
+; C200 is also the common tail that the unaligned-copy macros and F100 jump to;
+; there rsi/rdi point to the very end of the data and rdx = -(bytes left).
+C100: ; Code for aligned src. SSE2 and SSSE3 versions
+ ; The nice case, src and dest have same alignment.
+
+ ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm0, [rsi+rcx]
+ movaps xmm1, [rsi+rcx+10H]
+ movaps [rdi+rcx], xmm0
+ movaps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz C100
+
+ ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx ; rdx = negative index from the end; ZF set if nothing remains
+ jz C500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg C200 ; fewer than 16 bytes left
+ ; move 16 bytes
+ movaps xmm0, [rsi+rdx]
+ movaps [rdi+rdx], xmm0
+ add rdx, 10H
+C200: cmp edx, -8
+ jg C210 ; fewer than 8 bytes left
+ ; move 8 bytes
+ mov rax, [rsi+rdx]
+ mov [rdi+rdx], rax
+ add rdx, 8
+ jz C500 ; Early skip if count divisible by 8
+C210: cmp edx, -4
+ jg C220 ; fewer than 4 bytes left
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+C220: cmp edx, -2
+ jg C230 ; fewer than 2 bytes left
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+C230: cmp edx, -1
+ jg C500 ; nothing left
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+C500: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; memcpySSE2: default version installed by memcpyCPUDispatch when the SSSE3
+; feature bit is not set.
+; Counts below 64 are copied with plain 8/4/2/1-byte integer moves, using a
+; negative index that counts up towards the end of both buffers.
+; Larger counts first align dest to 16 bytes and then jump through
+; AlignmentDispatchSSE2 (or AlignmentDispatchNT above CacheBypassLimit),
+; indexed by src mod 16.
+; The code below uses rdi = dest, rsi = src, rcx = count.
+; NOTE(review): PROLOGM and RETURNM are defined outside this chunk; confirm
+; that PROLOGM establishes the register assignment above.
+memcpySSE2: ; global label
+memcpySSE2@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jae B0100 ; count >= 64: use the block-copy code further down
+
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ add rsi, rcx ; end of src
+ add rdi, rcx ; end of dest
+ neg rcx ; negative index from the end
+ cmp ecx, -20H
+ jg A100 ; fewer than 32 bytes left
+ ; move 32 bytes
+ ; mov r64 is faster than movdqu on Intel Pentium M and Core 1
+ ; movdqu is fast on Nehalem and later
+ mov rax, [rsi+rcx]
+ mov rdx, [rsi+rcx+8]
+ mov [rdi+rcx], rax
+ mov [rdi+rcx+8], rdx
+ mov rax, qword [rsi+rcx+10H]
+ mov rdx, qword [rsi+rcx+18H]
+ mov qword [rdi+rcx+10H], rax
+ mov qword [rdi+rcx+18H], rdx
+ add rcx, 20H
+A100: cmp ecx, -10H
+ jg A200 ; fewer than 16 bytes left
+ ; move 16 bytes
+ mov rax, [rsi+rcx]
+ mov rdx, [rsi+rcx+8]
+ mov [rdi+rcx], rax
+ mov [rdi+rcx+8], rdx
+ add rcx, 10H
+A200: cmp ecx, -8
+ jg A300 ; fewer than 8 bytes left
+ ; move 8 bytes
+ mov rax, qword [rsi+rcx]
+ mov qword [rdi+rcx], rax
+ add rcx, 8
+A300: cmp ecx, -4
+ jg A400 ; fewer than 4 bytes left
+ ; move 4 bytes
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ add rcx, 4
+ jz A900 ; early out if count divisible by 4
+A400: cmp ecx, -2
+ jg A500 ; fewer than 2 bytes left
+ ; move 2 bytes
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+ add rcx, 2
+A500: cmp ecx, -1
+ jg A900 ; nothing left
+ ; move 1 byte
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+A900: ; finished
+ RETURNM
+
+B0100: ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH ; edx = (-dest) mod 16 = bytes needed to reach a 16-byte boundary
+ jz B0200 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ ; Each set bit of edx is handled by one move of that size (1, 2, 4, 8)
+ test dl, 3
+ jz B0030 ; bits 0 and 1 both clear: no 1- or 2-byte move needed
+ test dl, 1
+ jz B0020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B0020: test dl, 2
+ jz B0030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B0030: test dl, 4
+ jz B0040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B0040: test dl, 8
+ jz B0050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B0050: sub rcx, rdx ; count -= bytes moved to align dest
+B0200: ; Now dest is aligned by 16. Any partial block has been moved
+ ; No second alignment pass is needed here: rdi was advanced by exactly
+ ; (-dest) mod 16 bytes above, so (-rdi) mod 16 is always 0 at this point.
+B300: ; Label kept so that any reference to B300 still resolves
+ ; Find alignment of src modulo 16 at this point:
+ mov eax, esi
+ and eax, 0FH ; eax = src mod 16, used as the jump table index
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count (lower 32 bits)
+ and rcx, -20H ; Round down count to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub edx, ecx ; Remaining data after loop (0-31)
+ sub rsi, rax ; Nearest preceding aligned block of src
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B400 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchSSE2]
+ jmp near [r8+rax*8] ; table entries are 8-byte code pointers
+
+B400: neg rcx ; Negative index from the end
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchNT]
+ jmp near [r8+rax*8] ; non-temporal variants, same indexing
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
+
+%MACRO MOVE_UNALIGNED_SSE2 2 ; u, nt
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
+; Method: every output block is built from two consecutive aligned input
+; blocks: the earlier one shifted right by %1 bytes OR-ed with the later one
+; shifted left by 16-%1 bytes. xmm0 carries the last block read into the next
+; iteration so each source block is loaded only once.
+; The macro ends by jumping to C200, which moves the last 0 - 15 bytes.
+ movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movdqa xmm2, [rsi+rcx+20H]
+ movdqa xmm3, xmm1 ; Copy because used twice
+ psrldq xmm0, %1 ; shift right
+ pslldq xmm1, 16-%1 ; shift left
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx], xmm0 ; non-temporal save
+ %ENDIF
+ movdqa xmm0, xmm2 ; Save for next iteration
+ psrldq xmm3, %1 ; shift right
+ pslldq xmm2, 16-%1 ; shift left
+ por xmm3, xmm2 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx+10H], xmm3 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx+10H], xmm3 ; non-temporal save
+ %ENDIF
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx ; rdx = negative index from the end
+ cmp edx, -10H
+ jg %%L2 ; fewer than 16 bytes left
+ ; One more 16-bytes block to move
+ movdqa xmm1, [rsi+rdx+10H]
+ psrldq xmm0, %1 ; shift right
+ pslldq xmm1, 16-%1 ; shift left
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rdx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rdx], xmm0 ; non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %2 == 1
+ sfence ; fence after the non-temporal stores above
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_UNALIGNED_SSE2_4 1 ; nt
+; Special case for u = 4
+; %1 = 1 if non-temporal store desired
+; Register contract on entry is the same as for MOVE_UNALIGNED_SSE2 with u = 4.
+; Method: movss replaces the low dword of the earlier block with the low dword
+; of the later block; shufps 00111001B then rotates the four dwords down by
+; one position, giving bytes 4-15 of the earlier block followed by bytes 0-3
+; of the later block.
+ movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movss xmm0, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
+ shufps xmm0, xmm0, 00111001B ; Rotate
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx+20H] ; Second block; also carried into next iteration
+ movss xmm1, xmm0
+ shufps xmm1, xmm1, 00111001B
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
+ %ENDIF
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx ; rdx = negative index from the end
+ cmp edx, -10H
+ jg %%L2 ; fewer than 16 bytes left
+ ; One more 16-bytes block to move
+ movaps xmm1, [rsi+rdx+10H] ; Read next block aligned
+ movss xmm0, xmm1
+ shufps xmm0, xmm0, 00111001B
+ %IF %1 == 0
+ movaps [rdi+rdx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rdx], xmm0 ; Non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence ; fence after the non-temporal stores above
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_UNALIGNED_SSE2_8 1 ; nt
+; Special case for u = 8
+; %1 = 1 if non-temporal store desired
+; Register contract on entry is the same as for MOVE_UNALIGNED_SSE2 with u = 8.
+; Method: movsd replaces the low 8 bytes of the earlier block with the low
+; 8 bytes of the later block; shufps 01001110B then swaps the two 8-byte
+; halves, giving bytes 8-15 of the earlier block followed by bytes 0-7 of the
+; later block.
+ movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movsd xmm0, xmm1 ; Moves 8 bytes, leaves remaining bytes unchanged
+ shufps xmm0, xmm0, 01001110B ; Rotate
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx+20H] ; Second block; also carried into next iteration
+ movsd xmm1, xmm0
+ shufps xmm1, xmm1, 01001110B
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
+ %ENDIF
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx ; rdx = negative index from the end
+ cmp edx, -10H
+ jg %%L2 ; fewer than 16 bytes left
+ ; One more 16-bytes block to move
+ movaps xmm1, [rsi+rdx+10H] ; Read next block aligned
+ movsd xmm0, xmm1
+ shufps xmm0, xmm0, 01001110B
+ %IF %1 == 0
+ movaps [rdi+rdx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rdx], xmm0 ; Non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence ; fence after the non-temporal stores above
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_UNALIGNED_SSE2_12 1 ; nt
+; Special case for u = 12
+; %1 = 1 if non-temporal store desired
+; Register contract on entry is the same as for MOVE_UNALIGNED_SSE2 with u = 12.
+; Method: shufps 10010011B rotates the four dwords of every block up by one
+; position, so the top dword lands in the low position. movss then replaces
+; that low dword with the (already rotated) low dword of the preceding block,
+; giving bytes 12-15 of the earlier block followed by bytes 0-11 of the later.
+ movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+ shufps xmm0, xmm0, 10010011B
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movaps xmm2, [rsi+rcx+20H]
+ shufps xmm1, xmm1, 10010011B
+ shufps xmm2, xmm2, 10010011B
+ movaps xmm3, xmm2 ; Keep rotated copy of the last block for the next iteration
+ movss xmm2, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
+ movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ movaps [rdi+rcx+10H], xmm2 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ movntps [rdi+rcx+10H], xmm2 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, xmm3 ; Save for next iteration
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx ; rdx = negative index from the end
+ cmp edx, -10H
+ jg %%L2 ; fewer than 16 bytes left
+ ; One more 16-bytes block to move
+ movaps xmm1, [rsi+rdx+10H] ; Read next block aligned
+ shufps xmm1, xmm1, 10010011B
+ movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
+ %IF %1 == 0
+ movaps [rdi+rdx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rdx], xmm1 ; Non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence ; fence after the non-temporal stores above
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO MOVE_UNALIGNED_SSSE3 1 ; u
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
+; Method: palignr concatenates two consecutive aligned source blocks and
+; extracts the 16 bytes starting %1 bytes into the earlier one. xmm0 carries
+; the last block read into the next iteration.
+; This macro has no non-temporal variant; it always stores with movdqa.
+; The macro ends by jumping to C200, which moves the last 0 - 15 bytes.
+ movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movdqa xmm2, [rsi+rcx+10H] ; Read next two blocks
+ movdqa xmm3, [rsi+rcx+20H]
+ movdqa xmm1, xmm0 ; Save xmm0
+ movdqa xmm0, xmm3 ; Save for next iteration
+ palignr xmm3, xmm2, %1 ; Combine parts into aligned block
+ palignr xmm2, xmm1, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx], xmm2 ; Save aligned
+ movdqa [rdi+rcx+10H], xmm3 ; Save aligned
+ add rcx, 20H
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx ; rdx = negative index from the end
+ cmp edx, -10H
+ jg %%L2 ; fewer than 16 bytes left
+ ; One more 16-bytes block to move
+ movdqa xmm2, [rsi+rdx+10H]
+ palignr xmm2, xmm0, %1
+ movdqa [rdi+rdx], xmm2
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes
+ jmp C200
+%ENDMACRO
+
+
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSE2 below
+; (alignments and fillers are inserted manually to minimize the number
+; of 16-bytes boundaries inside loops)
+
+; SSE2 copy loops for source misalignment u = 1..15, regular (cached) stores.
+; Label D10x holds the code for u = x (hex); the labels are the targets listed
+; in AlignmentDispatchSSE2. u = 4, 8 and 12 use the dedicated macros.
+; The "times N nop" lines are manual padding; do not remove them.
+align 16
+D104: MOVE_UNALIGNED_SSE2_4 0
+times 4 nop
+D108: MOVE_UNALIGNED_SSE2_8 0
+times 4 nop
+D10C: MOVE_UNALIGNED_SSE2_12 0
+times 1 nop
+D101: MOVE_UNALIGNED_SSE2 1, 0
+D102: MOVE_UNALIGNED_SSE2 2, 0
+D103: MOVE_UNALIGNED_SSE2 3, 0
+D105: MOVE_UNALIGNED_SSE2 5, 0
+D106: MOVE_UNALIGNED_SSE2 6, 0
+D107: MOVE_UNALIGNED_SSE2 7, 0
+D109: MOVE_UNALIGNED_SSE2 9, 0
+times 1 nop
+D10A: MOVE_UNALIGNED_SSE2 0AH, 0
+D10B: MOVE_UNALIGNED_SSE2 0BH, 0
+D10D: MOVE_UNALIGNED_SSE2 0DH, 0
+D10E: MOVE_UNALIGNED_SSE2 0EH, 0
+D10F: MOVE_UNALIGNED_SSE2 0FH, 0
+
+; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSSE3 below.
+; Label E10x holds the code for u = x (hex).
+
+align 16
+E104: MOVE_UNALIGNED_SSSE3 4
+E108: MOVE_UNALIGNED_SSSE3 8
+E10C: MOVE_UNALIGNED_SSSE3 0CH
+E101: MOVE_UNALIGNED_SSSE3 1
+E102: MOVE_UNALIGNED_SSSE3 2
+E103: MOVE_UNALIGNED_SSSE3 3
+E105: MOVE_UNALIGNED_SSSE3 5
+E106: MOVE_UNALIGNED_SSSE3 6
+E107: MOVE_UNALIGNED_SSSE3 7
+E109: MOVE_UNALIGNED_SSSE3 9
+times 1 nop
+E10A: MOVE_UNALIGNED_SSSE3 0AH
+E10B: MOVE_UNALIGNED_SSSE3 0BH
+E10D: MOVE_UNALIGNED_SSSE3 0DH
+E10E: MOVE_UNALIGNED_SSSE3 0EH
+E10F: MOVE_UNALIGNED_SSSE3 0FH
+
+; Codes for non-temporal move. Aligned case first
+
+align 16
+; F100 is entry 0 of AlignmentDispatchNT: non-temporal copy for the case where
+; src and dest have the same alignment modulo 16.
+; On entry: rsi/rdi point to the end of the 32-byte-multiple part of src/dest,
+; rcx = -(that byte count), edx = remaining bytes (0 - 31).
+; Every exit path executes sfence after the non-temporal stores, matching the
+; unaligned non-temporal variants F101..F10F.
+F100: ; Non-temporal move, src and dest have same alignment.
+ ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm0, [rsi+rcx] ; Read
+ movaps xmm1, [rsi+rcx+10H]
+ movntps [rdi+rcx], xmm0 ; Write non-temporal (bypass cache)
+ movntps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz F100 ; Loop through negative rcx up to zero
+
+ ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx ; rdx = negative index from the end
+ ; Check if we can move one more 16-bytes block
+ cmp edx, -10H
+ jg F100_sfence ; fewer than 16 bytes left (possibly none)
+ ; move 16 bytes, aligned
+ movaps xmm0, [rsi+rdx]
+ movntps [rdi+rdx], xmm0
+ add rdx, 10H
+F100_sfence:
+ ; Fence unconditionally: the loop above always issued non-temporal stores,
+ ; whether or not a trailing 16-byte block was written.
+ sfence
+ ; move the remaining 0 - 15 bytes (C200 falls through to C500 when rdx = 0)
+ jmp C200
+
+; Make 15 instances of MOVE_UNALIGNED_SSE2 macro for each value of
+; the alignment u.
+; These are pointed to by the jump table AlignmentDispatchNT below
+; The second macro argument (or the only argument of the _4/_8/_12 variants)
+; is 1 here, which selects the non-temporal stores and the trailing sfence.
+; Label F10x holds the code for u = x (hex).
+
+;align 16
+F104: MOVE_UNALIGNED_SSE2_4 1
+F108: MOVE_UNALIGNED_SSE2_8 1
+F10C: MOVE_UNALIGNED_SSE2_12 1
+F101: MOVE_UNALIGNED_SSE2 1, 1
+F102: MOVE_UNALIGNED_SSE2 2, 1
+F103: MOVE_UNALIGNED_SSE2 3, 1
+F105: MOVE_UNALIGNED_SSE2 5, 1
+F106: MOVE_UNALIGNED_SSE2 6, 1
+F107: MOVE_UNALIGNED_SSE2 7, 1
+F109: MOVE_UNALIGNED_SSE2 9, 1
+F10A: MOVE_UNALIGNED_SSE2 0AH, 1
+F10B: MOVE_UNALIGNED_SSE2 0BH, 1
+F10D: MOVE_UNALIGNED_SSE2 0DH, 1
+F10E: MOVE_UNALIGNED_SSE2 0EH, 1
+F10F: MOVE_UNALIGNED_SSE2 0FH, 1
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; memcpyCPUDispatch: initial target of the memcpyDispatch pointer.
+; Picks the memcpy variant for this CPU, stores its address in memcpyDispatch
+; and then jumps to it with the caller's argument registers restored.
+; Selection order: memcpySSE2 by default; memcpySSSE3 if CPUID.1:ECX bit 9 is
+; set; memcpyU if UnalignedIsFaster returns non-zero; memcpyU256 if
+; Store256BitIsFaster also returns non-zero.
+memcpyCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
+ ; This part is executed only once
+ ; Save rbx (clobbered by cpuid and used below) and the registers that can
+ ; hold the caller's arguments under either calling convention
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ push r8
+ ; set CacheBypassLimit to half the size of the largest level cache
+ call GetMemcpyCacheLimit@
+ mov eax, 1
+ cpuid ; Get feature flags
+ lea rbx, [memcpySSE2@] ; rbx = best candidate so far
+ bt ecx, 9 ; Test bit for SupplSSE3
+ jnc Q100
+ lea rbx, [memcpySSSE3@]
+ call UnalignedIsFaster ; Test if unaligned read is faster than aligned read and shift
+ test eax, eax
+ jz Q100
+ lea rbx, [memcpyU@]
+ call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
+ test eax, eax
+ jz Q100
+ lea rbx, [memcpyU256@]
+Q100:
+ ; Insert appropriate pointer
+ mov [memcpyDispatch], rbx
+ mov rax, rbx ; keep the target in rax because rbx is restored below
+ pop r8
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ ; Jump according to the replaced function pointer
+ jmp rax
+
+; extern "C" size_t GetMemcpyCacheLimit();
+; Returns the current CacheBypassLimit in rax. If the stored value is zero it
+; is first initialised to half of the value returned by DataCacheSize(0), or
+; to 400000H if that result is zero.
+GetMemcpyCacheLimit:
+GetMemcpyCacheLimit@: ; local limit
+ mov rax, [CacheBypassLimit]
+ test rax, rax
+ jnz U200 ; already initialised: return the stored value
+ ; Get half the size of the largest level cache
+%ifdef WINDOWS
+ xor ecx, ecx ; 0 means largest level cache
+%else
+ xor edi, edi ; 0 means largest level cache
+%endif
+ call DataCacheSize ; get cache size
+ shr rax, 1 ; half the size
+ jnz U100 ; ZF comes from the shift: non-zero result is usable
+ mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
+U100: mov [CacheBypassLimit], rax
+U200: ret
+
+; Note: SetMemcpyCacheLimit is defined in memmove64.asm, calling SetMemcpyCacheLimit1
+; Sets CacheBypassLimit to the value passed in the first argument register
+; (rcx under WINDOWS, rdi otherwise). A value of zero resets the limit and
+; lets GetMemcpyCacheLimit@ recompute the default. The limit now in effect is
+; left in rax.
+SetMemcpyCacheLimit1:
+%ifdef WINDOWS
+ mov rax, rcx
+%else
+ mov rax, rdi
+%endif
+ test rax, rax
+ jnz U400
+ ; zero, means default
+ mov [CacheBypassLimit], rax ; store 0 so that GetMemcpyCacheLimit@ recalculates
+ call GetMemcpyCacheLimit@
+U400: mov [CacheBypassLimit], rax
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; getDispatch, for testing only
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Returns the current contents of the memcpyDispatch pointer in rax:
+; the address of memcpyCPUDispatch until the dispatcher has run, afterwards
+; the address of the selected memcpy variant.
+getDispatch:
+mov rax,[memcpyDispatch]
+ret
+
+global getDispatch
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; Each table holds 16 code pointers indexed by (src mod 16) after dest has
+; been aligned. memcpySSE2 uses AlignmentDispatchSSE2, memcpySSSE3 uses
+; AlignmentDispatchSSSE3, and both switch to AlignmentDispatchNT when the
+; count exceeds CacheBypassLimit.
+
+; Code pointer for each alignment for SSE2 instruction set
+AlignmentDispatchSSE2:
+DQ C100, D101, D102, D103, D104, D105, D106, D107
+DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
+
+; Code pointer for each alignment for Suppl-SSE3 instruction set
+AlignmentDispatchSSSE3:
+DQ C100, E101, E102, E103, E104, E105, E106, E107
+DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
+
+; Code pointer for each alignment for non-temporal store
+AlignmentDispatchNT:
+DQ F100, F101, F102, F103, F104, F105, F106, F107
+DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
+
+; Pointer to appropriate version.
+; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
+; change this to the appropriate version of memcpy, so that
+; memcpyCPUDispatch is only executed once:
+memcpyDispatch DQ memcpyCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > CacheBypassLimit
+; The optimal value of CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache.
+; Zero means "not yet initialised"; GetMemcpyCacheLimit fills it in:
+CacheBypassLimit: DQ 0
diff --git a/contrib/libs/asmlib/memmove64.asm b/contrib/libs/asmlib/memmove64.asm new file mode 100644 index 0000000000..1c61032541 --- /dev/null +++ b/contrib/libs/asmlib/memmove64.asm @@ -0,0 +1,1090 @@ +%include "defs.asm"
+
+;************************* memmove64.asm ***********************************
+; Author: Agner Fog
+; Date created: 2008-07-18
+; Last modified: 2016-11-16 (patched version with AVX512 support removed)
+; Description:
+; Faster version of the standard memmove function:
+; void * A_memmove(void *dest, const void *src, size_t count);
+; Moves 'count' bytes from 'src' to 'dest'. src and dest may overlap.
+;
+; Overriding standard function memmove:
+; The alias ?OVR_memmove is changed to _memmove in the object file if
+; it is desired to override the standard library function memmove.
+;
+; CPU dispatching included for different CPUs
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_memmove: function ; Function A_memmove
+global EXP(memmove): function ; ?OVR removed if standard function memmove overridden
+global memmoveSSE2: function ; Version for processors with only SSE2
+global memmoveSSSE3: function ; Version for processors with SSSE3
+global memmoveU: function ; Version for processors with fast unaligned read
+global memmoveU256: function ; Version for processors with fast 256-bit read/write
+global SetMemcpyCacheLimit ; Change limit for bypassing cache
+
+; Imported from memcpy64.asm:
+extern A_memcpy ; function entry
+extern memcpySSE2 ; CPU specific function entry
+extern memcpySSSE3 ; CPU specific function entry
+extern memcpyU ; CPU specific function entry
+extern memcpyU256 ; CPU specific function entry
+
+; Imported from instrset64.asm
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster64.asm:
+extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
+extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Imported from memcpy64.asm
+extern GetMemcpyCacheLimit ; Get the size limit for bypassing cache when copying with memcpy and memmove
+extern SetMemcpyCacheLimit1 ; Set the size limit for bypassing cache when copying with memcpy
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Prolog macro. Determine if we should move forwards or backwards
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Define prolog for this function
+; Parameter 1 is forward function label
+; PROLOGM <forward_label>
+; Decides the copy direction. If unsigned(dest - src) >= count a forward copy
+; is safe and control jumps to %1 with the argument registers untouched (only
+; rax is modified). Otherwise execution falls through with
+; rdi = dest, rsi = src, rcx = count, r9 = dest (used by RETURNM).
+; Under WINDOWS the fall-through path also pushes rsi and rdi; RETURNM pops them.
+%MACRO PROLOGM 1
+%IFDEF WINDOWS
+ ; Check if dest overlaps src
+ mov rax, rcx
+ sub rax, rdx
+ cmp rax, r8
+ ; We can avoid testing for dest < src by using unsigned compare:
+ ; (Assume that the memory block cannot span across address 0)
+ ; Must move backwards if unsigned(dest-src) < count
+ jae %1 ; Jump to memcpy if we can move forwards
+ push rsi
+ push rdi
+ mov rdi, rcx ; dest
+ mov r9, rcx ; dest
+ mov rsi, rdx ; src
+ mov rcx, r8 ; count
+%ELSE ; Unix
+ ; Check if dest overlaps src
+ mov rax, rdi
+ sub rax, rsi
+ cmp rax, rdx
+ ; Must move backwards if unsigned(dest-src) < count
+ jae %1 ; Jump to memcpy if we can move forwards
+ mov rcx, rdx ; count
+ mov r9, rdi ; dest
+%ENDIF
+%ENDM
+
+
+; Define return from this function
+; RETURNM: epilogue matching the fall-through path of PROLOGM.
+; Restores rdi and rsi under WINDOWS, then returns dest (saved in r9) in rax.
+%MACRO RETURNM 0
+%IFDEF WINDOWS
+ pop rdi
+ pop rsi
+%ENDIF
+ mov rax, r9 ; Return value = dest
+ ret
+%ENDMACRO
+
+
+SECTION .text align=16
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memmove(void * dest, const void * src, size_t count);
+; Function entry:
+; Both public names share this single indirect jump; the arguments are passed
+; through unchanged in the caller's registers.
+; NOTE(review): memmoveDispatch is defined outside this chunk.
+A_memmove:
+EXP(memmove):
+ jmp qword [memmoveDispatch] ; Go to appropriate version, depending on instruction set
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+; memmoveU256: PROLOGM jumps to memcpyU256 when a forward copy is safe.
+; Otherwise the code below copies backwards, from the end of the buffers
+; towards the start, with rdi = dest, rsi = src, rcx = count, r9 = dest.
+; Steps: trim the end of dest down to a 32-byte boundary, copy 32-byte blocks
+; with unaligned loads and aligned stores (non-temporal above
+; CacheBypassLimit), then copy the remaining 0 - 31 bytes at the start.
+memmoveU256: ; Version for processors with fast 256-bit read/write
+memmoveU256@: ; local label
+ PROLOGM memcpyU256
+
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [rdi+rcx] ; end of dest
+ and edx, 1FH
+ jz B4300 ; Skip if end of dest aligned by 32
+
+ ; edx = size of last partial block, 1 - 31 bytes
+ ; Each set bit of edx is handled by one move of that size, and rcx is
+ ; reduced accordingly so that rsi+rcx / rdi+rcx stay the current end
+ test dl, 3
+ jz B4210
+ test dl, 1
+ jz B4201 ; B4200 if we haven't tested edx,3
+ ; move 1 byte
+ dec rcx
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+B4200: test dl, 2
+ jz B4210
+B4201: ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+B4210: test dl, 4
+ jz B4220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+B4220: test dl, 8
+ jz B4230
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+B4230: test dl, 16
+ jz B4300
+ ; move 16 bytes
+ sub rcx, 16
+ movups xmm0, [rsi+rcx]
+ movaps [rdi+rcx], xmm0
+
+B4300: ; Now end of dest is aligned by 32. Any partial block has been moved
+ mov rdx, rcx
+ and ecx, 1FH ; remaining size after 32 bytes blocks moved
+ and rdx, -20H ; number of 32 bytes blocks
+ jz H4100
+ add rsi, rcx ; skip the leading remainder; it is copied last
+ add rdi, rcx
+
+ ; Check if count very big
+ cmp rdx, [CacheBypassLimit]
+ ja H4800 ; Use non-temporal store if count > CacheBypassLimit
+
+align 16
+H4000: ; 32 bytes move loop
+ vmovups ymm0, [rsi+rdx-20H]
+ vmovaps [rdi+rdx-20H], ymm0
+ sub rdx, 20H
+ jnz H4000
+ vzeroupper
+
+H4090: sub rsi, rcx ; back to the start of src / dest
+ sub rdi, rcx
+
+H4100: ; remaining 0-31 bytes
+ test ecx, ecx
+ jz H4600
+ test cl, 10H
+ jz H4200
+ ; move 16 bytes
+ sub ecx, 10H
+ movups xmm0, [rsi+rcx]
+ movaps [rdi+rcx], xmm0
+ jz H4600 ; early out if count divisible by 16
+H4200: test cl, 8
+ jz H4300
+ ; move 8 bytes
+ sub ecx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+H4300: test cl, 4
+ jz H4400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz H4600 ; early out if count divisible by 4
+H4400: test cl, 2
+ jz H4500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+H4500: test cl, 1
+ jz H4600
+ ; move 1 byte
+ movzx eax, byte [rsi] ; rcx-1 = 0
+ mov [rdi], al
+H4600: ; finished
+ RETURNM
+
+align 16
+H4800: ; 32 bytes move loop, bypass cache
+ vmovups ymm0, [rsi+rdx-20H]
+ vmovntps [rdi+rdx-20H], ymm0
+ sub rdx, 20H
+ jnz H4800
+ sfence ; fence after the non-temporal stores above
+ vzeroupper
+ jmp H4090
+
+; A1000: small-count backward copy shared by memmoveU256, memmoveU and
+; memmoveSSSE3 (each jumps here when count < 64).
+; On entry rsi = src, rdi = dest, rcx = count. Each set bit of the count is
+; handled by one move of that size, highest offset first; ecx is reduced after
+; each move so that rsi+rcx / rdi+rcx address the piece being copied.
+A1000: ; count < 64. Move 32-16-8-4-2-1 bytes
+ test cl, 20H
+ jz A1100
+ ; move 32 bytes
+ ; movups is faster on processors with SSSE3
+ sub ecx, 20H
+ movups xmm0, [rsi+rcx+10H]
+ movups xmm1, [rsi+rcx] ; both halves are loaded before either is stored
+ movups [rdi+rcx+10H], xmm0
+ movups [rdi+rcx], xmm1
+A1100: test cl, 10H
+ jz A1200
+ ; move 16 bytes
+ sub ecx, 10H
+ movups xmm0, [rsi+rcx]
+ movups [rdi+rcx], xmm0
+A1200: test cl, 8
+ jz A1300
+ ; move 8 bytes
+ sub ecx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+A1300: test cl, 4
+ jz A1400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz A1900 ; early out if count divisible by 4
+A1400: test cl, 2
+ jz A1500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+A1500: test cl, 1
+ jz A1900
+ ; move 1 byte
+ movzx eax, byte [rsi] ; rcx-1 = 0
+ mov [rdi], al
+A1900: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with fast unaligned read and fast 16 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+; memmoveU: PROLOGM jumps to memcpyU when a forward copy is safe.
+; Otherwise the code below copies backwards, from the end of the buffers
+; towards the start, with rdi = dest, rsi = src, rcx = count, r9 = dest.
+; Steps: trim the end of dest down to a 16-byte boundary, copy 32 bytes per
+; iteration with unaligned loads and aligned stores (non-temporal above
+; CacheBypassLimit), then copy the remaining 0 - 31 bytes at the start.
+memmoveU: ; Version for processors with fast unaligned read
+memmoveU@: ; local label
+ PROLOGM memcpyU
+
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [rdi+rcx] ; end of dest
+ and edx, 0FH
+ jz B3300 ; Skip if end of dest aligned by 16
+
+ ; edx = size of last partial block, 1 - 15 bytes
+ ; Each set bit of edx is handled by one move of that size, and rcx is
+ ; reduced accordingly so that rsi+rcx / rdi+rcx stay the current end
+ test dl, 3
+ jz B3210
+ test dl, 1
+ jz B3201 ; B3200 if we haven't tested edx,3
+ ; move 1 byte
+ dec rcx
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+B3200: test dl, 2
+ jz B3210
+B3201: ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+B3210: test dl, 4
+ jz B3220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+B3220: test dl, 8
+ jz B3300
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+
+B3300: ; Now end of dest is aligned by 16. Any partial block has been moved
+ mov rdx, rcx
+ and ecx, 1FH ; remaining size after 32 bytes blocks moved
+ and rdx, -20H ; number of 32 bytes blocks
+ jz H1100
+ add rsi, rcx ; skip the leading remainder; it is copied last
+ add rdi, rcx
+
+ ; Check if count very big
+ cmp rdx, [CacheBypassLimit]
+ ja H1800 ; Use non-temporal store if count > CacheBypassLimit
+
+align 16 ; minimize 16-bytes boundaries in H1000 loop
+H1000: ; 32 bytes move loop
+ movups xmm1, [rsi+rdx-20H]
+ movups xmm0, [rsi+rdx-10H]
+ movaps [rdi+rdx-20H], xmm1
+ movaps [rdi+rdx-10H], xmm0
+ sub rdx, 20H
+ jnz H1000
+
+H1090: sub rsi, rcx ; back to the start of src / dest
+ sub rdi, rcx
+
+H1100: ; remaining 0-31 bytes
+ test ecx, ecx
+ jz H1600
+ test cl, 10H
+ jz H1200
+ ; move 16 bytes
+ sub ecx, 10H
+ movups xmm0, [rsi+rcx]
+ movaps [rdi+rcx], xmm0
+ jz H1600 ; early out if count divisible by 16
+H1200: test cl, 8
+ jz H1300
+ ; move 8 bytes
+ sub ecx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+H1300: test cl, 4
+ jz H1400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz H1600 ; early out if count divisible by 4
+H1400: test cl, 2
+ jz H1500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+H1500: test cl, 1
+ jz H1600
+ ; move 1 byte
+ movzx eax, byte [rsi] ; rcx-1 = 0
+ mov [rdi], al
+H1600: ; finished
+ RETURNM
+
+align 16
+H1800: ; 32 bytes move loop, bypass cache
+ movups xmm1, [rsi+rdx-20H]
+ movups xmm0, [rsi+rdx-10H]
+ movntps [rdi+rdx-20H], xmm1
+ movntps [rdi+rdx-10H], xmm0
+ sub rdx, 20H
+ jnz H1800
+ sfence ; fence after the non-temporal stores above
+ jmp H1090
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+; memmoveSSSE3: PROLOGM jumps to memcpySSSE3 when a forward copy is safe.
+; Otherwise the code below copies backwards with rdi = dest, rsi = src,
+; rcx = count, r9 = dest. It trims the end of dest down to a 16-byte boundary
+; and then jumps through a 16-entry table indexed by (end of src mod 16).
+; NOTE(review): MAlignmentDispatchSSSE3 and MAlignmentDispatchNT are defined
+; outside this chunk.
+memmoveSSSE3: ; SSSE3 version begins here
+memmoveSSSE3@: ; local label
+ PROLOGM memcpySSSE3
+
+ ; Cannot use memcpy. Must move backwards because of overlap between src and dest
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+ ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [rdi+rcx] ; end of dest
+ and edx, 0FH
+ jz B1300 ; Skip if end of dest aligned by 16
+
+ ; edx = size of last partial block, 1 - 15 bytes
+ ; Each set bit of edx is handled by one move of that size, and rcx is
+ ; reduced accordingly so that rsi+rcx / rdi+rcx stay the current end
+ test dl, 3
+ jz B1210
+ test dl, 1
+ jz B1201 ; B1200 if we haven't tested edx,3
+ ; move 1 byte
+ dec rcx
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+B1200: test dl, 2
+ jz B1210
+B1201: ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+B1210: test dl, 4
+ jz B1220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+B1220: test dl, 8
+ jz B1300
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+
+B1300: ; Now end of dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of end of src modulo 16 at this point:
+ lea eax, [rsi+rcx]
+ and eax, 0FH ; eax = (end of src) mod 16, used as the jump table index
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ sub edx, ecx ; Remaining data after loop
+ sub rsi, rax ; Nearest preceding aligned block of src
+ ; Add the same to rsi and rdi as we have subtracted from rcx
+ add rsi, rdx
+ add rdi, rdx
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B1400 ; Use non-temporal store if count > CacheBypassLimit
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [MAlignmentDispatchSSSE3]
+ jmp near [r8+rax*8] ; table entries are 8-byte code pointers
+
+B1400: ; Dispatch to different codes depending on src alignment
+ lea r8, [MAlignmentDispatchNT]
+ jmp near [r8+rax*8] ; non-temporal variants, same indexing
+
+
+align 16
+C100: ; Code for aligned src. SSE2 and later CPUs
+ ; The nice case, src and dest have same alignment.
+ ; Entry 0 of MAlignmentDispatchSSE2 and MAlignmentDispatchSSSE3.
+ ; rcx = size of the 32-byte blocks part, edx = remaining 0 - 31 bytes.
+
+ ; Loop. rcx has positive index from the beginning, counting down to zero
+ movaps xmm0, [rsi+rcx-10H]
+ movaps xmm1, [rsi+rcx-20H]
+ movaps [rdi+rcx-10H], xmm0
+ movaps [rdi+rcx-20H], xmm1
+ sub rcx, 20H
+ jnz C100
+
+ ; Move the remaining edx bytes (0 - 31):
+ ; move 16-8-4-2-1 bytes, aligned
+ ; rcx is zero here and goes negative: the remaining bytes lie below rsi/rdi
+ test edx, edx
+ jz C500 ; Early out if no more data
+ test dl, 10H
+ jz C200
+ ; move 16 bytes
+ sub rcx, 10H
+ movaps xmm0, [rsi+rcx]
+ movaps [rdi+rcx], xmm0
+
+C200: ; Other branches come in here, rcx may contain arbitrary offset
+ ; Only bits 0 - 3 of dl are examined from here on
+ test edx, edx
+ jz C500 ; Early out if no more data
+ test dl, 8
+ jz C210
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+C210: test dl, 4
+ jz C220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ ; The jz below tests ZF from "sub rcx, 4"; the two mov instructions leave flags unchanged.
+ ; NOTE(review): every entry to C200 visible in this part of the file arrives
+ ; with rcx <= 0, so rcx cannot be zero after the sub and this branch looks
+ ; never taken - confirm before relying on it or removing it.
+ jz C500 ; Early out if count divisible by 4
+C220: test dl, 2
+ jz C230
+ ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+C230: test dl, 1
+ jz C500
+ ; move 1 byte
+ movzx eax, byte [rsi+rcx-1] ; rcx-1 is not always 0 here
+ mov [rdi+rcx-1], al
+C500: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; memmoveSSE2: memmove version that copies from the end towards the start.
+; Counts below 64 are moved right here in 32-16-8-4-2-1 byte pieces using
+; general purpose registers; larger counts continue at B0100.
+; NOTE(review): rsi = src, rdi = dest, rcx = count are presumably established
+; by PROLOGM, which is defined outside this section - confirm there.
+memmoveSSE2: ; SSE2 version begins here
+memmoveSSE2@: ; local label
+ PROLOGM memcpySSE2
+
+ ; Cannot use memcpy. Must move backwards because of overlap between src and dest
+ cmp rcx, 40H
+ jae B0100 ; count >= 64: use the aligned block-move code at B0100
+
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ ; ecx is reduced by each piece moved, so [rsi+rcx] always addresses the
+ ; start of the piece being moved and the low bits of cl still select the
+ ; smaller pieces.
+ test cl, 20H
+ jz A100
+ ; move 32 bytes
+ ; mov is faster than movdqu on SSE2 processors,
+ ; movdqu is faster on later processors
+ sub ecx, 20H
+ mov rax, [rsi+rcx+18H]
+ mov rdx, [rsi+rcx+10H]
+ mov [rdi+rcx+18H], rax
+ mov [rdi+rcx+10H], rdx
+ mov rax, [rsi+rcx+8]
+ mov rdx, [rsi+rcx]
+ mov [rdi+rcx+8], rax
+ mov [rdi+rcx], rdx
+A100: test cl, 10H
+ jz A200
+ ; move 16 bytes
+ sub ecx, 10H
+ mov rax, [rsi+rcx+8]
+ mov rdx, [rsi+rcx]
+ mov [rdi+rcx+8], rax
+ mov [rdi+rcx], rdx
+A200: test cl, 8
+ jz A300
+ ; move 8 bytes
+ sub ecx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+A300: test cl, 4
+ jz A400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ ; ZF below still comes from "sub ecx, 4": mov does not change flags
+ jz A900 ; early out if count divisible by 4
+A400: test cl, 2
+ jz A500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+A500: test cl, 1
+ jz A900
+ ; move 1 byte
+ movzx eax, byte [rsi] ; rcx-1 = 0
+ mov [rdi], al
+A900: ; finished
+ RETURNM
+
+B0100: ; count >= 64
+ ; Continuation of memmoveSSE2 for large counts: move the 1 - 15 bytes after
+ ; the last 16-byte boundary of dest, then dispatch on the alignment of src.
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [rdi+rcx] ; end of dest
+ and edx, 0FH
+ jz B0300 ; Skip if end of dest aligned by 16
+
+ ; edx = size of last partial block, 1 - 15 bytes
+ test dl, 3
+ jz B0210
+ test dl, 1
+ jz B0201 ; bits 0-1 are nonzero and bit 0 is clear, so bit 1 is set: skip the test at B0200
+ ; move 1 byte
+ dec rcx
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+B0200: test dl, 2
+ jz B0210
+B0201: ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+B0210: test dl, 4
+ jz B0220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+B0220: test dl, 8
+ jz B0300
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+
+B0300: ; Now end of dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of end of src modulo 16 at this point:
+ lea eax, [rsi+rcx]
+ and eax, 0FH ; eax = 0 - 15, used below as jump table index
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ sub edx, ecx ; Remaining data after loop (count modulo 32)
+ sub rsi, rax ; Nearest preceding aligned block of src
+ ; Add the same to rsi and rdi as we have subtracted from rcx
+ add rsi, rdx
+ add rdi, rdx
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B0400 ; Use non-temporal store if count > CacheBypassLimit
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [MAlignmentDispatchSSE2]
+ jmp near [r8+rax*8] ; table entries are 8-byte code pointers
+
+B0400: ; Dispatch to different codes depending on src alignment
+ lea r8, [MAlignmentDispatchNT]
+ jmp near [r8+rax*8]
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2 2 ; u, nt
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = count rounded down to nearest divisible by 32
+; edx = remaining bytes to move after loop
+; Each aligned destination block is assembled from two neighbouring aligned
+; source blocks with a byte shift left, a byte shift right and an OR.
+; Ends by jumping to C200, which moves the last 0 - 15 bytes.
+ movdqa xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H ; sets ZF for the jnz at the end of the loop; no instruction in between changes flags
+ movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movdqa xmm2, [rsi+rcx]
+ movdqa xmm3, xmm1 ; Copy because used twice
+ pslldq xmm0, 16-%1 ; shift left
+ psrldq xmm1, %1 ; shift right
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx+10H], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx+10H], xmm0 ; Save aligned
+ %ENDIF
+ movdqa xmm0, xmm2 ; Save for next iteration
+ pslldq xmm3, 16-%1 ; shift left
+ psrldq xmm2, %1 ; shift right
+ por xmm3, xmm2 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx], xmm3 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx], xmm3 ; Save aligned
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movdqa xmm1, [rsi+rcx]
+ pslldq xmm0, 16-%1 ; shift left
+ psrldq xmm1, %1 ; shift right
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx], xmm0 ; Save aligned
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %2 == 1
+ sfence ; reached on both paths, so every non-temporal store above is fenced
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_4 1 ; nt
+; Special case: u = 4
+; %1 = 1 if non-temporal store desired
+; Same register contract as MOVE_REVERSE_UNALIGNED_SSE2 (rsi = aligned src,
+; rdi = aligned dest, rcx = size of 32-byte blocks part, edx = remainder,
+; eax = misalignment). Uses movss + shufps instead of byte shifts: movss
+; replaces the lowest dword with the one from the neighbouring block and
+; shufps rotates the four dwords into place.
+ movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H ; sets ZF for the jnz at the end of the loop
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movaps xmm2, [rsi+rcx]
+ movaps xmm3, xmm0
+ movaps xmm0, xmm2 ; Save for next iteration
+ movss xmm2, xmm1
+ shufps xmm2, xmm2, 00111001B ; Rotate right
+ movss xmm1, xmm3
+ shufps xmm1, xmm1, 00111001B ; Rotate right
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm1 ; Save aligned
+ movaps [rdi+rcx], xmm2 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
+ movntps [rdi+rcx], xmm2 ; Non-temporal save
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movaps xmm1, [rsi+rcx]
+ movss xmm1, xmm0
+ shufps xmm1, xmm1, 00111001B ; Rotate right
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence ; reached on both paths, so every non-temporal store above is fenced
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_8 1 ; nt
+; Special case: u = 8
+; %1 = 1 if non-temporal store desired
+; Same register contract as MOVE_REVERSE_UNALIGNED_SSE2. Each source block is
+; rotated by 8 bytes (shufps swaps its two qwords), then movsd replaces the
+; low qword with the one from the neighbouring block. xmm0 and xmm1 swap
+; roles between the two halves of the loop.
+ movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+ shufps xmm0, xmm0, 01001110B ; Rotate
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H ; sets ZF for the jnz at the end of the loop
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ shufps xmm1, xmm1, 01001110B ; Rotate
+ movsd xmm0, xmm1
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx]
+ shufps xmm0, xmm0, 01001110B ; Rotate
+ movsd xmm1, xmm0
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movaps xmm1, [rsi+rcx]
+ shufps xmm1, xmm1, 01001110B ; Rotate
+ movsd xmm0, xmm1
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence ; reached on both paths, so every non-temporal store above is fenced
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_12 1 ; nt
+; Special case: u = 12
+; %1 = 1 if non-temporal store desired
+; Same register contract as MOVE_REVERSE_UNALIGNED_SSE2. Each source block is
+; rotated with shufps 10010011B (dword 3 moves to position 0, the others
+; move up one position), then movss replaces the lowest dword with the one
+; from the neighbouring block. xmm0 and xmm1 swap roles between the two
+; halves of the loop.
+ movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+ shufps xmm0, xmm0, 10010011B ; Rotate left
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H ; sets ZF for the jnz at the end of the loop
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ shufps xmm1, xmm1, 10010011B ; Rotate left
+ movss xmm0, xmm1
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx]
+ shufps xmm0, xmm0, 10010011B ; Rotate left
+ movss xmm1, xmm0
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movaps xmm1, [rsi+rcx]
+ shufps xmm1, xmm1, 10010011B ; Rotate left
+ movss xmm0, xmm1
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence ; reached on both paths, so every non-temporal store above is fenced
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Code for unaligned src, Suppl.SSE3 instruction set.
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSSE3 1; u
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = count rounded down to nearest divisible by 32 (positive, counts down to zero)
+; edx = remaining bytes to move after loop
+; palignr concatenates two neighbouring aligned source blocks and extracts
+; the 16 bytes starting at byte offset %1, giving one aligned dest block.
+; Ends by jumping to C200, which moves the last 0 - 15 bytes.
+ movdqa xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ movdqa xmm1, [rsi+rcx-10H] ; Read next two blocks
+ palignr xmm0, xmm1, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx-10H], xmm0 ; Save aligned
+ movdqa xmm0, [rsi+rcx-20H] ; also carried into the next iteration as the upper block
+ palignr xmm1, xmm0, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx-20H], xmm1 ; Save aligned
+ sub rcx, 20H
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movdqa xmm1, [rsi+rcx] ; Read one more block
+ palignr xmm0, xmm1, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx], xmm0 ; Save aligned
+
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes
+ jmp C200
+%ENDMACRO
+
+
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchSSE2 below
+; (aligns and fillers are inserted manually to minimize the
+; number of 16-bytes boundaries inside loops)
+; u = 4, 8 and 12 use the dedicated shufps-based macros; all instances here
+; use ordinary (cached) stores.
+
+align 16
+D104: MOVE_REVERSE_UNALIGNED_SSE2_4 0
+D108: MOVE_REVERSE_UNALIGNED_SSE2_8 0
+D10C: MOVE_REVERSE_UNALIGNED_SSE2_12 0
+D101: MOVE_REVERSE_UNALIGNED_SSE2 1, 0
+D102: MOVE_REVERSE_UNALIGNED_SSE2 2, 0
+D103: MOVE_REVERSE_UNALIGNED_SSE2 3, 0
+D105: MOVE_REVERSE_UNALIGNED_SSE2 5, 0
+D106: MOVE_REVERSE_UNALIGNED_SSE2 6, 0
+D107: MOVE_REVERSE_UNALIGNED_SSE2 7, 0
+D109: MOVE_REVERSE_UNALIGNED_SSE2 9, 0
+D10A: MOVE_REVERSE_UNALIGNED_SSE2 0AH, 0
+D10B: MOVE_REVERSE_UNALIGNED_SSE2 0BH, 0
+D10D: MOVE_REVERSE_UNALIGNED_SSE2 0DH, 0
+D10E: MOVE_REVERSE_UNALIGNED_SSE2 0EH, 0
+D10F: MOVE_REVERSE_UNALIGNED_SSE2 0FH, 0
+
+; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchSSSE3 below
+
+align 16
+E104: MOVE_REVERSE_UNALIGNED_SSSE3 4
+E108: MOVE_REVERSE_UNALIGNED_SSSE3 8
+E10C: MOVE_REVERSE_UNALIGNED_SSSE3 0CH
+E101: MOVE_REVERSE_UNALIGNED_SSSE3 1
+E102: MOVE_REVERSE_UNALIGNED_SSSE3 2
+E103: MOVE_REVERSE_UNALIGNED_SSSE3 3
+E105: MOVE_REVERSE_UNALIGNED_SSSE3 5
+E106: MOVE_REVERSE_UNALIGNED_SSSE3 6
+E107: MOVE_REVERSE_UNALIGNED_SSSE3 7
+E109: MOVE_REVERSE_UNALIGNED_SSSE3 9
+E10A: MOVE_REVERSE_UNALIGNED_SSSE3 0AH
+E10B: MOVE_REVERSE_UNALIGNED_SSSE3 0BH
+E10D: MOVE_REVERSE_UNALIGNED_SSSE3 0DH
+E10E: MOVE_REVERSE_UNALIGNED_SSSE3 0EH
+E10F: MOVE_REVERSE_UNALIGNED_SSSE3 0FH
+
+align 16
+F100: ; Non-temporal move, src and dest have same alignment.
+ ; Entry 0 of MAlignmentDispatchNT.
+ ; rcx = size of the 32-byte blocks part, edx = remaining 0 - 31 bytes.
+ ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H ; sets ZF for the jnz below; the SSE moves leave flags unchanged
+ movaps xmm0, [rsi+rcx+10H]
+ movaps xmm1, [rsi+rcx]
+ movntps [rdi+rcx+10H], xmm0
+ movntps [rdi+rcx], xmm1
+ jnz F100
+
+ ; Move the remaining edx bytes (0 - 31):
+ ; move 16-8-4-2-1 bytes, aligned
+ test dl, 10H
+ jz .fence ; no 16-byte block left, but the loop's non-temporal stores still need the fence
+ ; move 16 bytes
+ sub rcx, 10H
+ movaps xmm0, [rsi+rcx]
+ movntps [rdi+rcx], xmm0
+.fence: ; Both paths meet here so that every non-temporal store above is
+ ; ordered before later stores, as the NT macro instances do at their %%L2
+ sfence
+ ; move the remaining 0 - 15 bytes
+ jmp C200
+
+; Non-temporal move, src and dest have different alignment.
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchNT below
+; The last macro argument 1 selects non-temporal stores followed by sfence.
+; u = 4, 8 and 12 use the dedicated shufps-based macros.
+
+align 16
+F101: MOVE_REVERSE_UNALIGNED_SSE2 1, 1
+F102: MOVE_REVERSE_UNALIGNED_SSE2 2, 1
+F103: MOVE_REVERSE_UNALIGNED_SSE2 3, 1
+F104: MOVE_REVERSE_UNALIGNED_SSE2_4 1
+F105: MOVE_REVERSE_UNALIGNED_SSE2 5, 1
+F106: MOVE_REVERSE_UNALIGNED_SSE2 6, 1
+F107: MOVE_REVERSE_UNALIGNED_SSE2 7, 1
+F108: MOVE_REVERSE_UNALIGNED_SSE2_8 1
+F109: MOVE_REVERSE_UNALIGNED_SSE2 9, 1
+F10A: MOVE_REVERSE_UNALIGNED_SSE2 0AH, 1
+F10B: MOVE_REVERSE_UNALIGNED_SSE2 0BH, 1
+F10C: MOVE_REVERSE_UNALIGNED_SSE2_12 1
+F10D: MOVE_REVERSE_UNALIGNED_SSE2 0DH, 1
+F10E: MOVE_REVERSE_UNALIGNED_SSE2 0EH, 1
+F10F: MOVE_REVERSE_UNALIGNED_SSE2 0FH, 1
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; memmoveCPUDispatch: initial target of the memmoveDispatch pointer.
+; Picks the memmove version for this CPU, stores its address in
+; memmoveDispatch and jumps to it with the caller's argument registers
+; restored. Selection order: memmoveSSE2 by default, memmoveSSSE3 if CPUID
+; reports Suppl-SSE3, memmoveU if UnalignedIsFaster returns nonzero,
+; memmoveU256 if Store256BitIsFaster also returns nonzero.
+memmoveCPUDispatch: ; CPU dispatcher, check for Suppl-SSE3 instruction set
+ ; This part is executed only once
+ ; Save rbx (overwritten by cpuid and used as scratch) and the registers
+ ; that can carry the caller's arguments in either calling convention
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ push r8
+
+ ; set CacheBypassLimit to half the size of the largest level cache
+%ifdef WINDOWS
+ xor ecx, ecx ; 0 means default
+%else
+ xor edi, edi
+%endif
+ call SetMemcpyCacheLimit@
+ mov eax, 1
+ cpuid ; Get feature flags
+ lea rbx, [memmoveSSE2@]
+ bt ecx, 9 ; Test bit for SupplSSE3
+ jnc Q100
+ lea rbx, [memmoveSSSE3@]
+ call UnalignedIsFaster
+ test eax, eax
+ jz Q100
+ lea rbx, [memmoveU@]
+ call Store256BitIsFaster
+ test eax, eax
+ jz Q100
+ lea rbx, [memmoveU256@]
+
+Q100: ; Insert appropriate pointer
+ mov [memmoveDispatch], rbx
+ mov rax, rbx ; keep the target in rax because rbx is restored below
+ pop r8
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ ; Jump according to the replaced function pointer
+ jmp rax
+
+; Note: Must call SetMemcpyCacheLimit1 defined in memcpy64.asm
+; Forwards the caller's argument registers unchanged to SetMemcpyCacheLimit1
+; and stores the value it returns in rax into this file's CacheBypassLimit.
+; memmoveCPUDispatch calls this with a zero argument ("0 means default").
+; NOTE(review): presumably SetMemcpyCacheLimit1 returns the limit actually
+; in effect - confirm in memcpy64.asm.
+SetMemcpyCacheLimit:
+SetMemcpyCacheLimit@:
+ call SetMemcpyCacheLimit1
+ mov [CacheBypassLimit], rax
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; Each table holds 16 code pointers and is indexed by the alignment of the
+; end of src modulo 16 (jmp near [r8+rax*8]).
+; memmoveSSE2 jumps through MAlignmentDispatchSSE2 and memmoveSSSE3 through
+; MAlignmentDispatchSSSE3; both use MAlignmentDispatchNT instead when the
+; size of the 32-byte blocks part exceeds CacheBypassLimit.
+
+; Code pointer for each alignment for SSE2 instruction set
+MAlignmentDispatchSSE2:
+DQ C100, D101, D102, D103, D104, D105, D106, D107
+DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
+
+; Code pointer for each alignment for Suppl-SSE3 instruction set
+MAlignmentDispatchSSSE3:
+DQ C100, E101, E102, E103, E104, E105, E106, E107
+DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
+
+; Code pointer for each alignment for non-temporal store
+MAlignmentDispatchNT:
+DQ F100, F101, F102, F103, F104, F105, F106, F107
+DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
+
+; Pointer to the selected memmove version. Initially points to
+; memmoveCPUDispatch, which overwrites it with the chosen version.
+memmoveDispatch: DQ memmoveCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > CacheBypassLimit
+; The optimal value of CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache.
+; Must be a full qword: it is written with "mov [CacheBypassLimit], rax" and
+; read with "cmp rcx, [CacheBypassLimit]", which are both 64-bit accesses.
+CacheBypassLimit: DQ 0
diff --git a/contrib/libs/asmlib/memset64.asm b/contrib/libs/asmlib/memset64.asm new file mode 100644 index 0000000000..52d647984d --- /dev/null +++ b/contrib/libs/asmlib/memset64.asm @@ -0,0 +1,372 @@ +%include "defs.asm"
+
+;************************* memset64.asm *************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2016-11-12 (patched version with AVX512 support removed)
+; Description:
+; Faster version of the standard memset function:
+; void * A_memset(void * dest, int c, size_t count);
+; Sets 'count' bytes from 'dest' to the 8-bit value 'c'
+;
+; Overriding standard function memset:
+; The alias ?OVR_memset is changed to _memset in the object file if
+; it is desired to override the standard library function memset.
+;
+; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
+; extern "C" void SetMemsetCacheLimit(); // Change limit in GetMemsetCacheLimit
+;
+; Optimization:
+; Uses XMM registers to set 16 bytes at a time, aligned.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_memset: function ; Function memset
+global EXP(memset): function ; ?OVR removed if standard function memset overridden
+global memsetSSE2: function ; SSE2 version
+global memsetAVX: function ; version for CPUs with fast 256-bit store
+global GetMemsetCacheLimit: function ; Data blocks bigger than this will be stored uncached by memset
+global SetMemsetCacheLimit: function ; Change limit in GetMemsetCacheLimit
+
+; Imported from cachesize64.asm:
+extern DataCacheSize ; Get size of data cache
+
+; Imported from unalignedisfaster64.asm:
+extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Define prolog for this function
+; Maps the memset arguments of either calling convention onto common names:
+; Rdest = dest, eax = c zero-extended from 8 bits, Rcount = count,
+; Rcount2 = second copy of count, Rdest2 = register reserved for a copy of
+; dest (the copy itself is made by the code following the macro).
+%MACRO PROLOGM 0
+%IFDEF WINDOWS
+%define Rdest rcx ; dest
+ movzx eax, dl ; c
+ mov rdx, r8 ; count
+%define Rcount rdx ; count
+%define Rdest2 r9 ; copy of dest
+%define Rcount2 r8 ; copy of count
+
+%ELSE ; Unix
+%define Rdest rdi ; dest
+ movzx eax, sil ; c (must be read before rsi is overwritten below)
+%define Rcount rdx ; count
+%define Rdest2 rcx ; copy of dest
+%define Rcount2 rsi ; copy of count
+ mov Rcount2, Rcount ; copy count
+%ENDIF
+%ENDMACRO
+
+
+SECTION .text align=16
+
+; extern "C" void * memset(void * dest, int c, size_t count);
+; Function entry:
+; Jumps through memsetDispatch, which initially points to memsetCPUDispatch
+; and afterwards to the version selected by it.
+A_memset:
+EXP(memset):
+ jmp [memsetDispatch] ; CPU dispatch table
+
+; memsetAVX: memset version using 256-bit stores for the bulk of large blocks.
+; Counts 0 - 16 are handled by the jump table code below (shared with
+; memsetSSE2 through label B050); larger counts continue at B100.
+; Returns dest in rax.
+memsetAVX: ; AVX version. Use ymm register
+memsetAVX@: ; local label
+ PROLOGM
+ imul eax, 01010101H ; Broadcast c into all bytes of eax
+ mov Rdest2, Rdest ; save dest
+ cmp Rcount, 16
+ ja B100
+B050: lea r10, [MemsetJTab] ; SSE2 version comes in here
+ jmp qword [r10+Rcount*8] ; jump table for small counts
+
+; Separate code for each count from 0 to 16:
+; Each entry label falls through the entries below it, so the stores of one
+; chain add up to exactly "count" bytes.
+; Counts divisible by 4:
+M16: mov [Rdest+12], eax
+M12: mov [Rdest+8], eax
+M08: mov [Rdest+4], eax
+M04: mov [Rdest], eax
+M00: mov rax, Rdest2 ; return dest
+ ret
+
+; Counts of the form 4k+3, plus count 1:
+M15: mov [Rdest+11], eax
+M11: mov [Rdest+7], eax
+M07: mov [Rdest+3], eax
+M03: mov [Rdest+1], ax
+M01: mov [Rdest], al
+ mov rax, Rdest2 ; return dest
+ ret
+
+; Counts of the form 4k+2:
+M14: mov [Rdest+10], eax
+M10: mov [Rdest+6], eax
+M06: mov [Rdest+2], eax
+M02: mov [Rdest], ax
+ mov rax, Rdest2 ; return dest
+ ret
+
+; Counts of the form 4k+1, except count 1:
+M13: mov [Rdest+9], eax
+M09: mov [Rdest+5], eax
+M05: mov [Rdest+1], eax
+ mov [Rdest], al
+ mov rax, Rdest2 ; return dest
+ ret
+
+B100: ; AVX version, Rcount > 16
+ ; Continuation of memsetAVX. eax holds c in all four bytes, Rdest2 holds dest.
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 ; Broadcast c into all bytes of xmm0
+
+ lea rax, [Rdest+Rcount] ; point to end
+
+ cmp Rcount, 20H
+ jbe K600 ; faster to use xmm registers if small
+
+ ; Store the first possibly unaligned 16 bytes
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the subsequent regular part, than to make possibly mispredicted
+ ; branches depending on the size of the first part.
+ movups oword [Rdest], xmm0
+
+ ; store another 16 bytes, aligned
+ add Rdest, 10H
+ and Rdest, -10H
+ movaps oword [Rdest], xmm0
+
+ ; go to next 32 bytes boundary
+ add Rdest, 10H
+ and Rdest, -20H
+
+ ; Check if count very big
+ cmp Rcount, [MemsetCacheLimit]
+ ja K300 ; Use non-temporal store if count > MemsetCacheLimit
+
+ ; find last 32 bytes boundary
+ mov Rcount, rax
+ and Rcount, -20H
+
+ ; - size of 32-bytes blocks
+ sub Rdest, Rcount
+ jnb K200 ; Jump if not negative
+
+ ; extend value to 256 bits
+ vinsertf128 ymm0,ymm0,xmm0,1
+
+align 16
+K100: ; Loop through 32-bytes blocks. Register use is swapped
+ ; Rcount = end of 32-bytes blocks part
+ ; Rdest = negative index from the end, counting up to zero
+ vmovaps [Rcount+Rdest], ymm0
+ add Rdest, 20H
+ jnz K100
+ vzeroupper
+
+K200: ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
+ movups [rax-20H], xmm0
+ movups [rax-10H], xmm0
+ mov rax, Rdest2 ; return dest
+ ret
+
+K300: ; Use non-temporal moves, same code as above:
+
+ ; find last 32 bytes boundary
+ mov Rcount, rax
+ and Rcount, -20H
+
+ ; - size of 32-bytes blocks
+ sub Rdest, Rcount
+ jnb K500 ; Jump if not negative
+
+ ; extend value to 256 bits
+ vinsertf128 ymm0,ymm0,xmm0,1
+
+align 16
+K400: ; Loop through 32-bytes blocks. Register use is swapped
+ ; Rcount = end of 32-bytes blocks part
+ ; Rdest = negative index from the end, counting up to zero
+ vmovntps [Rcount+Rdest], ymm0
+ add Rdest, 20H
+ jnz K400
+ sfence ; fence the non-temporal stores of the loop
+ vzeroupper
+
+K500: ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
+ movups [rax-20H], xmm0
+ movups [rax-10H], xmm0
+ mov rax, Rdest2 ; return dest
+ ret
+
+K600: ; 16 < count <= 32
+ ; Two 16-byte stores, one at the start and one ending at the end (rax),
+ ; overlapping in the middle when count < 32
+ movups [Rdest], xmm0
+ movups [rax-10H], xmm0
+ mov rax, Rdest2 ; return dest
+ ret
+
+
+; memsetSSE2: memset version using 128-bit stores.
+; Counts 0 - 16 go to the shared jump table code at B050. Larger counts
+; write the first 16 bytes unaligned, the middle part with aligned 16-byte
+; stores (non-temporal if count > MemsetCacheLimit) and the last 16 bytes
+; unaligned. Returns dest in rax.
+memsetSSE2: ; count > 16. Use SSE2 instruction set
+memsetSSE2@: ; local label
+ PROLOGM
+ imul eax, 01010101H ; Broadcast c into all bytes of eax
+ mov Rdest2, Rdest ; save dest
+ cmp Rcount, 16
+ jna B050
+
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 ; Broadcast c into all bytes of xmm0
+
+ ; Store the first unaligned part.
+ ; The size of this part is 1 - 16 bytes.
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the subsequent regular part, than to make possibly mispredicted
+ ; branches depending on the size of the first part.
+ movq qword [Rdest], xmm0
+ movq qword [Rdest+8], xmm0
+
+ ; Check if count very big
+M150: mov rax, [MemsetCacheLimit]
+ cmp Rcount, rax
+ ja M500 ; Use non-temporal store if count > MemsetCacheLimit
+
+ ; Point to end of regular part:
+ ; Round down dest+count to nearest preceding 16-bytes boundary
+ lea Rcount, [Rdest+Rcount-1]
+ and Rcount, -10H
+
+ ; Point to start of regular part:
+ ; Round up dest to next 16-bytes boundary
+ add Rdest, 10H
+ and Rdest, -10H
+
+ ; -(size of regular part)
+ sub Rdest, Rcount
+ jnb M300 ; Jump if not negative
+
+align 16
+M200: ; Loop through regular part
+ ; Rcount = end of regular part
+ ; Rdest = negative index from the end, counting up to zero
+ movdqa [Rcount+Rdest], xmm0
+ add Rdest, 10H
+ jnz M200
+
+M300: ; Do the last irregular part
+ ; The size of this part is 1 - 16 bytes.
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the preceding regular part, than to make possibly mispredicted
+ ; branches depending on the size of the last part.
+ ; Rdest and Rcount were repurposed above, so the saved copies are used here
+ mov rax, Rdest2 ; dest
+ movq qword [rax+Rcount2-10H], xmm0
+ movq qword [rax+Rcount2-8], xmm0
+ ret
+
+
+M500: ; Use non-temporal moves, same code as above:
+ ; End of regular part:
+ ; Round down dest+count to nearest preceding 16-bytes boundary
+ lea Rcount, [Rdest+Rcount-1]
+ and Rcount, -10H
+
+ ; Start of regular part:
+ ; Round up dest to next 16-bytes boundary
+ add Rdest, 10H
+ and Rdest, -10H
+
+ ; -(size of regular part)
+ sub Rdest, Rcount
+ jnb M700 ; Jump if not negative
+
+align 16
+M600: ; Loop through regular part
+ ; Rcount = end of regular part
+ ; Rdest = negative index from the end, counting up to zero
+ movntdq [Rcount+Rdest], xmm0
+ add Rdest, 10H
+ jnz M600
+ sfence ; fence the non-temporal stores of the loop
+
+M700: ; Do the last irregular part
+ ; The size of this part is 1 - 16 bytes.
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the preceding regular part, than to make possibly mispredicted
+ ; branches depending on the size of the last part.
+ mov rax, Rdest2 ; dest
+ movq qword [rax+Rcount2-10H], xmm0
+ movq qword [rax+Rcount2-8], xmm0
+ ret
+
+
+; memsetCPUDispatch: initial target of the memsetDispatch pointer.
+; Initializes MemsetCacheLimit, selects memsetAVX if Store256BitIsFaster
+; returns nonzero and memsetSSE2 otherwise, stores the choice in
+; memsetDispatch and jumps to it with the caller's argument registers restored.
+memsetCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
+ ; This part is executed only once
+ ; Save rbx (used as scratch) and the registers that can carry the caller's
+ ; arguments in either calling convention
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ push r8
+ ; set MemsetCacheLimit to half the size of the largest level cache
+ call GetMemsetCacheLimit@
+ lea rbx, [memsetSSE2@]
+ call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
+ test eax, eax
+ jz Q100
+ lea rbx, [memsetAVX@]
+Q100:
+ ; Insert appropriate pointer
+ mov [memsetDispatch], rbx
+ mov rax, rbx ; keep the target in rax because rbx is restored below
+ pop r8
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ ; Jump according to the replaced function pointer
+ jmp rax
+
+
+; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
+; Returns MemsetCacheLimit in rax. If it is still zero, it is first set to
+; half the value returned by DataCacheSize(0), or to 4 Mbytes if that is zero.
+GetMemsetCacheLimit:
+GetMemsetCacheLimit@:
+ mov rax, [MemsetCacheLimit]
+ test rax, rax
+ jnz U200
+ ; Get half the size of the largest level cache
+%ifdef WINDOWS
+ xor ecx, ecx ; 0 means largest level cache
+%else
+ xor edi, edi ; 0 means largest level cache
+%endif
+ call DataCacheSize ; get cache size
+ shr eax, 1 ; half the size
+ jnz U100
+ mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
+ ; Only the low dword is written; the high dword of the qword is already
+ ; zero because this path is only reached when the whole qword was zero
+U100: mov [MemsetCacheLimit], eax
+U200: ret
+
+; extern "C" void SetMemsetCacheLimit(); // Change limit in GetMemsetCacheLimit
+; The code reads the new limit from the first integer argument register
+; (rcx on Windows, rdi otherwise), although the prototype above lists no
+; parameter. A zero argument selects the default computed by
+; GetMemsetCacheLimit. The limit now in effect is left in rax.
+SetMemsetCacheLimit:
+%ifdef WINDOWS
+ mov rax, rcx
+%else
+ mov rax, rdi
+%endif
+ test rax, rax
+ jnz U400
+ ; zero, means default
+ mov [MemsetCacheLimit], rax ; clear the limit so that GetMemsetCacheLimit recomputes it
+ call GetMemsetCacheLimit@
+U400: mov [MemsetCacheLimit], rax
+ ret
+
+
+SECTION .data
+align 16
+; Jump table for count from 0 to 16:
+; 17 code pointers, indexed with [r10+Rcount*8] at label B050
+MemsetJTab:DQ M00, M01, M02, M03, M04, M05, M06, M07
+ DQ M08, M09, M10, M11, M12, M13, M14, M15, M16
+
+; Pointer to appropriate version.
+; This initially points to memsetCPUDispatch. memsetCPUDispatch will
+; change this to the appropriate version of memset, so that
+; memsetCPUDispatch is only executed once:
+memsetDispatch: DQ memsetCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > MemsetCacheLimit
+; The optimal value of MemsetCacheLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache
+; Zero means "not yet determined"; see GetMemsetCacheLimit
+MemsetCacheLimit: DQ 0
diff --git a/contrib/libs/asmlib/mersenne64.asm b/contrib/libs/asmlib/mersenne64.asm new file mode 100644 index 0000000000..758075d61d --- /dev/null +++ b/contrib/libs/asmlib/mersenne64.asm @@ -0,0 +1,616 @@ +%include "defs.asm" + +; ----------------------------- MERSENNE64.ASM ---------------------------
+; Author: Agner Fog
+; Date created: 1998
+; Last modified: 2013-09-13
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+; Description:
+; Random Number generator 'Mersenne Twister' type MT11213A (or MT19937)
+;
+;
+; This random number generator is described in the article by
+; M. Matsumoto & T. Nishimura, in:
+; ACM Transactions on Modeling and Computer Simulation,
+; vol. 8, no. 1, 1998, pp. 3-30. See also:
+; http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+;
+; Initialization:
+; MersRandomInit must be called before the first call to any of the other
+; random number functions. The seed is any 32-bit integer.
+; You may use MersRandomInitByArray instead if you want more
+; than 32 bits for seed. length is the number of integers in seeds[].
+; length must be > 0, there is no upper limit for length.
+;
+; Generating random numbers:
+; MersRandom returns a floating point number in the interval 0 <= x < 1 with
+; a resolution of 32 bits.
+; MersIRandom returns an integer in the interval defined by min and max with
+; a resolution of 32 bits.
+; MersIRandomX returns an integer in the interval defined by min and max with
+; exactly equal probabilities of all values in the interval.
+; MersBRandom returns 32 random bits.
+;
+; Error conditions:
+; If MersRandomInit or MersRandomInitByArray has not been called then MersRandom
+; and MersBRandom keep returning 0, and MersIRandom and MersIRandomX return min.
+; MersIRandom and MersIRandomX return a large negative number if max < min.
+;
+; C++ prototypes in randoma.h:
+; Thread-safe versions:
+; extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
+; extern "C" void MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length); // Seed by more than 32 bits
+; extern "C" int MersIRandom (void * Pthis, int min, int max); // Output random integer
+; extern "C" int MersIRandomX(void * Pthis, int min, int max); // Output random integer, exact
+; extern "C" double MersRandom(void * Pthis); // Output random float
+; extern "C" unsigned int MersBRandom(void * Pthis); // Output random bits
+;
+; Single-threaded versions:
+; extern "C" void MersenneRandomInit(int seed); // Re-seed
+; extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length); // Seed by more than 32 bits
+; extern "C" int MersenneIRandom (int min, int max); // Output random integer
+; extern "C" int MersenneIRandomX(int min, int max); // Output random integer, exact
+; extern "C" double MersenneRandom(); // Output random float
+; extern "C" unsigned int MersenneBRandom(); // Output random bits
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+global MersenneRandomInit, MersenneRandomInitD, MersRandomInit
+global MersenneRandomInitByArray, MersenneRandomInitByArrayD, MersRandomInitByArray
+global MersenneBRandom, MersenneBRandomD, MersBRandom
+global MersenneRandom, MersenneRandomD, MersRandom
+global MersenneIRandom, MersenneIRandomD, MersIRandom
+global MersenneIRandomX, MersenneIRandomXD, MersIRandomX
+
+
+section .data
+align 16
+
+; Data for single instance of random number generator
+MersenneInstance: ISTRUC CRandomMersenneA
+IEND
+; Size of structure
+MersenneSize equ $ - MersenneInstance
+
+
+SECTION .CODE ALIGN=16
+
+MersenneRandomInit: ; PROC: single-threaded entry, seeds the static MersenneInstance
+%IFDEF UNIX
+ mov edx, edi ; seed, passed in edi
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersRandomInit ; shared body expects rcx = Pthis, edx = seed
+%ENDIF
+%IFDEF WINDOWS
+MersenneRandomInitD: ; alias
+ mov edx, ecx ; seed, passed in ecx
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ ;jmp ?Windows_MersRandomInit ; disabled: execution falls through into MersRandomInit below
+%ENDIF
+;MersenneRandomInit ENDP
+
+
+; Thread-safe version:
+; extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
+MersRandomInit: ; PROC: seed the generator at Pthis from one 32-bit seed
+%IFDEF UNIX
+ ; translate calling convention
+ mov edx, esi ; seed
+ mov rcx, rdi ; Pthis
+%ENDIF
+ ; parameters: rcx = Pthis, edx = seed
+ and rcx, -16 ; align buffer
+ ?Windows_MersRandomInit: ; internal entry: rcx = Pthis (already aligned), edx = seed
+ call Mers_init0 ; initialize mt buffer with seeds
+
+ ; Number of premade numbers that are lost in the initialization when the
+ ; SSE2 implementation makes up to 4 premade numbers at a time:
+%IF MERS_N & 3
+ PREMADELOST equ (MERS_N & 3)
+%ELSE
+ PREMADELOST equ 4
+%ENDIF
+ ; We want the C++ and the assembly implementation to give exactly the same
+ ; sequence. The C++ version discards 37 random numbers after initialization.
+ ; The assembly version generates a sequence that is PREMADELOST + 1 numbers
+ ; behind. Therefore we discard the first 37 + PREMADELOST + 1 numbers.
+ ; (The non-SSE2 count of 37 + 1 is unused: the Instset check below is commented out.)
+
+ push rbx ; rbx is used as the loop counter and restored before returning
+ mov ebx, 37+PREMADELOST+1
+ ; CMP dword [rcx+CRandomMersenneA.Instset], 4 ; can we use XMM registers and SSE2 ?
+ ; jae M110
+ ; sub ebx, PREMADELOST ; SSE2 not supported
+ ; mov dword [rcx+CRandomMersenneA.PreInx], 0 ; reset index to premade list
+M110: ; loop (label only referenced by the disabled code above)
+M120: call ?Windows_MersBRandom ; generate one number and discard it
+ dec ebx
+ jnz M120
+ pop rbx
+ ret
+;MersRandomInit ENDP
+
+
+Mers_init0: ; make random seeds from edx and put them into MT buffer
+; Input parameters:
+; rcx points to CRandomMersenneA
+; edx: seed
+; rcx unchanged by procedure (rdi is saved and restored; eax and edx are overwritten)
+
+ push rdi
+ ; clear my buffer
+ push rcx
+ mov rdi, rcx ; Pthis
+ add rdi, 16 ; leave the first 16 bytes of the structure untouched
+ mov ecx, (MersenneSize - 16) / 4 ; number of dwords to clear
+ xor eax, eax
+ cld
+ rep stosd
+ pop rcx ; Pthis
+ mov edi, edx ; seed
+
+ ; initialize CRandomMersenneA structure
+ mov dword [rcx+CRandomMersenneA.PreInx], 4*4 ; premade list starts out empty (index past its 4 entries)
+ mov dword [rcx+CRandomMersenneA.Instset], 4
+ mov eax, MERS_B
+ mov [rcx+CRandomMersenneA.TMB], eax ; four copies of MERS_B for the packed tempering step
+ mov [rcx+CRandomMersenneA.TMB+4], eax
+ mov [rcx+CRandomMersenneA.TMB+8], eax
+ mov [rcx+CRandomMersenneA.TMB+12], eax
+ mov eax, MERS_C
+ mov [rcx+CRandomMersenneA.TMC], eax ; four copies of MERS_C for the packed tempering step
+ mov [rcx+CRandomMersenneA.TMC+4], eax
+ mov [rcx+CRandomMersenneA.TMC+8], eax
+ mov [rcx+CRandomMersenneA.TMC+12], eax
+ mov eax, 3FF00000H ; upper dword of 1.0, double precision
+ mov [rcx+CRandomMersenneA.one+4], eax ; low dwords stay 0 from the clear above
+ mov [rcx+CRandomMersenneA.one+12], eax
+ mov dword [rcx+CRandomMersenneA.LMASK], LOWER_MASK
+ mov dword [rcx+CRandomMersenneA.UMASK], UPPER_MASK
+ mov dword [rcx+CRandomMersenneA.MATA], MERS_A
+
+ ; put random numbers into MT buffer
+ xor eax, eax ; eax = index into MT, edi = current value (starts as the seed)
+M210: mov [rcx+rax*4+CRandomMersenneA.MT], edi
+ mov edx, edi
+ shr edi, 30
+ xor edi, edx ; s ^ (s >> 30)
+ imul edi, 1812433253
+ inc eax
+ add edi, eax ; + index of the element about to be stored
+ cmp eax, MERS_N
+ jb M210
+
+ ; Set index MTI to end of list, (scaled by 4)
+ ; Round up to multiple of 4 to avoid alignment error
+ mov dword [rcx+CRandomMersenneA.MTI], ((MERS_N+3) & (-4)) * 4
+
+ pop rdi
+ ret
+
+
+; Single threaded version:
+; extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length);
+
+MersenneRandomInitByArray: ; PROC ; single-threaded entry, seeds the static MersenneInstance from an array
+%IFDEF UNIX
+ mov r8d, esi ; length
+ mov rdx, rdi ; seeds
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersRandomInitByArray ; shared body expects rcx = Pthis, rdx = seeds, r8d = length
+%ENDIF
+%IFDEF WINDOWS
+MersenneRandomInitByArrayD: ; LABEL NEAR ; alias
+ mov r8d, edx ; length
+ mov rdx, rcx ; seeds (must be copied before rcx is overwritten)
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersRandomInitByArray ; shared body expects rcx = Pthis, rdx = seeds, r8d = length
+%ENDIF
+;MersenneRandomInitByArray ENDP
+
+; Thread-safe version:
+; extern "C" void MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length);
+MersRandomInitByArray: ; PROC: seed the generator at Pthis from an array of 32-bit seeds
+%IFDEF UNIX
+ ; translate calling convention
+ mov r8d, edx ; length
+ mov rdx, rsi ; seeds
+ mov rcx, rdi ; Pthis
+%ENDIF
+
+?Windows_MersRandomInitByArray:
+; parameters: rcx = Pthis, rdx = seeds, r8d = length
+
+ and rcx, -16 ; align buffer
+ push rbx ; rbx, rsi, rdi, rbp are used below and restored before returning
+ push rsi
+ push rdi
+ push rbp
+ mov rbx, rdx ; seeds
+ mov ebp, r8d ; length
+
+ mov edx, 19650218
+ call Mers_init0 ; init0(19650218); (rcx unchanged)
+
+ mov r8d, ebp ; r8d = length, ebp = k
+ test ebp, ebp
+ jle M380 ; error: length <= 0 (seeds are ignored, state stays as set by Mers_init0)
+ xor edi, edi ; j = 0
+ lea esi, [rdi+1] ; i = 1
+ cmp ebp, MERS_N
+ ja M310
+ mov ebp, MERS_N ; k = max (MERS_N,length)
+M310:
+
+ ; for (; k; k--) {
+M320: mov eax, [rcx+rsi*4-4+CRandomMersenneA.MT] ; mt[i-1]
+ mov edx, eax
+ shr eax, 30
+ xor eax, edx ; mt[i-1] ^ (mt[i-1] >> 30)
+ imul eax, 1664525 ; * 1664525
+ xor eax, [rcx+rsi*4+CRandomMersenneA.MT] ; ^ mt[i]
+ add eax, [rbx+rdi*4] ; + seeds[j]
+ add eax, edi ; + j
+ mov [rcx+rsi*4+CRandomMersenneA.MT], eax ; save in mt[i]
+ inc esi ; i++
+ inc edi ; j++
+ cmp esi, MERS_N
+ jb M330 ; if (i>=MERS_N)
+ mov eax, [rcx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
+ mov [rcx+CRandomMersenneA.MT], eax
+ mov esi, 1 ; i=1;
+M330:
+ cmp edi, r8d ; length
+ jb M340 ; if (j>=length)
+ xor edi, edi ; j = 0;
+M340:
+ dec ebp ; k--
+ jnz M320 ; first k loop
+M350:
+ mov ebp, MERS_N-1 ; k
+M360: mov eax, [rcx+rsi*4-4+CRandomMersenneA.MT] ; mt[i-1]
+ mov edx, eax
+ shr eax, 30
+ xor eax, edx ; mt[i-1] ^ (mt[i-1] >> 30)
+ imul eax, 1566083941 ; * 1566083941
+ xor eax, [rcx+rsi*4+CRandomMersenneA.MT] ; ^ mt[i]
+ sub eax, esi ; - i
+ mov [rcx+rsi*4+CRandomMersenneA.MT], eax ; save in mt[i]
+ inc esi ; i++
+ cmp esi, MERS_N
+ jb M370 ; if (i>=MERS_N)
+ mov eax, [rcx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
+ mov [rcx+CRandomMersenneA.MT], eax
+ mov esi, 1 ; i=1;
+M370:
+ dec ebp ; k--
+ jnz M360 ; second k loop
+ mov dword [rcx+CRandomMersenneA.MT], 80000000H ; mt[0] = 0x80000000
+M380:
+ mov dword [rcx+CRandomMersenneA.MTI], 0 ; restart reading the MT buffer from its beginning
+ mov dword [rcx+CRandomMersenneA.PreInx], 0 ; restart the premade list from its first entry
+
+; discard first MERS_N random numbers + PREMADELOST+1 to compensate for lag
+ mov edi, MERS_N + PREMADELOST+1 ; edi = loop counter (not modified by ?Windows_MersBRandom)
+M391: call ?Windows_MersBRandom
+ dec edi
+ jnz M391
+
+ pop rbp ; restore registers
+ pop rdi
+ pop rsi
+ pop rbx
+ ret
+;MersRandomInitByArray ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneBRandom(); // Output random bits
+
+MersenneBRandom: ; PROC ; entry for both Windows and Linux call (takes no arguments)
+%IFDEF WINDOWS
+MersenneBRandomD: ; LABEL NEAR ; alias
+%ENDIF
+ lea rcx, [MersenneInstance] ; Point to instance
+ jmp ?Windows_MersBRandom ; continue in the shared body with rcx = Pthis
+;MersenneBRandom ENDP
+
+; Thread-safe version:
+; extern "C" unsigned int MersBRandom(void * Pthis); // Output random bits
+
+MersBRandom: ; PROC: return 32 random bits in eax for the generator at Pthis
+%IFDEF UNIX
+ mov rcx, rdi ; translate calling convention
+%ENDIF
+
+?Windows_MersBRandom: ; LABEL NEAR ; Label used internally (rcx = Pthis; rcx is left aligned on return)
+ and rcx, -16 ; align buffer
+ mov edx, [rcx+CRandomMersenneA.PreInx] ; index into premade numbers (byte offset, 4 per number)
+ mov eax, [rcx+rdx*1+CRandomMersenneA.PreInt] ; fetch premade random number
+ add edx, 4
+ mov [rcx+CRandomMersenneA.PreInx], edx
+ cmp edx, 4*4
+ jnb M410 ; that was the last of the 4 premade numbers
+ ret ; return premade number
+
+M410:
+; PREMADE list is empty. Make 4 more numbers ready for next call:
+ mov edx, [rcx+CRandomMersenneA.MTI] ; fetch 4 numbers from MT buffer
+ movdqa xmm0, oword [rcx+rdx*1+CRandomMersenneA.MT]
+
+%IF TEMPERING ; optional tempering algorithm
+ movdqa xmm1, xmm0
+ psrld xmm0, MERS_U
+ pxor xmm0, xmm1 ; y ^= y >> MERS_U
+ movdqa xmm1, xmm0
+ pslld xmm0, MERS_S
+ pand xmm0, oword [rcx+CRandomMersenneA.TMB]
+ pxor xmm0, xmm1 ; y ^= (y << MERS_S) & TMB
+ movdqa xmm1, xmm0
+ pslld xmm0, MERS_T
+ pand xmm0, oword [rcx+CRandomMersenneA.TMC]
+ pxor xmm0, xmm1 ; y ^= (y << MERS_T) & TMC
+ movdqa xmm1, xmm0
+ psrld xmm0, MERS_L
+ pxor xmm0, xmm1 ; y ^= y >> MERS_L
+%ENDIF ; tempering
+
+ ; save four premade integers
+ movdqa oword [rcx+CRandomMersenneA.PreInt], xmm0
+ ; premake four floating point numbers
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ punpckldq xmm1, xmm0 ; get first two numbers into bits 32-63 and 96-127
+ punpckhdq xmm2, xmm0 ; get next two numbers into bits 32-63 and 96-127
+ psrlq xmm1, 12 ; get bits into mantissa position
+ psrlq xmm2, 12 ; get bits into mantissa position
+ por xmm1,oword[rcx+CRandomMersenneA.one] ; set exponent for interval [1,2)
+ por xmm2,oword[rcx+CRandomMersenneA.one] ; set exponent for interval [1,2)
+ movdqa oword [rcx+CRandomMersenneA.PreFlt], xmm1 ; store two premade numbers
+ movdqa oword [rcx+CRandomMersenneA.PreFlt+16],xmm2; store two more premade numbers
+ mov dword [rcx+CRandomMersenneA.PreInx], 0 ; index to premade numbers
+ add edx, 4*4 ; increment MTI index into MT buffer by 4
+ mov [rcx+CRandomMersenneA.MTI], edx
+ cmp edx, MERS_N*4
+ jae M420 ; MT buffer used up: regenerate it before returning
+ ret ; return random number in eax
+
+; MT buffer exhausted. Make MERS_N new numbers ready for next time
+M420: ; eax is the random number to return (not modified below)
+%IF MERS_N & 3 ; if MERS_N is not divisible by 4
+ NVALID equ MERS_N & 3 ; only NVALID of the 4 premade numbers are valid
+ ; Move premade numbers (4-NVALID) positions forward
+ movdqa xmm0, [rcx+CRandomMersenneA.PreInt]
+ movdqa xmm1, [rcx+CRandomMersenneA.PreFlt]
+ movdqa xmm2, [rcx+CRandomMersenneA.PreFlt+16]
+ movdqu [rcx+CRandomMersenneA.PreInt + (4-NVALID)*4], xmm0
+ movdqu [rcx+CRandomMersenneA.PreFlt + (4-NVALID)*8], xmm1
+%IF NVALID == 3
+ movq [rcx+CRandomMersenneA.PreFlt+16 + 8], xmm2 ; third valid double goes into the last slot
+%ENDIF
+ ; save index to first valid premade number
+ ; (explicit dword size: a store of an immediate to memory has no implied operand size)
+ mov dword [rcx+CRandomMersenneA.PreInx], (4-NVALID)*4
+%ENDIF
+
+; MT buffer is empty. Fill it up (eax holds the return value and is not touched here)
+ push rbx ; rbx is used as the kk byte offset and restored before returning
+ movd xmm3, [rcx+CRandomMersenneA.UMASK] ; load constants
+ movd xmm4, [rcx+CRandomMersenneA.LMASK]
+ movd xmm5, [rcx+CRandomMersenneA.MATA]
+ pshufd xmm3, xmm3, 0 ; broadcast constants
+ pshufd xmm4, xmm4, 0
+ pshufd xmm5, xmm5, 0
+ xor rbx, rbx ; kk = 0
+ mov edx, MERS_M*4 ; km
+
+; change rcx from pointing to CRandomMersenneA to pointing to CRandomMersenneA.MT
+ add rcx, CRandomMersenneA.MT
+
+M430: ; kk loop (rbx = kk*4, rdx = km*4, four elements per iteration)
+ movdqa xmm2, [rcx+rbx] ; mt[kk]
+ movd xmm0, dword [rcx+rbx+16]
+ movdqa xmm1, [rcx+rbx] ; mt[kk]
+ movss xmm2, xmm0 ; faster than movdqu xmm2,[]
+ pshufd xmm2, xmm2, 00111001B ; mt[kk+1]
+ movdqu xmm0, oword [rcx+rdx] ; mt[km]
+ ;movq xmm0, qword [rcx+rdx] ; mt[km]
+ ;movhps xmm0, qword [rcx+rdx+8] ; faster than movdqu on older processors
+ pand xmm1, xmm3 ; mt[kk] & UPPER_MASK
+ pand xmm2, xmm4 ; mt[kk+1] & LOWER_MASK
+ por xmm1, xmm2 ; y
+ movdqa xmm2, xmm1 ; y
+ pslld xmm1, 31 ; copy bit 0 into all bits
+ psrad xmm1, 31 ; -(y & 1)
+ pand xmm1, xmm5 ; & MERS_A
+ psrld xmm2, 1 ; y >> 1
+ pxor xmm0, xmm1
+ pxor xmm0, xmm2 ; mt[km] ^ (y >> 1) ^ (-(y & 1) & MERS_A)
+ movdqa [rcx+rbx], xmm0 ; result into mt[kk]
+ cmp ebx, (MERS_N-4)*4
+ jae M440 ; exit loop when kk past end of buffer
+ add ebx, 16 ; kk += 4
+ add rdx, 16 ; km += 4 (signed)
+ cmp edx, (MERS_N-4)*4
+ jbe M430 ; skip unless km wraparound
+ sub rdx, MERS_N*4 ; km wraparound (signed)
+ movdqu xmm0, [rcx+(MERS_N-4)*4] ; copy end to before begin for km wraparound
+ movdqa [rcx-4*4], xmm0
+ movdqa xmm0, [rcx] ; copy begin to after end for kk wraparound
+ movdqu [rcx+MERS_N*4], xmm0
+ jmp M430
+
+M440: ; loop finished. discard excess part of last result
+
+; change rcx back to pointing to CRandomMersenneA
+ sub rcx, CRandomMersenneA.MT
+
+ mov dword [rcx+CRandomMersenneA.MTI], 0 ; next fetch starts at the beginning of the MT buffer
+ pop rbx
+ ret ; random number is still in eax
+
+;MersBRandom ENDP
+
+
+; Single threaded version:
+; extern "C" double MersenneRandom(); // Get floating point random number
+
+MersenneRandom: ; PROC ; entry for both Windows and Linux call, uses the static MersenneInstance
+%IFDEF WINDOWS
+MersenneRandomD: ; alias
+ lea rcx, [MersenneInstance] ; Point to instance
+ ; continue in next function
+%ENDIF
+%IFDEF UNIX
+ lea rdi, [MersenneInstance] ; Point to instance
+ ; continue in next function
+%ENDIF
+
+; Thread-safe version:
+; extern "C" double MersRandom(void * Pthis); // Get floating point random number
+; Returns the premade double for the current position, minus 1.0, in xmm0,
+; then advances the generator by calling ?Windows_MersBRandom.
+MersRandom:
+%IFDEF UNIX
+ mov rcx, rdi ; translate calling convention
+%ENDIF
+ ; Align Pthis before touching the structure, like every other entry point.
+ ; ?Windows_MersBRandom aligns rcx in place, so without this the TmpFlt store
+ ; below and the TmpFlt load after the call would use different addresses
+ ; whenever the caller passes an unaligned Pthis.
+ and rcx, -16 ; align buffer
+ mov edx, [rcx+CRandomMersenneA.PreInx] ; index into premade numbers (byte offset, 4 per number)
+ movsd xmm0, [rcx+rdx*2+CRandomMersenneA.PreFlt] ; fetch premade floating point random number (8 bytes each)
+ subsd xmm0, [rcx+CRandomMersenneA.one] ; subtract 1.0
+ movsd [rcx+CRandomMersenneA.TmpFlt], xmm0 ; store random number (the call below overwrites xmm0)
+ call ?Windows_MersBRandom ; prepare next random number
+ movsd xmm0, [rcx+CRandomMersenneA.TmpFlt] ; recall random number
+ ret
+;MersenneRandom ENDP
+
+
+
+; Single threaded version:
+; extern "C" int MersenneIRandom(int min, int max); // Get integer random number in desired interval
+
+MersenneIRandom: ; PROC: single-threaded entry, uses the static MersenneInstance
+%IFDEF UNIX
+ push rsi ; max
+ push rdi ; min
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp MersIRandom_max_min_on_stack ; shared body pops min, then max
+%ENDIF
+%IFDEF WINDOWS
+MersenneIRandomD: ; Alias
+ push rdx ; max
+ push rcx ; min (pushed before rcx is overwritten below)
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp MersIRandom_max_min_on_stack ; shared body pops min, then max
+%ENDIF
+;MersenneIRandom ENDP
+
+; Thread-safe version:
+; extern "C" int MersIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+MersIRandom: ; PROC: return a random integer in [min, max] in eax
+%IFDEF UNIX
+ ; translate calling convention
+ mov r8d, edx ; max
+ mov edx, esi ; min
+ mov rcx, rdi ; Pthis
+%ENDIF
+ push r8 ; max
+ push rdx ; min
+MersIRandom_max_min_on_stack: ; rcx = Pthis, stack top = min, below it = max
+
+ call ?Windows_MersBRandom ; random bits
+ pop rcx ; min
+ pop rdx ; max
+ sub edx, ecx
+ js short M720 ; max < min
+ add edx, 1 ; interval = max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [rdx+rcx] ; add min (edx = high 32 bits of the product)
+ ret
+M720: mov eax, 80000000H ; error exit
+ ret
+;MersIRandom ENDP
+
+
+; Single threaded version:
+; extern "C" int MersenneIRandomX(int min, int max); // Get integer random number in desired interval
+
+MersenneIRandomX: ; PROC: single-threaded entry, uses the static MersenneInstance
+%IFDEF UNIX
+ mov r8d, esi ; max
+ mov edx, edi ; min
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersIRandomX ; shared body expects rcx = Pthis, edx = min, r8d = max
+%ENDIF
+%IFDEF WINDOWS
+MersenneIRandomXD: ; alias
+ mov r8d, edx ; max (copied before edx is overwritten)
+ mov edx, ecx ; min (copied before rcx is overwritten)
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersIRandomX ; shared body expects rcx = Pthis, edx = min, r8d = max
+%ENDIF
+;MersenneIRandomX ENDP
+
+; Thread-safe version:
+; extern "C" int MersIRandomX(void * Pthis, int min, int max); // Get integer random number in desired interval
+MersIRandomX: ; PROC: return a random integer in [min, max] in eax, using a rejection loop
+%IFDEF UNIX
+ ; translate calling convention
+ mov r8d, edx ; max
+ mov edx, esi ; min
+ mov rcx, rdi ; Pthis
+%ENDIF
+
+?Windows_MersIRandomX:
+; parameters: rcx = Pthis, edx = min, r8d = max
+
+ and rcx, -16 ; align buffer
+ push rdi ; rdi holds the interval below and is restored on every exit path
+ mov edi, r8d ; max
+
+ sub edi, edx ; max - min
+ jle short M830 ; max <= min (signed)
+ inc edi ; interval = max - min + 1
+ push rdx ; save min
+
+ ; if (interval != LastInterval) {
+ cmp edi, [rcx+CRandomMersenneA.LastInterval]
+ je M810 ; RLimit cached for this interval is still valid
+ ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
+ xor eax, eax ; 0
+ lea edx, [rax+1] ; 1
+ div edi ; (would give overflow if interval = 1; excluded by the jle above)
+ mul edi
+ dec eax
+ mov [rcx+CRandomMersenneA.RLimit], eax
+ mov [rcx+CRandomMersenneA.LastInterval], edi
+M810:
+M820: ; do { // Rejection loop
+ call ?Windows_MersBRandom ; random bits (rcx is preserved)
+ ; longran = (uint64)BRandom() * interval;
+ mul edi ; edx = high dword (candidate result), eax = low dword (remainder)
+ ; } while (remainder > RLimit);
+ cmp eax, [rcx+CRandomMersenneA.RLimit]
+ ja M820
+
+ ; return (int32)iran + min
+ pop rax ; min
+ add eax, edx
+ pop rdi
+ ret
+
+M830: jl M840 ; flags are still those of the sub above
+ ; max = min. Return min
+ mov eax, edx
+ pop rdi
+ ret ; max = min exit
+
+M840: ; max < min: error
+ mov eax, 80000000H ; error exit
+ pop rdi
+ ret
+;MersIRandomX ENDP
diff --git a/contrib/libs/asmlib/mother64.asm b/contrib/libs/asmlib/mother64.asm new file mode 100644 index 0000000000..c6fd34ec3b --- /dev/null +++ b/contrib/libs/asmlib/mother64.asm @@ -0,0 +1,242 @@ +%include "defs.asm" + +; ----------------------------- MOTHER64.ASM -----------------------------
+; Author: Agner Fog
+; Date created: 1998
+; Last modified: 2013-09-11
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+; Description:
+; Mother-of-All random number generator by Agner Fog
+; 64-bit mode version for x86-64 compatible microprocessors.
+;
+; This is a multiply-with-carry type of random number generator
+; invented by George Marsaglia. The algorithm is:
+; S = 2111111111*X[n-4] + 1492*X[n-3] + 1776*X[n-2] + 5115*X[n-1] + C
+; X[n] = S modulo 2^32
+; C = floor(S / 2^32)
+;
+; C++ prototypes:
+; extern "C" void MotRandomInit(void * Pthis, int seed); // Initialization
+; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+; extern "C" double MotRandom(void * Pthis); // Get floating point random number
+; extern "C" unsigned int MotBRandom(void * Pthis); // Output random bits
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+; publics:
+global MotherBRandom, MotBRandom, ?Windows_MotBRandom
+global MotherRandom, MotRandom, MotherIRandom, MotIRandom
+global MotherRandomInit, MotRandomInit
+
+section .data
+align 16
+
+; Data for single instance of random number generator
+MotherInstance: ISTRUC CRandomMotherA
+IEND
+; Size of structure
+MotherSize equ $-MotherInstance
+
+
+SECTION .CODE ALIGN=16 ; code segment
+
+; Single threaded version:
+; extern "C" unsigned int MotherBRandom(); // Output random bits
+
+MotherBRandom: ; PROC ; entry for both Windows and Linux call (takes no arguments)
+ lea rcx, [MotherInstance] ; Point to instance
+ jmp ?Windows_MotBRandom ; continue in the shared body with rcx = Pthis
+;MotherBRandom ENDP
+
+; Thread-safe version:
+; extern "C" unsigned int MotBRandom(void * Pthis); // Output random bits
+
+MotBRandom: ; PROC: return 32 random bits in eax and advance the generator at Pthis
+%IFDEF UNIX
+ mov rcx, rdi ; translate calling convention
+%ENDIF
+?Windows_MotBRandom: ; internal entry: rcx = Pthis; uses xmm1-xmm3, leaves xmm0 untouched
+ and rcx, -16 ; align
+ movdqa xmm1, oword [rcx+CRandomMotherA.M3] ; load M3,M2,M1,M0
+ mov eax, [rcx+CRandomMotherA.M0] ; Retrieve previous random number
+ movdqa xmm2, xmm1 ; copy
+ movdqa xmm3, oword [rcx+CRandomMotherA.MF3] ; factors
+ psrlq xmm2, 32 ; move M2,M0 down
+ movq qword [rcx+CRandomMotherA.M4], xmm1 ; M4=M3, M3=M2
+ movhps qword [rcx+CRandomMotherA.M2], xmm1 ; M2=M1, M1=M0
+ pmuludq xmm1, xmm3 ; M3*MF3, M1*MF1
+ psrlq xmm3, 32 ; move MF2,MF0 down
+ pmuludq xmm2, xmm3 ; M2*MF2, M0*MF0
+ paddq xmm1, xmm2 ; P2+P3, P0+P1
+ movhlps xmm2, xmm1 ; Get high qword
+ paddq xmm1, xmm2 ; P0+P1+P2+P3
+ paddq xmm1, oword [rcx+CRandomMotherA.MC] ; +carry
+ movq qword [rcx+CRandomMotherA.M0], xmm1 ; Store new M0 and carry
+ ; convert to double precision float
+ psllq xmm1, 32 ; Discard carry bits
+ psrlq xmm1, 12 ; Get bits into mantissa position
+ por xmm1, oword [rcx+CRandomMotherA.one] ; Add exponent bits to get number in interval [1,2)
+ movq [rcx+CRandomMotherA.RanP1], xmm1 ; Store floating point number
+ ret ; eax = the M0 value read before this update
+
+;MotBRandom ENDP
+
+
+; Single threaded version:
+; extern "C" double MotherRandom(); // Get floating point random number
+
+MotherRandom: ; single-threaded entry: selects the static MotherInstance, then falls through
+%IFDEF UNIX
+ lea rdi, [MotherInstance] ; Point to instance
+%ENDIF
+%IFDEF WINDOWS
+ lea rcx, [MotherInstance] ; Point to instance
+%ENDIF
+
+; Thread-safe version:
+; extern "C" double MotRandom(void * Pthis); // Get floating point random number
+MotRandom: ; returns the previously prepared number minus 1.0 in xmm0
+%IFDEF UNIX
+ mov rcx, rdi ; translate calling convention
+%ENDIF
+ and rcx, -16 ; align
+ ; get previously prepared random number
+ movsd xmm0, [rcx+CRandomMotherA.RanP1]
+ subsd xmm0, [rcx+CRandomMotherA.one] ; subtract 1.0
+
+ ; make new random number ready for next time
+ call ?Windows_MotBRandom ; does not use xmm0, so the result survives the call
+ ret
+;MotherRandom ENDP
+
+
+; Single threaded version:
+; extern "C" int MotherIRandom(int min, int max); // Get integer random number in desired interval
+
+MotherIRandom: ; PROC: single-threaded entry, uses the static MotherInstance
+%IFDEF UNIX
+ mov r8d, esi ; max
+ mov edx, edi ; min
+ lea rcx, [MotherInstance] ; Pthis = point to instance
+ jmp ?Windows_MotIRandom ; shared body expects rcx = Pthis, edx = min, r8d = max
+%ENDIF
+%IFDEF WINDOWS
+ mov r8d, edx ; max (copied before edx is overwritten)
+ mov edx, ecx ; min (copied before rcx is overwritten)
+ lea rcx, [MotherInstance] ; Pthis = point to instance
+ jmp ?Windows_MotIRandom ; shared body expects rcx = Pthis, edx = min, r8d = max
+%ENDIF
+; MotherIRandom ENDP
+
+; Thread-safe version:
+; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+MotIRandom: ; return a random integer in [min, max] in eax
+%IFDEF UNIX
+ ; translate calling convention
+ mov r8d, edx ; max
+ mov edx, esi ; min
+ mov rcx, rdi ; Pthis
+%ENDIF
+
+?Windows_MotIRandom: ; LABEL NEAR ; entry for Windows call (rcx = Pthis, edx = min, r8d = max)
+ and rcx, -16 ; align
+ push r8 ; save max across the call
+ push rdx ; save min across the call
+ call ?Windows_MotBRandom ; make random number
+ pop rcx ; min
+ pop r8 ; max
+ sub r8d, ecx
+ js short rerror ; max < min
+ inc r8d ; interval = max - min + 1
+ mul r8d ; multiply random number eax by interval and truncate
+ lea eax, [rdx+rcx] ; add min to interval*BRandom >> 32
+ ret ; arguments are in registers, nothing to remove from the stack
+
+rerror: mov eax, 80000000h ; error exit
+ ret ; arguments are in registers, nothing to remove from the stack
+;MotIRandom ENDP
+
+
+; Single threaded version:
+; extern "C" void MotherRandomInit(int seed); // Initialization
+
+MotherRandomInit: ; PROC: single-threaded entry, seeds the static MotherInstance
+%IFDEF UNIX
+ mov edx, edi ; seed
+ lea rcx, [MotherInstance] ; Pthis = point to instance
+ jmp ?Windows_MotRandomInit ; shared body expects rcx = Pthis, edx = seed
+%ENDIF
+%IFDEF WINDOWS
+ mov edx, ecx ; seed (copied before rcx is overwritten)
+ lea rcx, [MotherInstance] ; Pthis = point to instance
+ jmp ?Windows_MotRandomInit ; shared body expects rcx = Pthis, edx = seed
+%ENDIF
+;MotherRandomInit ENDP
+
+; Thread-safe version:
+; extern "C" void MotRandomInit(void * Pthis, int seed); // Initialization
+MotRandomInit: ; PROC: seed the generator at Pthis from one 32-bit seed
+%IFDEF UNIX
+ ; translate calling convention
+ mov edx, esi ; seed
+ mov rcx, rdi ; Pthis
+%ENDIF
+
+?Windows_MotRandomInit: ; LABEL NEAR ; entry for Windows call (rcx = Pthis, edx = seed)
+ and rcx, -16 ; align
+ ; clear my buffer
+ push rdi ; rdi is used for the clear and as loop counter, restored before returning
+ push rcx
+ mov rdi, rcx ; Pthis
+ add rdi, 16 ; leave the first 16 bytes of the structure untouched
+ mov ecx, (MotherSize - 16) / 4 ; number of dwords to clear
+ xor eax, eax
+ cld
+ rep stosd
+ pop rcx
+
+ ; insert constants
+ mov dword [rcx+CRandomMotherA.one+4], 3FF00000H ; high dword of 1.0
+ mov dword [rcx+CRandomMotherA.MF0], 5115 ; factors
+ mov dword [rcx+CRandomMotherA.MF1], 1776
+ mov dword [rcx+CRandomMotherA.MF2], 1492
+ mov dword [rcx+CRandomMotherA.MF3], 2111111111
+
+ ; initialize from seed
+ mov eax, edx ; seed
+ ; make random numbers and put them into buffer
+ mov edx, 29943829 ; multiplier: each value is previous * 29943829 - 1
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.M0], eax
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.M1], eax
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.M2], eax
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.M3], eax
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.MC], eax
+
+ ; randomize some more
+ mov edi, 20 ; loop counter
+r90: call ?Windows_MotBRandom ; (rcx and rdi unchanged)
+ dec edi
+ jnz r90
+ pop rdi
+ ret
+;MotRandomInit ENDP
+
+ ; END
diff --git a/contrib/libs/asmlib/physseed64.asm b/contrib/libs/asmlib/physseed64.asm new file mode 100644 index 0000000000..b30fc26712 --- /dev/null +++ b/contrib/libs/asmlib/physseed64.asm @@ -0,0 +1,396 @@ +%include "defs.asm" + +;************************* physseed64.asm **********************************
+; Author: Agner Fog
+; Date created: 2010-08-03
+; Last modified: 2013-09-13
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; C++ prototype:
+; extern "C" int PhysicalSeed(int seeds[], int NumSeeds);
+;
+; Description:
+; Generates a non-deterministic random seed from a physical random number generator
+; which is available on some processors.
+; Uses the time stamp counter (which is less random) if no physical random number
+; generator is available.
+; The code is not optimized for speed because it is typically called only once.
+;
+; Parameters:
+; int seeds[] An array which will be filled with random numbers
+; int NumSeeds Indicates the desired number of 32-bit random numbers
+;
+; Return value: 0 Failure. No suitable instruction available (processor older than Pentium)
+; 1 No physical random number generator. Used time stamp counter instead
+; 2 Success. VIA physical random number generator used
+; 3 Success. Intel physical random number generator used
+; 4 Success. Intel physical seed generator used
+;
+; The return value will indicate the availability of a physical random number generator
+; even if NumSeeds = 0.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+%define NUM_TRIES 20 ; max number of tries for rdseed and rdrand instructions
+
+%define TESTING 0 ; 1 for test only
+
+global PhysicalSeed
+
+; Direct entries to CPU-specific versions
+; (each symbol is followed by ":" and its type, the same form on every line)
+global PhysicalSeedNone: function
+global PhysicalSeedRDTSC: function
+global PhysicalSeedVIA: function
+global PhysicalSeedRDRand: function
+global PhysicalSeedRDSeed: function
+
+; ***************************************************************************
+; Define registers used for function parameters, used in 64-bit mode only
+; ***************************************************************************
+
+%IFDEF WINDOWS
+ %define par1 rcx
+ %define par2 rdx
+ %define par3 r8
+ %define par1d ecx
+ %define par2d edx
+ %define par3d r8d
+%ENDIF
+
+%IFDEF UNIX
+ %define par1 rdi
+ %define par2 rsi
+ %define par3 rdx
+ %define par1d edi
+ %define par2d esi
+ %define par3d edx
+%ENDIF
+
+
+SECTION .text align=16
+
+%IFDEF WINDOWS
+global PhysicalSeedD@8 ; DLL version
+PhysicalSeedD@8: ; alias for the same entry point as PhysicalSeed
+%ENDIF
+
+PhysicalSeed: ; public entry; arguments are left in place for the selected version
+ jmp [PhysicalSeedDispatch] ; Go to appropriate version, depending on instructions available
+
+
+PhysicalSeedRDSeed: ; fill seeds[] with the rdseed instruction; returns 4, or 0 on failure
+ push rbx ; ebx is the retry counter
+ test par2d, par2d ; NumSeeds
+ jz S300 ; nothing to store: still report the method
+ js S900 ; negative NumSeeds: failure
+ mov par3d, par2d ; NumSeeds
+ shr par3d, 1 ; number of 64-bit stores
+ jz S150
+ ; do 64 bits at a time
+S100: mov ebx, NUM_TRIES
+S110: ; rdseed rax
+%if TESTING
+ mov eax, par3d
+ stc
+%ELSE
+ db 48h, 0Fh, 0C7h, 0F8h ; rdseed rax
+%ENDIF
+ jc S120 ; carry set: rax holds a valid value
+ ; failed. try again
+ dec ebx
+ jz S900 ; give up after NUM_TRIES attempts
+ jmp S110
+S120: mov [par1], rax
+ add par1, 8
+ dec par3d
+ jnz S100 ; loop 64 bits
+S150:
+ and par2d, 1
+ jz S300
+ ; an odd 32 bit remains
+S200: mov ebx, NUM_TRIES
+S210: ; rdseed eax
+%if TESTING
+ mov eax, par3d
+ stc
+%ELSE
+ db 0Fh, 0C7h, 0F8h ; rdseed eax
+%ENDIF
+ jc S220 ; carry set: eax holds a valid value
+ ; failed. try again
+ dec ebx
+ jz S900 ; give up after NUM_TRIES attempts
+ jmp S210
+S220: mov [par1], eax
+S300: mov eax, 4 ; return value
+ pop rbx
+ ret
+S900: ; failure
+ xor eax, eax ; return 0
+ pop rbx
+ ret
+
+
+PhysicalSeedRDRand: ; fill seeds[] with the rdrand instruction; returns 3, or 0 on failure
+ push rbx ; ebx is the retry counter
+ test par2d, par2d ; NumSeeds
+ jz R300 ; nothing to store: still report the method
+ js R900 ; negative NumSeeds: failure
+ mov par3d, par2d ; NumSeeds
+ shr par3d, 1 ; NumSeeds/2
+ jz R150
+ ; do 64 bits at a time
+R100: mov ebx, NUM_TRIES
+R110: ; rdrand rax
+%if TESTING
+ mov eax, par3d
+ stc
+%ELSE
+ db 48h, 0Fh, 0C7h, 0F0h ; rdrand rax
+%ENDIF
+ jc R120 ; carry set: rax holds a valid value
+ ; failed. try again
+ dec ebx
+ jz R900 ; give up after NUM_TRIES attempts
+ jmp R110
+R120: mov [par1], rax
+ add par1, 8
+ dec par3d
+ jnz R100 ; loop 64 bits
+R150:
+ and par2d, 1
+ jz R300
+ ; an odd 32 bit remains
+R200: mov ebx, NUM_TRIES
+R210: ; rdrand eax
+%if TESTING
+ mov eax, par3d
+ stc
+%ELSE
+ db 0Fh, 0C7h, 0F0h ; rdrand eax
+%ENDIF
+ jc R220 ; carry set: eax holds a valid value
+ ; failed. try again
+ dec ebx
+ jz R900 ; give up after NUM_TRIES attempts
+ jmp R210
+R220: mov [par1], eax
+R300: mov eax, 3 ; return value: 3 = Intel physical random number generator used
+ ; (4 is reserved for the seed generator, see PhysicalSeedRDSeed)
+ pop rbx
+ ret
+R900: ; failure
+ xor eax, eax ; return 0
+ pop rbx
+ ret
+
+
+PhysicalSeedVIA:
+; VIA XSTORE supported
+; Fills seeds[] with the VIA xstore instruction and returns 2.
+; A negative NumSeeds returns 0 without storing anything, like the other versions.
+ push rbx
+%IFDEF WINDOWS
+ push rsi
+ push rdi
+ mov rdi, rcx ; seeds
+ mov esi, edx ; NumSeeds
+%ENDIF
+ mov ecx, esi ; NumSeeds
+ test ecx, ecx
+ js T900 ; negative NumSeeds would otherwise become a huge byte count
+ and ecx, -2 ; round down to nearest even
+ jz T200 ; NumSeeds <= 1
+ ; make an even number of random dwords
+ shl ecx, 2 ; number of bytes (divisible by 8)
+ mov edx, 3 ; quality factor
+%if TESTING
+ mov eax, 1
+ rep stosb
+%ELSE
+ db 0F3H, 00FH, 0A7H, 0C0H ; rep xstore instruction
+%ENDIF
+T200:
+ test esi, 1
+ jz T300
+ ; NumSeeds is odd. Make 8 bytes in temporary buffer and store 4 of the bytes
+ mov rbx, rdi ; current output pointer
+ mov ecx, 4 ; Will generate 4 or 8 bytes, depending on CPU
+ mov edx, 3 ; quality factor
+ push rcx ; make temporary space on stack
+ mov rdi, rsp ; point to buffer on stack
+%if TESTING
+ mov eax, 1
+ rep stosb
+%ELSE
+ db 0F3H, 00FH, 0A7H, 0C0H ; rep xstore instruction
+%ENDIF
+ pop rax ; fetch the generated bytes and release the temporary space
+ mov [rbx], eax ; store the last 4 bytes
+T300:
+ mov eax, 2 ; return value
+T310: ; common exit, eax = return value
+%IFDEF WINDOWS
+ pop rdi
+ pop rsi
+%ENDIF
+ pop rbx
+ ret
+T900: ; failure: NumSeeds < 0
+ xor eax, eax ; return 0
+ jmp T310
+
+
+PhysicalSeedRDTSC: ; store the time stamp counter in seeds[0..1], zero the rest; returns 1, or 0 if NumSeeds < 0
+%IFDEF WINDOWS
+ push rbx
+ push rcx ; seeds (cpuid overwrites ecx)
+ push rdx ; NumSeeds (cpuid and rdtsc overwrite edx)
+ xor eax, eax
+ cpuid ; serialize
+ rdtsc ; get time stamp counter
+ pop rbx ; numseeds
+ pop rcx ; seeds
+ test ebx, ebx
+ jz U300 ; zero seeds
+ js U900 ; failure
+ mov [rcx], eax ; store time stamp counter as seeds[0]
+ add rcx, 4
+ dec ebx
+ jz U300
+ mov [rcx], edx ; store upper part of time stamp counter as seeds[1]
+ add rcx, 4
+ dec ebx
+ jz U300
+ xor eax, eax
+U100: mov [rcx], eax ; store 0 for the rest
+ add rcx, 4
+ dec ebx
+ jnz U100
+U300: mov eax, 1 ; return value
+ pop rbx
+ ret
+U900: ; failure
+ xor eax, eax ; return 0
+ pop rbx
+ ret
+
+%ELSE ; UNIX
+
+ push rbx ; cpuid overwrites ebx
+ xor eax, eax
+ cpuid ; serialize
+ rdtsc ; get time stamp counter
+ test esi, esi ; numseeds
+ jz U300 ; zero seeds
+ js U900 ; failure
+ mov [rdi], eax ; store time stamp counter as seeds[0]
+ add rdi, 4
+ dec esi
+ jz U300
+ mov [rdi], edx ; store upper part of time stamp counter as seeds[1]
+ add rdi, 4
+ dec esi
+ jz U300
+ xor eax, eax
+U100: mov [rdi], eax ; store 0 for the rest
+ add rdi, 4
+ dec esi
+ jnz U100
+U300: mov eax, 1 ; return value
+ pop rbx
+ ret
+U900: ; failure
+ xor eax, eax ; return 0
+ pop rbx
+ ret
+
+%ENDIF
+
+
+PhysicalSeedNone: ; no possible generation: zero-fill seeds[] and return 0
+ xor eax, eax
+ test par2d, par2d ; numseeds
+ jle N200 ; zero or negative numseeds: store nothing (test clears OF, so jle = zero or sign)
+N100: mov [par1], eax
+ add par1, 4
+ dec par2d
+ jnz N100
+N200: ret ; return 0
+
+
+PhysicalSeedDispatcher: ; choose a version, store its address in PhysicalSeedDispatch, continue in it
+ push rbx ; cpuid overwrites ebx
+%IFDEF WINDOWS
+ push rcx ; preserve the caller's arguments: cpuid overwrites ecx and edx
+ push rdx
+%ENDIF
+ ; test if RDSEED supported
+ xor eax, eax
+ cpuid
+ cmp eax, 7
+ jb P200 ; RDSEED not supported
+ mov eax, 7
+ xor ecx, ecx
+ cpuid
+ bt ebx, 18
+ ; jc USE_RDSEED ; not tested yet!! (disabled, so the RDSEED version is never selected here)
+
+P200: ; test if RDRAND supported
+ mov eax, 1
+ cpuid
+ bt ecx, 30
+ jc USE_RDRAND
+
+ ; test if VIA xstore instruction supported
+ mov eax, 0C0000000H
+ push rax ; keep the leaf number: cpuid overwrites eax
+ cpuid
+ pop rbx
+ cmp eax, ebx
+ jna P300 ; not a VIA processor
+ lea eax, [rbx+1]
+ cpuid
+ bt edx, 3
+ jc VIA_METHOD
+
+P300: ; test if RDTSC supported
+ mov eax, 1
+ cpuid
+ bt edx, 4
+ jc USE_RDTSC ; XSTORE instruction not supported or not enabled
+
+FAILURE: ; No useful instruction supported
+ lea rax, [PhysicalSeedNone]
+ jmp P800
+
+USE_RDRAND: ; Use RDRAND instruction
+ lea rax, [PhysicalSeedRDRand]
+ jmp P800
+
+USE_RDSEED: ; Use RDSEED instruction (not tested yet)
+ lea rax, [PhysicalSeedRDSeed]
+ jmp P800
+
+VIA_METHOD: ; Use VIA xstore instructions
+ lea rax, [PhysicalSeedVIA]
+ jmp P800
+
+USE_RDTSC:
+ lea rax, [PhysicalSeedRDTSC]
+ ;jmp P800 ; not needed: P800 follows directly
+
+P800: mov [PhysicalSeedDispatch], rax ; later calls to PhysicalSeed jump straight to this version
+%IFDEF WINDOWS
+ pop rdx
+ pop rcx
+%ENDIF
+ pop rbx
+ jmp rax ; continue in dispatched version
+
+
+; -----------------------------------------------------------------
+; Data section for dispatcher
+; -----------------------------------------------------------------
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+PhysicalSeedDispatch DQ PhysicalSeedDispatcher
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
diff --git a/contrib/libs/asmlib/popcount64.asm b/contrib/libs/asmlib/popcount64.asm new file mode 100644 index 0000000000..c4ad64e03b --- /dev/null +++ b/contrib/libs/asmlib/popcount64.asm @@ -0,0 +1,112 @@ +%include "defs.asm" + +;************************* popcount64.asm ************************************
+; Author: Agner Fog
+; Date created: 2011-07-20
+; Last modified: 2011-07-20
+
+; Description:
+; Population count function. Counts the number of 1-bits in a 32-bit integer
+; unsigned int A_popcount (unsigned int x);
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+global A_popcount: function
+
+; Direct entries to CPU-specific versions
+global popcountGeneric: function
+global popcountSSE42: function
+
+; Imported from instrset32.asm:
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+; popcount function
+;******************************************************************************
+
+
+; A_popcount: public entry point.
+; Jumps through the pointer stored in popcountDispatch. That pointer initially
+; refers to popcountCPUDispatch, which overwrites it with the address of
+; popcountGeneric or popcountSSE42 the first time it runs.
+A_popcount: ; function dispatching
+ jmp near [popcountDispatch] ; Go to appropriate version, depending on instruction set
+
+align 16
+; popcountSSE42: count the 1-bits of the 32-bit argument using POPCNT.
+; The argument is taken from ecx when WINDOWS is defined, otherwise from edi.
+; The count is returned in eax.
+popcountSSE42: ; SSE4.2 version
+%ifdef WINDOWS
+ popcnt eax, ecx
+%else
+ popcnt eax, edi
+%endif
+ ret
+
+
+;******************************************************************************
+; popcount function generic
+;******************************************************************************
+
+; popcountGeneric: count the 1-bits of the 32-bit argument without POPCNT.
+; The argument is taken from ecx when WINDOWS is defined, otherwise from edi.
+; The count is returned in eax; edx is used as scratch.
+; Method: add neighbouring bit fields in parallel, doubling the field width
+; at each stage (1 -> 2 -> 4 -> 8 bits), then add the four byte sums.
+popcountGeneric: ; Generic version
+%ifdef WINDOWS
+ mov eax, ecx
+%else
+ mov eax, edi
+%endif
+ ; stage 1: sum pairs of single bits into 2-bit fields
+ mov edx, eax
+ shr eax, 1
+ and eax, 55555555h ; odd bits in eax, even bits in edx
+ and edx, 55555555h
+ add eax, edx
+ ; stage 2: sum pairs of 2-bit fields into 4-bit fields
+ mov edx, eax
+ shr eax, 2
+ and eax, 33333333h
+ and edx, 33333333h
+ add eax, edx
+ ; stage 3: sum pairs of 4-bit fields; mask after the add keeps one sum per byte
+ mov edx, eax
+ shr eax, 4
+ add eax, edx
+ and eax, 0F0F0F0Fh
+ ; stage 4: add the four byte sums together; the total ends up in the low byte
+ mov edx, eax
+ shr eax, 8
+ add eax, edx
+ mov edx, eax
+ shr eax, 16
+ add eax, edx
+ and eax, 03FH ; keep the low 6 bits (result range is 0..32)
+ ret
+;popcountGeneric end
+
+; ********************************************************************************
+; CPU dispatching for popcount. This is executed only once
+; ********************************************************************************
+
+%ifdef WINDOWS
+%define par1 rcx ; parameter 1, the integer whose bits are counted
+%else
+%define par1 rdi ; parameter 1, the integer whose bits are counted
+%endif
+
+; popcountCPUDispatch: first-call target of A_popcount.
+; Calls InstructionSet, stores the address of the chosen implementation in
+; popcountDispatch, and then jumps to that implementation with the caller's
+; argument register (par1) restored.
+popcountCPUDispatch:
+ ; get supported instruction set
+ push par1 ; preserve the caller's argument across the call
+ call InstructionSet
+ pop par1
+ ; Point to generic version of popcount
+ lea rdx, [popcountGeneric]
+ cmp eax, 9 ; check popcnt supported
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of popcount
+ lea rdx, [popcountSSE42]
+Q100: mov [popcountDispatch], rdx
+ ; Continue in appropriate version
+ jmp rdx
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+popcountDispatch DQ popcountCPUDispatch
diff --git a/contrib/libs/asmlib/procname64.asm b/contrib/libs/asmlib/procname64.asm new file mode 100644 index 0000000000..1b77b74320 --- /dev/null +++ b/contrib/libs/asmlib/procname64.asm @@ -0,0 +1,145 @@ +%include "defs.asm" + +; procname64.asm
+;
+; Author: Agner Fog
+; Date created: 2007
+; Last modified: 2011-07-02
+; Description:
+; ProcessorName
+; =============
+; This function produces a zero-terminated ASCII string containing a name
+; for the microprocessor in human-readable format.
+;
+; Copyright (c) 2007-2011 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global ProcessorName: function
+
+SECTION .data
+align 16
+
+NameBuffer times 50H db 0 ; Static buffer to contain name
+
+
+SECTION .text align=16
+
+; ********** ProcessorName function **********
+; C++ prototype (as implemented below):
+; extern "C" char * ProcessorName (void);
+; NOTE(review): confirm this against the declaration in the C/C++ header.
+
+; This function finds the name of the microprocessor. No parameter is read.
+; The name is written as a zero-terminated string into the static buffer
+; NameBuffer (50H bytes) and a pointer to that buffer is returned in rax.
+; Because the buffer is static, the result is overwritten by the next call.
+; The string has the form "<name> Family xxH Model xxH".
+
+ProcessorName:
+ push rbx ; rbx is modified by cpuid
+ push rdi ; rdi is used as the write pointer below
+ lea rdi, [NameBuffer] ; text pointer
+
+ mov eax, 80000000H
+ cpuid
+ cmp eax, 80000004H ; test if extended vendor string available
+ jb no_ext_vendor_string
+
+ ; Has extended vendor string
+ mov eax, 80000002H
+ cpuid
+ mov [rdi], eax ; store 16 bytes of extended vendor string
+ mov [rdi+4], ebx
+ mov [rdi+8], ecx
+ mov [rdi+0CH], edx
+ mov eax, 80000003H
+ cpuid
+ mov [rdi+10H], eax ; next 16 bytes
+ mov [rdi+14H], ebx
+ mov [rdi+18H], ecx
+ mov [rdi+1CH], edx
+ mov eax, 80000004H
+ cpuid
+ mov [rdi+20H], eax ; next 16 bytes
+ mov [rdi+24H], ebx
+ mov [rdi+28H], ecx
+ mov [rdi+2CH], edx
+ jmp get_family_and_model
+
+no_ext_vendor_string:
+ ; No extended vendor string. Get short vendor string
+ xor eax, eax
+ cpuid
+ mov [rdi],ebx ; store short vendor string
+ mov [rdi+4],edx
+ mov [rdi+8],ecx
+ mov byte [rdi+12],0 ; terminate string
+
+get_family_and_model:
+ ; Scan at most 30H bytes for the terminating zero (al = 0)
+ xor eax, eax
+ mov ecx, 30H
+ cld
+ repne scasb ; find end of text
+ dec rdi ; scasb stops one past the zero; step back onto it
+
+ mov dword [rdi], ' Fam' ; Append text " Family "
+ mov dword [rdi+4], 'ily '
+ add rdi, 8
+
+ mov eax, 1
+ cpuid ; Get family and model
+ mov ebx, eax ; keep a copy of the signature for the model field
+ mov ecx, eax
+ shr eax, 8
+ and eax, 0FH ; Family
+ shr ecx, 20
+ and ecx, 0FFH ; Extended family
+ add eax, ecx ; Family + extended family
+ call WriteHex ; Write as hexadecimal
+
+ mov dword [rdi], 'H Mo' ; Write text "H Model "
+ mov dword [rdi+4], 'del '
+ add rdi, 8
+
+ mov eax, ebx
+ shr eax, 4
+ and eax, 0FH ; Model
+ mov ecx, ebx
+ shr ecx, 12
+ and ecx, 0F0H ; Extended model
+ or eax, ecx ; Model | extended model
+ call WriteHex ; Write as hexadecimal
+
+ mov dword [rdi], 'H' ; Write text "H"; the upper three zero bytes terminate the string
+
+PNEND: ; finished
+ lea rax, [NameBuffer] ; Pointer to result
+ pop rdi
+ pop rbx
+ ret
+;ProcessorName ENDP
+
+WriteHex: ; Local function: Write 2 hexadecimal digits
+ ; Parameters: AL = number to write, RDI = text destination
+ ; Output: two ASCII characters ('0'-'9', 'A'-'F') stored at [rdi] and [rdi+1],
+ ; most significant digit first; rdi is advanced by 2.
+ ; eax is left unchanged; ecx is used as scratch. No terminator is written.
+ mov ecx, eax
+ shr ecx, 4
+ and ecx, 0FH ; most significant digit first
+ cmp ecx, 10
+ jnb W1
+ ; 0 - 9
+ add ecx, '0'
+ jmp W2
+W1: ; A - F
+ add ecx, 'A' - 10
+W2: mov [rdi], cl ; write digit
+
+ mov ecx, eax
+ and ecx, 0FH ; next digit
+ cmp ecx, 10
+ jnb W3
+ ; 0 - 9
+ add ecx, '0'
+ jmp W4
+W3: ; A - F
+ add ecx, 'A' - 10
+W4: mov [rdi+1], cl ; write digit
+ add rdi, 2 ; advance string pointer
+ ret
diff --git a/contrib/libs/asmlib/randomah.asi b/contrib/libs/asmlib/randomah.asi new file mode 100644 index 0000000000..ed7a0185a4 --- /dev/null +++ b/contrib/libs/asmlib/randomah.asi @@ -0,0 +1,290 @@ +; ----------------------------- RANDOMAH.ASI ---------------------------
+;
+; Author: Agner Fog
+; Date created: 1998
+; Last modified: 2013-09-09
+; Description:
+; Assembly include file containing
+; structure/class definitions for random number generators
+;
+; Copyright (c) 1998-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Definitions for Mersenne Twister:
+
+TEMPERING EQU 1 ; set to 0 if no tempering (improves speed by 25%)
+
+%if 0
+; define constants for MT11213A:
+MERS_N EQU 351
+MERS_M EQU 175
+MERS_R EQU 19
+MERS_A EQU 0E4BD75F5H
+MERS_U EQU 11
+MERS_S EQU 7
+MERS_T EQU 15
+MERS_L EQU 17
+MERS_B EQU 655E5280H
+MERS_C EQU 0FFD58000H
+
+%ELSE
+; or constants for MT19937:
+MERS_N EQU 624
+MERS_M EQU 397
+MERS_R EQU 31
+MERS_A EQU 09908B0DFH
+MERS_U EQU 11
+MERS_S EQU 7
+MERS_T EQU 15
+MERS_L EQU 18
+MERS_B EQU 9D2C5680H
+MERS_C EQU 0EFC60000H
+
+%ENDIF
+
+LOWER_MASK EQU (1 << MERS_R) - 1 ; lower MERS_R bits
+UPPER_MASK EQU -1 << MERS_R ; upper 32-MERS_R bits
+
+; Define class CRandomMersenneA member data
+; Must be aligned by 16.
+
+STRUC CRandomMersenneA
+.Fill1 RESD 4 ; Alignment filler
+.PreInt: RESD 4 ; premade tempered integer numbers, ready to use
+.PreFlt: RESQ 4 ; premade floating point numbers, ready to use (subtract 1.0)
+ RESQ 1 ; last PreFlt unaligned overrun if MERS_N mod 4 = 1
+.TmpFlt: RESQ 1 ; temporary storage of floating point random number
+.PreInx: RESD 1 ; index to next PreInt and PreFlt number
+.Instset: RESD 1 ; Instruction set
+.LastInterval: RESD 1 ; Last interval length for IRandomX
+.RLimit: RESD 1 ; Rejection limit used by IRandomX
+.TMB: RESD 4 ; 4 copies of MERS_B constant
+.TMC: RESD 4 ; 4 copies of MERS_C constant
+.one: RESQ 2 ; 2 copies of 1.0 constant
+.MTI: RESD 1 ; index into MT buffer
+.UMASK: RESD 1 ; UPPER_MASK
+.LMASK: RESD 1 ; LOWER_MASK ; constants
+.MATA: RESD 1 ; MERS_A
+.wrap1: RESD 4 ; MT buffer km wraparound
+.MT: RESD MERS_N ; MT history buffer (aligned by 16)
+.wrap2: RESD 4 ; MT buffer kk wraparound
+%if MERS_N & 3
+ ; MERS_N not divisible by 4. align by 4
+ RESD (4 - (MERS_N & 3))
+%ENDIF
+endstruc ; CRandomMersenneA
+
+
+; Definitions for Mother-of-all generator:
+
+; Define class CRandomMotherA member data
+; Must be aligned by 16. Preferably aligned by 64 to fit a cache line
+STRUC CRandomMotherA
+.Fill2 RESD 4 ; Alignment filler
+.one RESQ 1 ; 1.0
+.Instset RESD 1 ; Instruction set
+.M4 RESD 1 ; x[n-4]
+.M3 RESD 1 ; x[n-3] (aligned)
+.M2 RESD 1 ; x[n-2]
+.M1 RESD 1 ; x[n-1]
+.M0 RESD 1 ; x[n]
+.MC RESD 1 ; Carry (aligned)
+.zero RESD 1 ; Zero-extension of carry
+.RanP1 RESQ 1 ; Double random number in interval [1,2)
+.MF3 RESD 1 ; 2111111111 (aligned)
+.MF2 RESD 1 ; 1492
+.MF1 RESD 1 ; 1776
+.MF0 RESD 1 ; 5115
+endstruc ; CRandomMotherA
+
+MOTHERF0 EQU 5115 ; factor 0
+MOTHERF1 EQU 1776 ; factor 1
+MOTHERF2 EQU 1492 ; factor 2
+MOTHERF3 EQU 2111111111 ; factor 3
+
+
+; ***************************************************************************
+; Definitions for SFMT generator
+; ***************************************************************************
+
+; Choose Mersenne exponent.
+; Higher values give longer cycle length and use more memory:
+; MEXP equ 607
+; MEXP equ 1279
+; MEXP equ 2281
+; MEXP equ 4253
+ MEXP equ 11213
+; MEXP equ 19937
+; MEXP equ 44497
+
+; Parameter set for Mersenne exponent 44497.
+; Defines the same symbols as every other MEXP branch: SFMT_N, SFMT_M, the four
+; shift counts, SFMT_MASK1, and the SFMT_MASK / SFMT_PARITY dword lists.
+%if MEXP == 44497
+SFMT_N equ 348 ; Size of state vector
+SFMT_M equ 330 ; Position of intermediate feedback
+SFMT_SL1 equ 5 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 9 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 3 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0effffffbH ;first DWORD of AND mask
+; AND mask:
+%define SFMT_MASK 0effffffbH,0dfbebfffH,0bfbf7befH,09ffd7bffH
+; Period certification vector
+; (the macro name was missing here, which left SFMT_PARITY undefined for this exponent)
+%define SFMT_PARITY 1,0,0a3ac4000H,0ecc1327aH
+
+%elif MEXP == 19937
+SFMT_N equ 156 ; Size of state vector
+SFMT_M equ 122 ; Position of intermediate feedback
+SFMT_SL1 equ 18 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 1 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 11 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0dfffffefH ;first DWORD of AND mask
+%define SFMT_MASK 0dfffffefH,0ddfecb7fH,0bffaffffH,0bffffff6H
+%define SFMT_PARITY 1,0,0,013c9e684H
+
+%elif MEXP == 11213
+SFMT_N equ 88 ; Size of state vector
+SFMT_M equ 68 ; Position of intermediate feedback
+SFMT_SL1 equ 14 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 7 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 3 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0effff7fbH ;first DWORD of AND mask
+%define SFMT_MASK 0effff7fbH,0ffffffefH,0dfdfbfffH,07fffdbfdH
+%define SFMT_PARITY 1,0,0e8148000H,0d0c7afa3H
+
+%elif MEXP == 4253
+SFMT_N equ 34 ; Size of state vector
+SFMT_M equ 17 ; Position of intermediate feedback
+SFMT_SL1 equ 20 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 1 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 7 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 09f7bffffH ;first DWORD of AND mask
+%define SFMT_MASK 09f7bffffH,09fffff5fH,03efffffbH,0fffff7bbH
+%define SFMT_PARITY 0a8000001H,0af5390a3H,0b740b3f8H,06c11486dH
+
+%elif MEXP == 2281
+SFMT_N equ 18 ; Size of state vector
+SFMT_M equ 12 ; Position of intermediate feedback
+SFMT_SL1 equ 19 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 1 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 5 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0bff7ffbfH ;first DWORD of AND mask
+%define SFMT_MASK 0bff7ffbfH,0fdfffffeH,0f7ffef7fH,0f2f7cbbfH
+%define SFMT_PARITY 1,0,0,041dfa600H
+
+%elif MEXP == 1279
+SFMT_N equ 10 ; Size of state vector
+SFMT_M equ 7 ; Position of intermediate feedback
+SFMT_SL1 equ 14 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 5 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0f7fefffdH ;first DWORD of AND mask
+%define SFMT_MASK 0f7fefffdH,07fefcfffH,0aff3ef3fH,0b5ffff7fH
+%define SFMT_PARITY 1,0,0,020000000H
+
+%elif MEXP == 607
+SFMT_N equ 5 ; Size of state vector
+SFMT_M equ 2 ; Position of intermediate feedback
+SFMT_SL1 equ 15 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 13 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 3 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0fdff37ffH ;first DWORD of AND mask
+%define SFMT_MASK 0fdff37ffH,0ef7f3f7dH,0ff777b7dH,07ff7fb2fH
+%define SFMT_PARITY 1,0,0,05986f054H
+
+%ELSE
+%error MEXP must have one of the predefined values
+%ENDIF
+
+STRUC CRandomSFMTA
+.Fill3 RESD 4 ; Alignment filler
+
+; Parameters for Mother-Of-All generator:
+.M3: RESD 1 ; x[n-3] (aligned)
+ RESD 1 ; unused filler to fit the pmuludq instruction
+.M2: RESD 1 ; x[n-2]
+ RESD 1 ; unused filler to fit the pmuludq instruction
+.M1: RESD 1 ; x[n-1]
+ RESD 1 ; unused filler to fit the pmuludq instruction
+.M0: RESD 1 ; x[n]
+.MC: RESD 1 ; Carry (zero-extends into one)
+.one: RESQ 1 ; 1.0 (low dword = zero-extension of carry) (aligned)
+.TempRan: RESQ 1 ; Temporary random number
+.MF3: RESD 1 ; 2111111111 (aligned)
+.Instset: RESD 1 ; Instruction set
+.MF2: RESD 1 ; 1492 (MF3,MF2,MF1,MF0 interleaved with other variables to fit the pmuludq instruction)
+ RESD 1 ; Filler (may be used for read-only parameter, but not for read/write parameter)
+.MF1: RESD 1 ; 1776
+ RESD 1 ; Filler (may be used for read-only parameter, but not for read/write parameter)
+.MF0: RESD 1 ; 5115
+ RESD 1 ; Filler (may be used for read-only parameter, but not for read/write parameter)
+
+; Parameters for IRandomX:
+.LASTINTERVAL: RESD 1 ; Last interval length for IRandomX
+.RLIMIT: RESD 1 ; Rejection limit used by IRandomX
+
+; Parameters for SFMT generator:
+.USEMOTHER: RESD 1 ; 1 if combine with Mother-Of-All generator
+.IX: RESD 1 ; Index into state buffer for SFMT
+
+.AMASK: RESD 4 ; AND mask (aligned)
+.STATE: RESD SFMT_N*4 ; State vector (aligned)
+endstruc ; CRandomSFMTA
+
+
+; Load offset of TARGET into ecx. Use position-independent method if necessary
+%macro LOADOFFSET2ECX 1
+%IFNDEF POSITIONINDEPENDENT
+ mov ecx, %1
+%ELSE
+ ; get position-independent address of TARGET
+ call get_thunk_ecx
+ add ecx, %1 - $
+%ENDIF
+%endmacro
+
+; Load offset of TARGET into edi. Use position-independent method if necessary
+%macro LOADOFFSET2EDI 1
+%IFNDEF POSITIONINDEPENDENT
+ mov edi, %1
+%ELSE
+ ; get position-independent address of TARGET
+ call get_thunk_edi
+ add edi, %1 - $
+%ENDIF
+%endmacro
+
+
+; ***************************************************************************
+; Define registers used for function parameters, used in 64-bit mode only
+; ***************************************************************************
+
+%IFDEF WINDOWS
+ %define par1 rcx
+ %define par2 rdx
+ %define par3 r8
+ %define par4 r9
+ %define par5 qword [rsp+32+8] ; stack offset including shadow space
+ %define par1d ecx
+ %define par2d edx
+ %define par3d r8d
+ %define par4d r9d
+ %define par5d dword [rsp+32+8]
+%ENDIF
+
+%IFDEF UNIX
+ %define par1 rdi
+ %define par2 rsi
+ %define par3 rdx
+ %define par4 rcx
+ %define par5 r8
+ %define par1d edi
+ %define par2d esi
+ %define par3d edx
+ %define par4d ecx
+ %define par5d r8d
+%ENDIF
diff --git a/contrib/libs/asmlib/rdtsc64.asm b/contrib/libs/asmlib/rdtsc64.asm new file mode 100644 index 0000000000..42a0e23203 --- /dev/null +++ b/contrib/libs/asmlib/rdtsc64.asm @@ -0,0 +1,53 @@ +%include "defs.asm" + +; RDTSC64.ASM
+;
+; Author: Agner Fog
+; Date created: 2003
+; Last modified: 2008-10-16
+; Description:
+;
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global ReadTSC: function
+
+SECTION .text align=16
+
+; ********** ReadTSC function **********
+; C++ prototype:
+; extern "C" __int64 ReadTSC (void);
+
+; This function returns the value of the time stamp counter, which counts
+; clock cycles. To count how many clock cycles a piece of code takes, call
+; ReadTSC before and after the code to measure and calculate the difference.
+
+; The number of clock cycles taken by the ReadTSC function itself is approximately:
+; Core 2: 730
+; Pentium 4: 700
+; Pentium II and Pentium III: 225
+; AMD Athlon 64, Opteron: 126
+; Does not work on 80386 and 80486.
+
+; Note that clock counts may not be fully reproducible on Intel Core and
+; Core 2 processors because the clock frequency can change. More reliable
+; instruction timings are obtained with the performance monitor counter
+; for "core clock cycles". This requires a kernel mode driver as the one
+; included with www.agner.org/optimize/testp.zip.
+
+; ReadTSC: return the 64-bit time stamp counter in rax.
+; CPUID is executed before and after RDTSC as a serializing instruction.
+; rbx is saved and restored; rcx and rdx are overwritten by cpuid/rdtsc.
+ReadTSC:
+ push rbx ; ebx is modified by cpuid
+ sub eax, eax ; 0
+ cpuid ; serialize
+ rdtsc ; read time stamp counter into edx:eax
+ shl rdx, 32
+ or rax, rdx ; combine into 64 bit register
+ push rax ; save the result: the second cpuid overwrites eax and edx
+ sub eax, eax
+ cpuid ; serialize
+ pop rax ; return value
+ pop rbx
+ ret
+;ReadTSC ENDP
diff --git a/contrib/libs/asmlib/round64.asm b/contrib/libs/asmlib/round64.asm new file mode 100644 index 0000000000..5ed55c53c6 --- /dev/null +++ b/contrib/libs/asmlib/round64.asm @@ -0,0 +1,40 @@ +%include "defs.asm" + +; ROUND64.ASM
+
+; Author: Agner Fog
+; Date created: 2007-06-15
+; Last modified: 2008-10-16
+; Description:
+; Round function
+
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global RoundD: function
+global RoundF: function
+
+
+SECTION .text align=16
+
+; ********** round function **********
+; C++ prototype:
+; extern "C" int RoundD (double x);
+; extern "C" int RoundF (float x);
+
+; This function converts a single or double precision floating point number
+; to an integer, rounding to nearest or even. Does not check for overflow.
+; This function is much faster than the default conversion method in C++
+; which uses truncation.
+
+; RoundD: convert the double in xmm0 to a 32-bit signed integer in eax.
+; CVTSD2SI rounds according to the current MXCSR rounding mode
+; (round-to-nearest-even unless the caller has changed it).
+RoundD:
+ cvtsd2si eax, xmm0 ; Round xmm0 to eax
+ ret
+;RoundD ENDP
+
+; RoundF: convert the float in xmm0 to a 32-bit signed integer in eax.
+; CVTSS2SI rounds according to the current MXCSR rounding mode
+; (round-to-nearest-even unless the caller has changed it).
+RoundF:
+ cvtss2si eax, xmm0 ; Round xmm0 to eax
+ ret
+;RoundF ENDP
diff --git a/contrib/libs/asmlib/sfmt64.asm b/contrib/libs/asmlib/sfmt64.asm new file mode 100644 index 0000000000..3ca3cedca0 --- /dev/null +++ b/contrib/libs/asmlib/sfmt64.asm @@ -0,0 +1,889 @@ +%include "defs.asm" + +; ----------------------------- SFMT64.ASM ---------------------------
+; Author: Agner Fog
+; Date created: 2008-11-01
+; Last modified: 2013-09-13
+; Project: randoma library of random number generators
+; Source URL: www.agner.org/random
+; Description:
+; Random number generator of type SIMD-oriented Fast Mersenne Twister (SFMT)
+; (Mutsuo Saito and Makoto Matsumoto: "SIMD-oriented Fast Mersenne Twister:
+; a 128-bit Pseudorandom Number Generator", Monte Carlo and Quasi-Monte
+; Carlo Methods 2006, Springer, 2008, pp. 607-622).
+;
+; 64-bit mode version for x86-64 compatible microprocessors.
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+; ----------------------------------------------------------------------
+
+default rel
+
+global SFMTRandomInit, SFMTRandomInitByArray, SFMTBRandom, SFMTRandom
+global SFMTRandomL, SFMTIRandom, SFMTIRandomX, SFMTgenRandomInit
+global SFMTgenRandomInitByArray, SFMTgenRandom, SFMTgenRandomL, SFMTgenIRandom
+global SFMTgenIRandomX, SFMTgenBRandom
+
+extern InstructionSet
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+
+section .data
+align 16
+; Data for single instance of random number generator
+SFMTInstance: ISTRUC CRandomSFMTA
+; Size of structure
+IEND
+SFMTSize equ $-SFMTInstance
+
+
+align 16
+; Initialization constants for Mother-Of-All:
+InitMother DD 2111111111, 0, 1492, 0, 1776, 0, 5115, 0
+; Initialization Mask for SFMT:
+InitMask DD SFMT_MASK
+; Period certification vector for SFMT:
+InitParity DD SFMT_PARITY
+
+
+SECTION .CODE align=16 ; code segment
+
+
+; ---------------------------------------------------------------
+; Thread-safe static link versions for SFMT
+; ---------------------------------------------------------------
+
+; extern "C" void SFMTRandomInit(void * Pthis, int ThisSize, int seed, int IncludeMother = 0);
+; Parameters:
+; par1 = Pthis
+; par2d = ThisSize
+; par3d = seed
+; par4d = IncludeMother
+
+; SFMTRandomInit: seed a CRandomSFMTA instance from a single 32-bit seed.
+; Jumps to Error if ThisSize (par2d) is below SFMTSize. Otherwise it fills
+; STATE with SFMT_N*4 dwords from the InitSubf0 recurrence, fills M0, M1, M2,
+; M3 and MC with the next five values, then calls InitAndPeriod.
+; rbx is saved and restored; it holds the recurrence index i.
+SFMTRandomInit:
+ cmp par2d, SFMTSize
+ jb Error ; Error exit if buffer too small
+ push rbx
+
+ ; Align by 16. Will overlap part of Fill if Pthis unaligned
+ and par1, -16
+
+ xor eax, eax
+ test par4d, par4d ; IncludeMother
+ setnz al ; convert any nonzero value to 1
+ ; Store USEMOTHER
+ mov [par1+CRandomSFMTA.USEMOTHER], eax
+
+ mov eax, par3d ; seed
+ xor ebx, ebx ; loop counter i
+ jmp L002 ; go into seeding loop
+
+L001: ; seeding loop for SFMT
+ ; y = factor * (y ^ (y >> 30)) + (++i);
+ call InitSubf0 ; randomization subfunction
+L002: mov [par1+rbx*4+CRandomSFMTA.STATE],eax ; initialize state
+ cmp ebx, SFMT_N*4 - 1 ; stop after the last dword of STATE has been stored
+ jb L001
+
+ ; Put 5 more values into Mother-Of-All generator
+ call InitSubf0
+ mov [par1+CRandomSFMTA.M0], eax
+ call InitSubf0
+ mov [par1+CRandomSFMTA.M1], eax
+ call InitSubf0
+ mov [par1+CRandomSFMTA.M2], eax
+ call InitSubf0
+ mov [par1+CRandomSFMTA.M3], eax
+ call InitSubf0
+ mov [par1+CRandomSFMTA.MC], eax
+
+ ; more initialization and period certification
+ call InitAndPeriod
+
+ pop rbx
+ ret
+;SFMTRandomInit ENDP
+
+; Error: shared failure exit, reached by jumps from the init and generate routines.
+; Deliberately executes a division by zero so the CPU raises a divide-error exception.
+; Some callers jump here with extra values still pushed on the stack, so the
+; trailing ret is not a valid return path in those cases.
+Error: ; Error exit
+ xor eax, eax
+ div eax ; Divide by 0
+ ret
+
+; Subfunction used by SFMTRandomInit
+InitSubf0: ; private
+; y = 1812433253 * (y ^ (y >> 30)) + (++i);
+; input parameters:
+; eax = y
+; ebx = i
+; output:
+; eax = new y
+; ebx = i+1
+; edx modified
+ mov edx, eax
+ shr eax, 30
+ xor eax, edx
+ imul eax, 1812433253
+ inc ebx
+ add eax, ebx
+ ret
+;InitSubf0 endp
+
+; Subfunction used by SFMTRandomInitByArray
+InitSubf1: ; private
+; r = 1664525U * (r ^ (r >> 27));
+; input parameters:
+; eax = r
+; output:
+; eax = new r
+; r10 modified
+ mov r10d, eax
+ shr eax, 27
+ xor eax, r10d
+ imul eax, 1664525
+ ret
+;InitSubf1 endp
+
+; Subfunction used by SFMTRandomInitByArray
+InitSubf2: ; private
+; r = 1566083941U * (r ^ (r >> 27));
+; input parameters:
+; eax = r
+; output:
+; eax = new r
+; r10 modified
+ mov r10d, eax
+ shr eax, 27
+ xor eax, r10d
+ imul eax, 1566083941
+ ret
+;InitSubf2 endp
+
+
+; Subfunction for initialization and period certification, except seeding
+; par1 = aligned pointer to CRandomSFMTA
+; Steps: copy the Mother-Of-All factors and the SFMT AND mask into the
+; structure, set .one to 1.0 and LASTINTERVAL to 0, store the InstructionSet
+; result, make the parity of (first 4 STATE dwords AND InitParity) odd by
+; flipping one STATE bit if necessary, then produce the first batch of numbers.
+InitAndPeriod: ; private
+ push rbx
+
+ ; initialize constants for Mother-Of-All
+ movaps xmm0, oword [InitMother]
+ movaps oword [par1+CRandomSFMTA.MF3], xmm0
+ movaps xmm0, oword [InitMother+16]
+ movaps oword [par1+CRandomSFMTA.MF1], xmm0
+
+ ; initialize constants for SFMT
+ movaps xmm0, oword [InitMask]
+ movaps oword [par1+CRandomSFMTA.AMASK], xmm0
+
+ ; initialize various variables
+ xor eax, eax
+ mov dword [par1+CRandomSFMTA.one], eax ; low dword of 1.0
+ mov dword [par1+4+CRandomSFMTA.one], 3FF00000H ; high dword of 1.0
+ mov dword [par1+CRandomSFMTA.LASTINTERVAL], eax
+
+ ; get instruction set
+ push par1 ; preserve the structure pointer across the call
+ call InstructionSet
+ pop par1
+ mov [par1+CRandomSFMTA.Instset], eax
+
+ ; Period certification
+ ; Compute parity of STATE[0-3] & InitParity (the first four dwords)
+ movaps xmm1, oword [par1+CRandomSFMTA.STATE]
+ andps xmm1, oword [InitParity]
+ movhlps xmm2, xmm1 ; high qword
+ xorps xmm1, xmm2 ; xor two qwords
+ pshufd xmm2, xmm1, 1 ; high dword
+ xorps xmm1, xmm2 ; xor two dwords
+ movd eax, xmm1 ; do rest of xor in eax
+ mov edx, eax
+ shr eax, 16
+ xor eax, edx ; xor two words
+ xor al, ah ; xor two bytes
+ jpo L008 ; parity odd: period OK
+
+ ; parity even: period not OK
+ ; Find a nonzero dword in period certification vector
+ xor ebx, ebx ; loop counter
+ lea rdx, [InitParity]
+L005: mov eax, [rdx+rbx*4] ; InitParity[i]
+ test eax, eax
+ jnz L006
+ inc ebx
+ ; assume that there is a nonzero dword in InitParity
+ jmp L005 ; loop until nonzero found
+
+L006: ; find first nonzero bit in eax
+ bsf edx, eax
+ ; flip the corresponding bit in STATE
+ btc [par1+rbx*4+CRandomSFMTA.STATE], edx
+
+L008: cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ je L009
+ call Mother_Next ; Make first random number ready
+
+L009: ; Generate first random numbers and set IX = 0
+ call SFMT_Generate
+ pop rbx
+ ret
+;InitAndPeriod endp
+
+
+; extern "C" void SFMTRandomInitByArray
+; (void * Pthis, int ThisSize, int const seeds[], int NumSeeds, int IncludeMother = 0);
+; // Seed by more than 32 bits
+SFMTRandomInitByArray:
+; Parameters
+; par1 = Pthis
+; par2d = ThisSize
+; par3 = seeds
+; par4d = NumSeeds
+; par5d = IncludeMother
+
+; define constants:
+SFMT_SIZE equ SFMT_N*4 ; number of 32-bit integers in state
+
+%IF SFMT_SIZE >= 623
+ SFMT_LAG equ 11
+%ELIF SFMT_SIZE >= 68
+ SFMT_LAG equ 7
+%ELIF SFMT_SIZE >= 39
+ SFMT_LAG equ 5
+%ELSE
+ SFMT_LAG equ 3
+%ENDIF
+
+SFMT_MID equ ((SFMT_SIZE - SFMT_LAG) / 2)
+
+ xor eax, eax
+ cmp par5d, eax ; IncludeMother (parameter is on stack if windows)
+ setnz al ; convert any nonzero value to 1
+
+ push rbx
+ push rbp
+
+ cmp par2d, SFMTSize ; ThisSize
+ jb Error ; Error exit if buffer too small
+
+ ; Align by 16. Will overlap part of Fill if Pthis unaligned
+ and par1, -16
+
+ ; Store USEMOTHER
+ mov [par1+CRandomSFMTA.USEMOTHER], eax
+
+; 1. loop: Fill state vector with random numbers from NumSeeds
+; r = NumSeeds;
+; for (i = 0; i < SFMT_N*4; i++) {
+; r = factor * (r ^ (r >> 30)) + i;
+; sta[i] = r;}
+
+ mov eax, par4d ; r = NumSeeds
+ xor ebx, ebx ; i
+L100: mov par2d, eax
+ shr eax, 30
+ xor eax, par2d
+ imul eax, 1812433253
+ add eax, ebx
+ mov [par1+rbx*4+CRandomSFMTA.STATE], eax
+ inc ebx
+ cmp ebx, SFMT_SIZE
+ jb L100
+
+ ; count = max(NumSeeds,size-1)
+ mov eax, SFMT_SIZE - 1
+ mov r11d, par4d ; NumSeeds
+ cmp r11d, eax
+ cmovb r11d, eax
+
+; 2. loop: Fill state vector with random numbers from seeds[]
+; for (i = 1, j = 0; j < count; j++) {
+; r = func1(sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size]);
+; sta[(i + mid) % size] += r;
+; if (j < NumSeeds) r += seeds[j]
+; r += i;
+; sta[(i + mid + lag) % size] += r;
+; sta[i] = r;
+; i = (i + 1) % size;
+; }
+ ; register use:
+ ; par1 = Pthis
+ ; par2 = j
+ ; par3 = seeds
+ ; par4 = NumSeeds
+ ; eax = r
+ ; ebx = i
+ ; ebp = (i + mid) % size, (i + mid + lag) % size
+ ; r10 = (i + size - 1) % size
+ ; r11 = count
+
+ xor par2d, par2d ; j = 0
+ lea ebx, [par2+1] ; i = 1
+
+L101: ; r = sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size];
+ mov eax, [par1+rbx*4+CRandomSFMTA.STATE] ; sta[i]
+ lea ebp, [rbx+SFMT_MID]
+ cmp ebp, SFMT_SIZE
+ jb L102
+ sub ebp, SFMT_SIZE
+L102: xor eax, [par1+rbp*4+CRandomSFMTA.STATE] ; sta[(i + mid) % size]
+ lea r10d, [rbx+SFMT_SIZE-1]
+ cmp r10d, SFMT_SIZE
+ jb L103
+ sub r10d, SFMT_SIZE
+L103: xor eax, [par1+r10*4+CRandomSFMTA.STATE] ; sta[(i + size - 1) % size]
+
+ ; r = func1(r) = (r ^ (r >> 27)) * 1664525U;
+ call InitSubf1
+
+ ; sta[(i + mid) % size] += r;
+ add [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+ ; if (j < NumSeeds) r += seeds[j]
+ cmp par2d, par4d
+ jnb L104
+ add eax, [par3+par2*4]
+L104:
+ ; r += i;
+ add eax, ebx
+
+ ; sta[(i + mid + lag) % size] += r;
+ lea ebp, [rbx+SFMT_MID+SFMT_LAG]
+ cmp ebp, SFMT_SIZE
+ jb L105
+ sub ebp, SFMT_SIZE
+L105: add [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+ ;sta[i] = r;
+ mov [par1+rbx*4+CRandomSFMTA.STATE], eax
+
+ ; i = (i + 1) % size;
+ inc ebx
+ cmp ebx, SFMT_SIZE
+ jb L106
+ sub ebx, SFMT_SIZE
+L106:
+ ; j++, loop while j < count
+ inc par2d
+ cmp par2d, r11d
+ jb L101
+
+; 3. loop: Randomize some more
+; for (j = 0; j < size; j++) {
+; r = func2(sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]);
+; sta[(i + mid) % size] ^= r;
+; r -= i;
+; sta[(i + mid + lag) % size] ^= r;
+; sta[i] = r;
+; i = (i + 1) % size;
+; }
+ ; j = 0
+ xor par2d, par2d
+
+L110: ; r = sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]
+ mov eax, [par1+rbx*4+CRandomSFMTA.STATE] ; sta[i]
+ lea ebp, [rbx+SFMT_MID]
+ cmp ebp, SFMT_SIZE
+ jb L111
+ sub ebp, SFMT_SIZE
+L111: add eax, [par1+rbp*4+CRandomSFMTA.STATE] ; sta[(i + mid) % size]
+ lea r10d, [rbx+SFMT_SIZE-1]
+ cmp r10d, SFMT_SIZE
+ jb L112
+ sub r10d, SFMT_SIZE
+L112: add eax, [par1+r10*4+CRandomSFMTA.STATE] ; sta[(i + size - 1) % size]
+
+ ; r = func2(r) = (x ^ (x >> 27)) * 1566083941U;
+ call InitSubf2
+
+ ; sta[(i + mid) % size] ^= r;
+ xor [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+ ; r -= i;
+ sub eax, ebx
+
+ ; sta[(i + mid + lag) % size] ^= r;
+ lea ebp, [rbx+SFMT_MID+SFMT_LAG]
+ cmp ebp, SFMT_SIZE
+ jb L113
+ sub ebp, SFMT_SIZE
+L113: xor [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+ ; sta[i] = r;
+ mov [par1+rbx*4+CRandomSFMTA.STATE], eax
+
+ ; i = (i + 1) % size;
+ inc ebx
+ cmp ebx, SFMT_SIZE
+ jb L114
+ sub ebx, SFMT_SIZE
+L114:
+ ; j++, loop while j < size
+ inc par2d
+ cmp par2d, SFMT_SIZE
+ jb L110
+
+ ; if (UseMother) {
+ cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ jz L120
+
+; 4. loop: Initialize MotherState
+; for (j = 0; j < 5; j++) {
+; r = func2(r) + j;
+; MotherState[j] = r + sta[2*j];
+; }
+ call InitSubf2
+ mov par2d, [par1+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.M0], par2d
+ call InitSubf2
+ inc eax
+ mov par2d, [par1+8+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.M1], par2d
+ call InitSubf2
+ add eax, 2
+ mov par2d, [par1+16+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.M2], par2d
+ call InitSubf2
+ add eax, 3
+ mov par2d, [par1+24+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.M3], par2d
+ call InitSubf2
+ add eax, 4
+ mov par2d, [par1+32+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.MC], par2d
+
+L120: ; More initialization and period certification
+ call InitAndPeriod
+
+ pop rbp
+ pop rbx
+ ret
+;SFMTRandomInitByArray ENDP
+
+
+Mother_Next: ; private
+; Internal procedure: advance Mother-Of-All generator
+; The random value is in M0
+; par1 = aligned pointer to structure CRandomSFMTA
+; eax, par1, xmm0 unchanged
+; xmm1, xmm2, xmm3 are used as scratch.
+; The history is shifted one step (M3=M2, M2=M1, M1=M0). The new 64-bit value
+; M3*MF3 + M2*MF2 + M1*MF1 + M0*MF0 + MC is computed from the values loaded
+; before the shift; its low dword becomes M0 and its high dword becomes MC.
+
+ movdqa xmm1, oword [par1+CRandomSFMTA.M3] ; load M3,M2
+ movdqa xmm2, oword [par1+CRandomSFMTA.M1] ; load M1,M0
+ movhps qword [par1+CRandomSFMTA.M3], xmm1 ; M3=M2
+ movq qword [par1+CRandomSFMTA.M2], xmm2 ; M2=M1
+ movhps qword [par1+CRandomSFMTA.M1], xmm2 ; M1=M0
+ pmuludq xmm1, oword [par1+CRandomSFMTA.MF3] ; M3*MF3, M2*MF2
+ pmuludq xmm2, oword [par1+CRandomSFMTA.MF1] ; M1*MF1, M0*MF0
+ paddq xmm1, xmm2 ; P3+P1, P2+P0
+ movhlps xmm2, xmm1 ; Get high qword
+ movq xmm3, qword [par1+CRandomSFMTA.MC] ; +carry
+ paddq xmm1, xmm3
+ paddq xmm1, xmm2 ; P0+P1+P2+P3
+ movq qword [par1+CRandomSFMTA.M0], xmm1 ; Store new M0 and carry
+ ret
+;Mother_Next endp
+
+
+align 16
+SFMT_Generate: ; private
+; void CRandomSFMT::Generate() {
+; Fill state array with new random numbers
+; Input: par1 = aligned pointer to CRandomSFMTA.
+; Effect: every 128-bit element of STATE is recomputed in place and IX is set to 0.
+; Jumps to Error if the first dword of AMASK is not SFMT_MASK1.
+; rbx is saved and restored; eax, edx and xmm0-xmm3 are overwritten.
+
+ push rbx
+
+ ; register use
+ ; par1 = Pthis (rcx or rdi)
+ ; edx = i*16 + offset state
+ ; eax, ebx = loop end
+ ; xmm1 = r1
+ ; xmm2 = r2 = r
+ ; xmm0, xmm3 = scratch
+
+ ; r1 = state[SFMT_N - 2]; (128-bit elements, byte offset (SFMT_N-2)*16)
+ ; r2 = state[SFMT_N - 1];
+ movdqa xmm1, oword [par1+(SFMT_N-2)*16+CRandomSFMTA.STATE]
+ movdqa xmm2, oword [par1+(SFMT_N-1)*16+CRandomSFMTA.STATE]
+ mov edx, CRandomSFMTA.STATE
+
+;static inline __m128i sfmt_recursion(__m128i const &a, __m128i const &b,
+;__m128i const &c, __m128i const &d, __m128i const &mask) {
+; __m128i a1, b1, c1, d1, z1, z2;
+; b1 = _mm_srli_epi32(b, SFMT_SR1);
+; a1 = _mm_slli_si128(a, SFMT_SL2);
+; c1 = _mm_srli_si128(c, SFMT_SR2);
+; d1 = _mm_slli_epi32(d, SFMT_SL1);
+; b1 = _mm_and_si128(b1, mask);
+; z1 = _mm_xor_si128(a, a1);
+; z2 = _mm_xor_si128(b1, d1);
+; z1 = _mm_xor_si128(z1, c1);
+; z2 = _mm_xor_si128(z1, z2);
+; return z2;}
+
+; for (i = 0; i < SFMT_N - SFMT_M; i++) {
+; r = sfmt_recursion(state[i], state[i + SFMT_M], r1, r2, mask);
+; state[i] = r;
+; r1 = r2;
+; r2 = r;
+; }
+
+ mov eax, (SFMT_N-SFMT_M)*16 + CRandomSFMTA.STATE ; first loop end
+ mov ebx, SFMT_N*16 + CRandomSFMTA.STATE ; second loop end
+
+; first i loop from 0 to SFMT_N - SFMT_M
+align 8
+L201: movdqa xmm0, oword [par1+rdx+SFMT_M*16] ; b
+ psrld xmm0, SFMT_SR1 ; b1
+ pand xmm0, oword [par1+CRandomSFMTA.AMASK] ; b1
+ movdqa xmm3, oword [par1+rdx] ; a
+ pxor xmm0, xmm3
+ pslldq xmm3, SFMT_SL2 ; a1
+ psrldq xmm1, SFMT_SR2 ; c1, c = r1
+ pxor xmm0, xmm3
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm2 ; r1 = r2
+ pslld xmm2, SFMT_SL1 ; d1, d = r2
+ pxor xmm2, xmm0 ; r2 = r
+ ; state[i] = r;
+ movdqa oword [par1+rdx], xmm2
+
+ ; i++ while i < SFMT_N - SFMT_M
+ add edx, 16
+ cmp edx, eax
+ jb L201
+
+;align 16
+L202: ; second i loop from SFMT_N - SFMT_M to SFMT_N
+ ; same recursion, but b = state[i + SFMT_M - SFMT_N] (index wrapped around)
+ movdqa xmm0, oword [par1+rdx+(SFMT_M-SFMT_N)*16]; b
+ psrld xmm0, SFMT_SR1 ; b1
+ pand xmm0, oword [par1+CRandomSFMTA.AMASK] ; b1
+ movdqa xmm3, oword [par1+rdx] ; a
+ pxor xmm0, xmm3
+ pslldq xmm3, SFMT_SL2 ; a1
+ psrldq xmm1, SFMT_SR2 ; c1, c = r1
+ pxor xmm0, xmm3
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm2 ; r1 = r2
+ pslld xmm2, SFMT_SL1 ; d1, d = r2
+ pxor xmm2, xmm0 ; r2 = r
+ ; state[i] = r;
+ movdqa oword [par1+rdx], xmm2
+
+ ; i++ while i < SFMT_N
+ add edx, 16
+ cmp edx, ebx
+ jb L202
+
+ ; Check if initialized
+ ; (the check runs after the state has already been rewritten above)
+L208: cmp dword [par1+CRandomSFMTA.AMASK], SFMT_MASK1
+ jne Error ; Make error if not initialized
+
+ ; ix = 0;
+ mov dword [par1+CRandomSFMTA.IX], 0 ; point to start of STATE buffer
+ pop rbx
+ ret
+;SFMT_Generate endp
+
+
+; extern "C" unsigned int SFMTBRandom(void * Pthis); // Output random bits
+; Returns the next 32 random bits in eax. Two entry points:
+;   SFMTBRandom     - public entry, rounds Pthis down to a 16-byte boundary first
+;   SFMTBRandom_reg - internal entry, par1 must already be the aligned pointer
+; Clobbers edx (plus whatever SFMT_Generate / Mother_Next clobber when they are called).
+
+SFMTBRandom: ; generate random bits
+ ; Align Pthis by 16. Will overlap part of Fill1 if Pthis unaligned
+ and par1, -16
+
+SFMTBRandom_reg: ; Entry for register parameters, used internally
+
+; if (ix >= SFMT_N*4) Generate();
+; (IX is stored as a byte offset into STATE, hence the limit SFMT_N*16 and the step of 4)
+ mov edx, [par1+CRandomSFMTA.IX]
+ cmp edx, SFMT_N*16
+ jnb NeedGenerate
+
+; y = ((uint32_t*)state)[ix++];
+ mov eax, dword [par1+rdx+CRandomSFMTA.STATE]
+ add edx, 4
+ mov [par1+CRandomSFMTA.IX], edx
+
+AfterGenerate:
+; if (UseMother) y += MotherBits();
+ cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ jz NoMother
+
+ ; add mother bits
+ add eax, [par1+CRandomSFMTA.M0] ; Add Mother random number
+ call Mother_Next ; Make next Mother random number ready
+
+NoMother: ; return y;
+ ret
+
+NeedGenerate: ; buffer exhausted: refill it, then hand out the first dword
+ call SFMT_Generate ; generate SFMT_N*4 random dwords
+ mov eax, [par1+CRandomSFMTA.STATE] ; y = first dword of the fresh state
+ mov dword [par1+CRandomSFMTA.IX], 4 ; next read starts at byte offset 4
+ jmp AfterGenerate
+
+;SFMTBRandom ENDP
+
+
+; extern "C" double SFMTRandom (void * Pthis); // Output random float
+; Returns a double in the interval 0.0 <= x < 1.0 in xmm0, built from 64 random bits
+; of which the upper 52 are kept as mantissa.
+;   SFMTRandom     - public entry, rounds Pthis down to a 16-byte boundary first
+;   SFMTRandom_reg - internal entry, par1 must already be the aligned pointer
+SFMTRandom: ; generate random float with 52 bits resolution
+ ; Align Pthis by 16. Will overlap part of Fill1 if Pthis unaligned
+ and par1, -16
+
+SFMTRandom_reg: ; internal entry point
+
+; check if there are at least 64 random bits in state buffer
+; if (ix >= SFMT_N*4-1) Generate();
+; (IX is a byte offset, so the limit is SFMT_N*16-4)
+ mov edx, [par1+CRandomSFMTA.IX]
+ cmp edx, SFMT_N*16-4
+ jnb L303
+
+L301: ; read 64 random bits
+ movq xmm0, qword [par1+rdx+CRandomSFMTA.STATE]
+ add edx, 8 ; two dwords consumed
+ mov [par1+CRandomSFMTA.IX], edx
+
+ ; combine with Mother-Of-All generator?
+ cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ jz L302 ; ConvertToFloat
+
+ ; add mother bits
+ movq xmm1, qword [par1+CRandomSFMTA.M0] ; Mother random number MC and M0
+ pshuflw xmm1, xmm1, 01001011B ; Put M0 before MC, and swap the words in MC
+ paddq xmm0, xmm1 ; Add SFMT and Mother outputs
+ call Mother_Next ; Make next Mother random number ready
+
+L302: ; ConvertToFloat
+ psrlq xmm0, 12 ; align with mantissa field of double precision float
+ movsd xmm1, [par1+CRandomSFMTA.one] ; 1.0 double precision
+ por xmm0, xmm1 ; insert exponent to get 1.0 <= x < 2.0
+ subsd xmm0, xmm1 ; subtract 1.0 to get 0.0 <= x < 1.0
+ ret ; return value
+
+L303: ; NeedGenerateR
+ call SFMT_Generate ; generate SFMT_N*4 random dwords
+ xor edx, edx ; restart reading at byte offset 0 of the fresh state
+ jmp L301
+
+;SFMTRandom ENDP
+
+
+; extern "C" long double SFMTRandomL (void * Pthis);
+; Builds an 80-bit x87 value from 64 random bits and returns it in st(0);
+; xmm0 is filled with all ones (a NAN pattern) on return.
+;   SFMTRandomL     - public entry, rounds Pthis down to a 16-byte boundary first
+;   SFMTRandomL_reg - internal entry, par1 must already be the aligned pointer
+SFMTRandomL: ; generate random float with 63 bits resolution
+ ; Align Pthis by 16.
+ and par1, -16
+
+SFMTRandomL_reg: ; internal entry point
+
+; check if there are at least 64 random bits in state buffer
+; if (ix >= SFMT_N*4-1) Generate();
+; (IX is a byte offset, so the limit is SFMT_N*16-4)
+ mov edx, [par1+CRandomSFMTA.IX]
+ cmp edx, SFMT_N*16-4
+ jnb L403
+
+L401: ; read 64 random bits
+ movq xmm0, qword [par1+rdx+CRandomSFMTA.STATE]
+ add edx, 8 ; two dwords consumed
+ mov [par1+CRandomSFMTA.IX], edx
+
+ ; combine with Mother-Of-All generator?
+ cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ jz L402
+
+ ; add mother bits
+ movq xmm1, qword [par1+CRandomSFMTA.M0] ; Mother random number MC and M0
+ pshuflw xmm1, xmm1, 01001011B ; Put M0 before MC, and swap the words in MC
+ paddq xmm0, xmm1 ; Add SFMT and Mother outputs
+ call Mother_Next ; Make next Mother random number ready
+
+L402: ;ConvertToFloat
+ sub rsp, 16 ; make space for long double
+ psrlq xmm0, 1 ; align with mantissa field of long double
+ pcmpeqw xmm1, xmm1 ; all 1's
+ psllq xmm1, 63 ; create a 1 in bit 63
+ por xmm0, xmm1 ; bit 63 is always 1 in long double
+ movq qword [rsp], xmm0 ; store mantissa
+ mov dword [rsp+8], 3FFFH ; exponent
+ fld tword [rsp] ; load long double
+ fsub qword [par1+CRandomSFMTA.one] ; subtract 1.0 to get 0.0 <= x < 1.0
+ pcmpeqw xmm0, xmm0 ; make a NAN for compilers that don't support long double
+ add rsp, 16 ; release the temporary
+ ret ; return value in st(0)
+
+L403: ;NeedGenerateR
+ call SFMT_Generate ; generate SFMT_N*4 random dwords
+ xor edx, edx ; restart reading at byte offset 0 of the fresh state
+ jmp L401
+;SFMTRandomL ENDP
+
+
+; extern "C" int SFMTIRandom (void * Pthis, int min, int max); // Output random integer
+; Returns min + high dword of (random32 * (max - min + 1)) in eax,
+; or 80000000H when max < min. No rejection step is applied here.
+
+SFMTIRandom:
+; par1 = Pthis
+; par2d = min
+; par3d = max
+
+ ; Align Pthis by 16.
+ and par1, -16
+ push par2 ; save min, max
+ push par3
+ call SFMTBRandom_reg ; random bits
+ pop rdx ; max
+ pop rcx ; min
+ sub edx, ecx ; max - min
+ jl short WrongInterval ; max < min
+ inc edx ; max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [rdx+rcx] ; add min to high dword of product
+ ret
+WrongInterval:
+ mov eax, 80000000H ; error exit
+ ret
+;SFMTIRandom ENDP
+
+
+; extern "C" int SFMTIRandomX (void * Pthis, int min, int max); // Output random integer
+; Returns a random integer in the interval min <= x <= max in eax, using a rejection
+; loop so that all values are equally likely.
+;   max == min : returns min
+;   max <  min : returns 80000000H
+; The interval and its rejection limit are cached in LASTINTERVAL / RLIMIT, so the
+; division is only executed when the interval differs from the previous call.
+; Clobbers eax, edx, r10 and whatever SFMTBRandom_reg clobbers; rbx is saved/restored.
+
+SFMTIRandomX:
+; par1 = Pthis
+; par2d = min
+; par3d = max
+
+ push rbx
+ ; Align Pthis by 16.
+ and par1, -16
+
+ ; Copy min to a scratch register right away. par2 is rdx under the Windows x64
+ ; calling convention, and edx is overwritten by the lea/div/mul sequence below,
+ ; so min must not be re-read from par2d after that point.
+ mov r10d, par2d ; r10d = min
+ mov ebx, par3d
+ sub ebx, par2d ; max - min
+ jle short M30 ; max <= min (signed)
+ inc ebx ; interval = max - min + 1
+ ; NOTE(review): interval wraps to 0 when max - min + 1 does not fit in 32 bits
+ ; (min = INT_MIN, max = INT_MAX); that case is not handled here.
+
+ ; if (interval != LastInterval) {
+ cmp ebx, [par1+CRandomSFMTA.LASTINTERVAL]
+ je M10
+ ; need to calculate new rejection limit
+ ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
+ xor eax, eax ; 0
+ lea edx, [eax+1] ; 1 -> edx:eax = 1 << 32
+ div ebx ; (would give overflow if interval = 1)
+ mul ebx ; edx:eax = quotient * interval (edx clobbered again)
+ dec eax
+ mov [par1+CRandomSFMTA.RLIMIT], eax
+ mov [par1+CRandomSFMTA.LASTINTERVAL], ebx
+M10: mov ebx, r10d ; keep min in callee-saved ebx across the calls below
+
+M20: ; do { // Rejection loop
+ call SFMTBRandom_reg ; random bits (par1 is preserved)
+ ; longran = (uint64)BRandom() * interval;
+ mul dword [par1+CRandomSFMTA.LASTINTERVAL]
+ ; } while (remainder > RLimit);
+ cmp eax, [par1+CRandomSFMTA.RLIMIT]
+ ja M20
+
+ ; return (int32)iran + min
+ lea eax, [rbx+rdx]
+ pop rbx
+ ret
+
+M30: jl M40 ; flags still come from the sub above
+ ; max = min. Return min
+ mov eax, par2d ; par2d is still intact on this path
+ pop rbx
+ ret ; max = min exit
+
+M40: ; max < min: error
+ mov eax, 80000000H ; error exit
+ pop rbx
+ ret
+;SFMTIRandomX ENDP
+
+
+
+; -------------------------------------------------------------------------
+; Single-threaded static link versions for SFMT generator
+; -------------------------------------------------------------------------
+
+; extern "C" void SFMTgenRandomInit(int seed, int IncludeMother = 0);
+; Thunk for the static instance: rearranges the arguments into
+; (SFMTInstance, SFMTSize, seed, IncludeMother) and tail-jumps to SFMTRandomInit.
+SFMTgenRandomInit:
+; par1d = seed
+; par2d = IncludeMother
+
+ ; set up parameters for call SFMTRandomInit
+ ; (moved from the last parameter to the first so no source is overwritten before it is read)
+ mov par4d, par2d ; IncludeMother
+ mov par3d, par1d ; seed
+ mov par2d, SFMTSize ; ThisSize
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTRandomInit ; tail call: SFMTRandomInit returns to our caller
+;SFMTgenRandomInit ENDP
+
+
+; extern "C" void SFMTgenRandomInitByArray(int const seeds[], int NumSeeds, int IncludeMother = 0);
+; Thunk for the static instance: forwards to SFMTRandomInitByArray with the arguments
+; (SFMTInstance, SFMTSize, seeds, NumSeeds, IncludeMother).
+SFMTgenRandomInitByArray:
+; par1 = seeds
+; par2d = NumSeeds
+; par3d = IncludeMother
+
+ ; set up parameters for call SFMTRandomInitByArray
+%IFDEF WINDOWS
+ ; fifth argument goes on the stack, above 32 bytes of shadow space
+ push par3 ; IncludeMother on stack
+ sub rsp, 32 ; empty shadow space
+ mov par4d, par2d ; NumSeeds
+ mov par3, par1 ; seeds
+ mov par2d, SFMTSize ; ThisSize
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ call SFMTRandomInitByArray
+ add rsp, 40 ; drop shadow space (32) and the pushed argument (8)
+ ret
+%ELSE ; UNIX
+ ; all five arguments fit in registers, so a tail jump is enough
+ mov par5d, par3d ; IncludeMother in register
+ mov par4d, par2d ; NumSeeds
+ mov par3, par1 ; seeds
+ mov par2d, SFMTSize ; ThisSize
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTRandomInitByArray
+%ENDIF
+;SFMTgenRandomInitByArray ENDP
+
+
+; extern "C" double SFMTgenRandom();
+; Static-instance thunk: loads SFMTInstance into par1 and continues at SFMTRandom_reg
+; (the entry that skips the pointer alignment step).
+SFMTgenRandom: ; generate random float with 52 bits resolution
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTRandom_reg ; continue in SFMTRandom, internal entry
+;SFMTgenRandom ENDP
+
+
+; extern "C" long double SFMTgenRandomL();
+; Static-instance thunk: loads SFMTInstance into par1 and continues at SFMTRandomL_reg
+; (the entry that skips the pointer alignment step).
+SFMTgenRandomL: ; generate random float with 63 bits resolution
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTRandomL_reg ; continue in SFMTRandomL, internal entry
+;SFMTgenRandomL ENDP
+
+
+; extern "C" int SFMTgenIRandom (int min, int max);
+; Static-instance thunk: shifts (min, max) up by one parameter slot, puts the address
+; of SFMTInstance in par1 and tail-jumps to SFMTIRandom.
+SFMTgenIRandom:
+ mov par3d, par2d ; max
+ mov par2d, par1d ; min
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTIRandom ; continue in _SFMTIRandom
+;SFMTgenIRandom ENDP
+
+
+; extern "C" int SFMTgenIRandomX (int min, int max);
+; Static-instance thunk: shifts (min, max) up by one parameter slot, puts the address
+; of SFMTInstance in par1 and tail-jumps to SFMTIRandomX.
+SFMTgenIRandomX:
+ mov par3d, par2d ; max
+ mov par2d, par1d ; min
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTIRandomX ; continue in _SFMTIRandomX
+;SFMTgenIRandomX ENDP
+
+
+; extern "C" uint32_t SFMTgenBRandom();
+; Static-instance thunk: loads SFMTInstance into par1 and continues at SFMTBRandom_reg
+; (the entry that skips the pointer alignment step).
+SFMTgenBRandom: ; generate 32 random bits
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTBRandom_reg ; random bits
+;SFMTgenBRandom ENDP
+
+;END
diff --git a/contrib/libs/asmlib/strcat64.asm b/contrib/libs/asmlib/strcat64.asm new file mode 100644 index 0000000000..3c8a247e3e --- /dev/null +++ b/contrib/libs/asmlib/strcat64.asm @@ -0,0 +1,70 @@ +%include "defs.asm" + +;************************* strcat64.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2008-10-16
+; Description:
+; Faster version of the standard strcat function:
+; char * strcat(char *dest, const char * src);
+; Copies zero-terminated string from src to end of dest.
+;
+; Overriding standard function strcat:
+; The alias ?OVR_strcat is changed to _strcat in the object file if
+; it is desired to override the standard library function strcat.
+;
+; Optimization:
+; Uses optimized functions A_strlen and A_memcpy.
+;
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_strcat: function ; Function A_strcat
+global EXP(strcat): function ; ?OVR removed if standard function strcat overridden
+
+; Imported from strlen64.asm
+extern A_strlen
+
+; Imported from memcpy64.asm
+extern A_memcpy
+
+
+SECTION .text align=16
+
+; extern "C" char * A_strcat(char * dest, const char * src) {
+; memcpy(dest+strlen(dest), src, strlen(src)+1);
+; return dest
+; }
+; Appends the zero-terminated string src (terminator included) to the end of dest
+; and returns dest in rax. Implemented with two A_strlen calls and one A_memcpy call.
+
+; Function entry:
+A_strcat:
+EXP(strcat):
+
+; Parameter registers for the two supported calling conventions
+%IFDEF WINDOWS
+%define Rpar1 rcx ; function parameter 1
+%define Rpar2 rdx ; function parameter 2
+%define Rpar3 r8 ; function parameter 3
+%ENDIF
+%IFDEF UNIX
+%define Rpar1 rdi ; function parameter 1
+%define Rpar2 rsi ; function parameter 2
+%define Rpar3 rdx ; function parameter 3
+%ENDIF
+
+ push Rpar1 ; dest
+ push Rpar2 ; src
+ call A_strlen ; length of dest
+ push rax ; strlen(dest)
+ ; stack now: [rsp] = strlen(dest), [rsp+8] = src, [rsp+16] = dest
+ mov Rpar1, [rsp+8] ; src
+ call A_strlen ; length of src
+ pop Rpar1 ; strlen(dest)
+ pop Rpar2 ; src
+ add Rpar1, [rsp] ; dest + strlen(dest)
+ lea Rpar3, [rax+1] ; strlen(src)+1
+ call A_memcpy ; copy
+ pop rax ; return dest
+ ret
+
+;A_strcat ENDP
diff --git a/contrib/libs/asmlib/strcpy64.asm b/contrib/libs/asmlib/strcpy64.asm new file mode 100644 index 0000000000..c505c48be7 --- /dev/null +++ b/contrib/libs/asmlib/strcpy64.asm @@ -0,0 +1,66 @@ +%include "defs.asm" + +;************************* strcpy64.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2011-07-01
+; Description:
+; Faster version of the standard strcpy function:
+; char * A_strcpy(char * dest, const char * src);
+; Copies zero-terminated string from src to dest, including terminating zero.
+;
+; Overriding standard function strcpy:
+; The alias ?OVR_strcpy is changed to _strcpy in the object file if
+; it is desired to override the standard library function strcpy.
+;
+; Optimization:
+; Uses optimized functions A_strlen and A_memcpy. These functions allow
+; calling without proper stack alignment.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_strcpy: function ; Function A_strcpy
+global EXP(strcpy): function ; ?OVR removed if standard function memcpy overridden
+
+; Imported from strlen64.asm
+extern A_strlen
+
+; Imported from memcpy64.asm
+extern A_memcpy
+
+
+SECTION .text align=16
+
+; extern "C" char * A_strcpy(char * dest, const char * src) {
+; return memcpy(dest, src, strlen(src)+1);
+; }
+; Copies the zero-terminated string src, terminator included, to dest.
+; The return value is whatever A_memcpy returns, because A_memcpy is tail-called.
+
+; Function entry:
+A_strcpy:
+EXP(strcpy):
+
+; Parameter registers for the two supported calling conventions
+%IFDEF WINDOWS
+%define Rpar1 rcx ; function parameter 1
+%define Rpar2 rdx ; function parameter 2
+%define Rpar3 r8 ; function parameter 3
+%ENDIF
+%IFDEF UNIX
+%define Rpar1 rdi ; function parameter 1
+%define Rpar2 rsi ; function parameter 2
+%define Rpar3 rdx ; function parameter 3
+%ENDIF
+
+ push Rpar1 ; dest
+ push Rpar2 ; src
+ mov Rpar1, Rpar2 ; A_strlen takes the string in parameter 1
+ ; (A_strlen does not require stack alignment)
+ call A_strlen ; length of src
+ lea Rpar3,[rax+1] ; include terminating zero in length
+ pop Rpar2 ; src
+ pop Rpar1 ; dest
+ jmp A_memcpy ; copy and return
+
+;A_strcpy ENDP
diff --git a/contrib/libs/asmlib/stricmp64.asm b/contrib/libs/asmlib/stricmp64.asm new file mode 100644 index 0000000000..c568832b27 --- /dev/null +++ b/contrib/libs/asmlib/stricmp64.asm @@ -0,0 +1,86 @@ +%include "defs.asm" + +;************************* stricmpaz64.asm **********************************
+; Author: Agner Fog
+; Date created: 2008-12-05
+; Last modified: 2011-07-01
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Description:
+; Faster version of the standard stricmp or strcasecmp function:
+; int A_stricmp(const char *string1, const char *string2);
+; Compares two zero-terminated strings without case sensitivity.
+; Does not recognize locale-specific characters. A-Z are changed
+; to a-z before comparing, while other upper-case letters are not
+; converted but considered unique.
+;
+; Optimization:
+; SSE4.2 version not implemented because the gain is small.
+;
+; Copyright (c) 2008-2011 GNU General Public License www.gnu.org/licenses/gpl.html
+;******************************************************************************
+
+default rel
+
+global A_stricmp: function ; Function A_stricmp
+
+; ***************************************************************************
+; Define registers used for function parameters, used in 64-bit mode only
+; ***************************************************************************
+
+%IFDEF WINDOWS
+ %define par1 rcx ; first parameter
+ %define par2 rdx ; second parameter
+%ENDIF
+
+%IFDEF UNIX
+ %define par1 rdi ; first parameter
+ %define par2 rsi ; second parameter
+%ENDIF
+
+SECTION .text align=16
+
+; extern "C" int A_stricmp(const char *string1, const char *string2);
+; Compares two zero-terminated strings, treating A-Z and a-z as equal.
+; Returns 0 in eax when the strings match; otherwise the difference between the first
+; pair of non-matching bytes, each converted to lower case if it is in A-Z.
+; Clobbers par1, par2, eax and edx.
+
+A_stricmp:
+ sub par2, par1 ; par2 = string2 - string1, so [par1+par2] addresses string2
+
+L10: mov al, [par1] ; string1
+ cmp al, [par1+par2] ; string2
+ jne L20
+ inc par1
+ test al, al
+ jnz L10 ; continue with next byte
+
+ ; terminating zero found. Strings are equal
+ xor eax, eax
+ ret
+
+L20: ; bytes are different. check case
+ xor al, 20H ; toggle case
+ cmp al, [par1+par2]
+ jne L30
+ ; possibly differing only by case. Check if a-z
+ or al, 20H ; force lower case
+ sub al, 'a'
+ cmp al, 'z'-'a' ; unsigned compare: true only for a-z
+ ja L30 ; not a-z
+ ; a-z and differing only by case
+ inc par1
+ jmp L10 ; continue with next byte
+
+L30: ; bytes are different, even after changing case
+ ; both bytes are taken relative to 'A'; the offset cancels in the final subtraction
+ movzx eax, byte [par1] ; get original value again
+ sub eax, 'A'
+ cmp eax, 'Z' - 'A'
+ ja L40
+ add eax, 20H ; A-Z, make lower case
+L40: movzx edx, byte [par1+par2] ; byte from string2 (par2 is not needed afterwards)
+ sub edx, 'A'
+ cmp edx, 'Z' - 'A'
+ ja L50
+ add edx, 20H ; A-Z, make lower case
+L50: sub eax, edx ; subtract to get result
+ ret
+
+;A_stricmp END
diff --git a/contrib/libs/asmlib/strlen64.asm b/contrib/libs/asmlib/strlen64.asm new file mode 100644 index 0000000000..ff65c10127 --- /dev/null +++ b/contrib/libs/asmlib/strlen64.asm @@ -0,0 +1,86 @@ +%include "defs.asm" + +;************************** strlen64.asm **********************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2008-10-16
+; Description:
+; Faster version of the standard strlen function:
+; size_t strlen(const char * str);
+; Finds the length of a zero-terminated string of bytes, optimized for speed.
+;
+; Overriding standard function strlen:
+; The alias ?OVR_strlen is changed to _strlen in the object file if
+; it is desired to override the standard library function strlen.
+;
+; Calling conventions:
+; Stack alignment is not required. No shadow space or red zone used.
+; Called internally from strcpy and strcat without stack aligned.
+;
+; Optimization:
+; Uses XMM registers to read 16 bytes at a time, aligned.
+; Misaligned parts of the string are read from the nearest 16-bytes boundary
+; and the irrelevant part masked out. It may read both before the begin of
+; the string and after the end, but will never load any unnecessary cache
+; line and never trigger a page fault for reading from non-existing memory
+; pages because it never reads past the nearest following 16-bytes boundary.
+; It may, though, trigger any debug watch within the same 16-bytes boundary.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_strlen: function ; Function A_strlen
+global EXP(strlen): function ; ?OVR removed if standard function strlen overridden
+
+
+SECTION .text align=16
+
+; extern "C" size_t strlen (const char * s);
+; Returns the number of bytes before the first zero byte of s in rax.
+; Clobbers rcx, rdx, xmm0, xmm1 (and r8 in the Windows variant).
+
+; Common entry point; only the register setup differs between Windows and Unix
+A_strlen:
+EXP(strlen):
+
+%IFDEF WINDOWS
+ mov rax, rcx ; get pointer to string from rcx
+ mov r8, rcx ; copy pointer
+%define Rscopy r8 ; Copy of s
+
+%ELSE ; Unix
+ mov rax, rdi ; get pointer to string from rdi
+ mov ecx, edi ; copy pointer (lower 32 bits)
+%define Rscopy rdi ; Copy of s
+%ENDIF
+
+ ; rax = s, ecx = 32 bits of s
+ pxor xmm0, xmm0 ; set to zero
+ and ecx, 0FH ; lower 4 bits indicate misalignment
+ and rax, -10H ; align pointer by 16
+ movdqa xmm1, [rax] ; read from nearest preceding boundary
+ pcmpeqb xmm1, xmm0 ; compare 16 bytes with zero
+ pmovmskb edx, xmm1 ; get one bit for each byte result
+ shr edx, cl ; shift out false bits
+ shl edx, cl ; shift back again
+ bsf edx, edx ; find first 1-bit
+ jnz L2 ; found
+
+ ; Main loop, search 16 bytes at a time
+L1: add rax, 10H ; increment pointer by 16
+ movdqa xmm1, [rax] ; read 16 bytes aligned
+ pcmpeqb xmm1, xmm0 ; compare 16 bytes with zero
+ pmovmskb edx, xmm1 ; get one bit for each byte result
+ bsf edx, edx ; find first 1-bit
+ ; (moving the bsf out of the loop and using test here would be faster for long strings on old processors,
+ ; but we are assuming that most strings are short, and newer processors have higher priority)
+ jz L1 ; loop if not found
+
+L2: ; Zero-byte found. Compute string length
+ ; length = (aligned block address - s) + index of the zero byte inside the block
+ sub rax, Rscopy ; subtract start address
+ add rax, rdx ; add byte index
+ ret
+
+;A_strlen ENDP
diff --git a/contrib/libs/asmlib/substring64.asm b/contrib/libs/asmlib/substring64.asm new file mode 100644 index 0000000000..235b19a5f5 --- /dev/null +++ b/contrib/libs/asmlib/substring64.asm @@ -0,0 +1,75 @@ +%include "defs.asm" + +;************************* substring64.asm **********************************
+; Author: Agner Fog
+; Date created: 2011-07-18
+; Last modified: 2011-07-18
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Description:
+; Makes a substring of a zero-terminated ASCII string
+;
+; C++ prototype:
+; extern "C"
+; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
+; Makes a substring from source, starting at position pos (zero-based) and length
+; len and stores it in the array dest. It is the responsibility of the programmer
+; that the size of the dest array is at least len + 1.
+; The return value is the actual length of the substring. This may be less than
+; len if the length of source is less than pos + len.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses/gpl.html
+;******************************************************************************
+
+global A_substring: function ; Function _A_substring
+
+extern A_strlen
+extern A_memcpy
+
+SECTION .text
+
+; extern "C"
+; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
+
+%ifdef WINDOWS
+%define par1 rcx ; dest
+%define par2 rdx ; source
+%define par3 r8 ; pos
+%define par4 r9 ; len
+%else ; UNIX
+%define par1 rdi
+%define par2 rsi
+%define par3 rdx
+%define par4 rcx
+%endif
+
+; A_substring(dest, source, pos, len)
+; Copies at most len bytes of source, starting at offset pos, into dest and appends a
+; terminating zero. Returns the number of bytes copied in rax, which is
+; min(len, strlen(source) - pos), or 0 with dest set to "" when strlen(source) <= pos.
+A_substring:
+ ; preserve all four parameters across the A_strlen call
+ push par1
+ push par2
+ push par3
+ push par4
+ mov par1, par2 ; A_strlen takes the string in parameter 1
+ call A_strlen ; rax = strlen(source)
+ pop par4
+ pop par3
+ pop par2
+ pop par1
+ sub rax, par3 ; max length = strlen(source) - pos
+ jbe empty ; strlen(source) <= pos. Return empty string
+ cmp rax, par4
+ cmova rax, par4 ; min(len, maxlen)
+ add par2, par3 ; source + pos = source for memcpy
+ mov par3, rax ; length for memcpy
+ push rax ; new length
+ call A_memcpy
+ pop rcx ; new length = return value, rax = dest
+ mov byte [rcx+rax], 0 ; terminating zero
+ mov rax, rcx ; return new length
+ ret
+
+empty: ; return empty string
+ xor eax, eax ; return 0
+ mov byte [par1], al ; dest[0] = 0
+ ret
+
+;A_substring END
diff --git a/contrib/libs/asmlib/unalignedisfaster64.asm b/contrib/libs/asmlib/unalignedisfaster64.asm new file mode 100644 index 0000000000..eed68a1398 --- /dev/null +++ b/contrib/libs/asmlib/unalignedisfaster64.asm @@ -0,0 +1,188 @@ +%include "defs.asm" + +;************************* unalignedisfaster64.asm ******************************
+; Author: Agner Fog
+; Date created: 2011-07-09
+; Last modified: 2013-08-30
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" int UnalignedIsFaster(void);
+;
+; Description:
+; This function finds out if unaligned 16-bytes memory read is
+; faster than aligned read followed by an alignment shift (PALIGNR) on the
+; current CPU.
+;
+; Return value:
+; 0: Unaligned read is probably slower than alignment shift
+; 1: Unknown
+; 2: Unaligned read is probably faster than alignment shift
+;
+;
+; C++ prototype:
+; extern "C" int Store256BitIsFaster(void);
+;
+; Description:
+; This function finds out if a 32-bytes memory write is
+; faster than two 16-bytes writes on the current CPU.
+;
+; Return value:
+; 0: 32-bytes memory write is slower or AVX not supported
+; 1: Unknown
+; 2: 32-bytes memory write is faster
+;
+; Copyright (c) 2011 - 2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" int UnalignedIsFaster(void);
+
+global UnalignedIsFaster: function
+global Store256BitIsFaster: function
+extern CpuType
+extern InstructionSet
+
+
+SECTION .text
+
+; int UnalignedIsFaster(void)
+; Calls CpuType(&vendor, &family, &model) with three zero-initialised stack slots and
+; maps the result to: 0 = unaligned read probably slower, 1 = unknown,
+; 2 = unaligned read probably faster. Result is returned in eax.
+UnalignedIsFaster:
+
+ ; build the three output slots on the stack and pass their addresses
+%ifdef UNIX
+ push 0 ; vendor
+ mov rdi, rsp
+ push 0 ; family
+ mov rsi, rsp
+ push 0 ; model
+ mov rdx, rsp
+%else ; WINDOWS
+ push 0 ; vendor
+ mov rcx, rsp
+ push 0 ; family
+ mov rdx, rsp
+ push 0 ; model
+ mov r8, rsp
+%endif
+ call CpuType ; get vendor, family, model
+ pop rdx ; model
+ pop rcx ; family
+ pop r8 ; vendor
+ xor eax, eax ; return value
+ ; vendor code: 1 = Intel, 2 = AMD, 3 = VIA (tested by successive decrements)
+ dec r8d
+ jz Intel
+ dec r8d
+ jz AMD
+ dec r8d
+ jz VIA
+ ; unknown vendor
+ inc eax ; return 1
+ jmp Uend
+
+Intel: ; Unaligned read is faster on Intel Nehalem and later, but not Atom
+ ; Nehalem = family 6, model 1AH
+ ; Atom = family 6, model 1CH
+ ; Netburst = family 0FH
+ ; Future models are likely to be family 6, maybe > 6, model > 1C
+ cmp ecx, 6
+ jb Uend ; old Pentium 1, etc
+ cmp ecx, 0FH
+ je Uend ; old Netburst architecture
+ cmp edx, 1AH
+ jb Uend ; earlier than Nehalem
+ cmp edx, 1CH
+ je Uend ; Intel Atom
+ or eax, 2 ; Intel Nehalem and later, except Atom
+ jmp Uend
+
+AMD: ; AMD processors:
+ ; The PALIGNR instruction is slow on AMD Bobcat but fast on Jaguar
+ ; K10/Opteron = family 10H ; Use unaligned
+ ; Bobcat = family 14H ; PALIGNR is very slow. Use unaligned
+ ; Piledriver = family 15H ; Use unaligned
+ ; Jaguar = family 16H ; PALIGNR is fast. Use aligned (aligned is faster in most cases, but not all)
+ cmp ecx, 10H ; AMD K8 or earlier: use aligned
+ jb Uend
+ cmp ecx, 16H ; Jaguar: use aligned
+ je Uend
+ or eax, 2 ; AMD K10 or later: use unaligned
+ jmp Uend
+
+VIA: ; Unaligned read is not faster than PALIGNR on VIA Nano 2000 and 3000
+ cmp ecx, 0FH
+ jna Uend ; VIA Nano
+ inc eax ; Future versions: unknown
+ ;jmp Uend
+
+Uend: ret ; eax = 0, 1 or 2
+
+;UnalignedIsFaster ENDP
+
+
+; int Store256BitIsFaster(void)
+; Returns in eax: 0 = 32-byte store slower or InstructionSet() < 11, 1 = unknown,
+; 2 = 32-byte store faster. The decision uses InstructionSet() and then
+; CpuType(&vendor, &family, &model) with three zero-initialised stack slots.
+Store256BitIsFaster:
+ call InstructionSet
+ cmp eax, 11 ; AVX supported
+ jb S90 ; no AVX: return 0
+ ; build the three output slots on the stack and pass their addresses
+%ifdef UNIX
+ push 0 ; vendor
+ mov rdi, rsp
+ push 0 ; family
+ mov rsi, rsp
+ push 0 ; model
+ mov rdx, rsp
+%else ; WINDOWS
+ push 0 ; vendor
+ mov rcx, rsp
+ push 0 ; family
+ mov rdx, rsp
+ push 0 ; model
+ mov r8, rsp
+%endif
+ call CpuType ; get vendor, family, model
+ pop rdx ; model
+ pop rcx ; family
+ pop rax ; vendor
+
+ cmp eax, 1 ; Intel
+ je S_Intel
+ cmp eax, 2 ; AMD
+ je S_AMD
+ cmp eax, 3 ; VIA
+ je S_VIA
+ jmp S91 ; other vendor, not known
+
+S_Intel:cmp ecx, 6
+ jne S92 ; unknown family. possibly future model
+ ; model 2AH Sandy Bridge
+ ; model 3AH Ivy Bridge
+ ; model 3CH Haswell
+ ; Sandy Bridge and Ivy Bridge are slightly faster with 128 than with 256 bit moves on large data blocks
+ ; Haswell is much faster with 256 bit moves
+ cmp edx, 3AH
+ jbe S90 ; model <= 3AH: return 0
+ jmp S92 ; later models: return 2
+
+S_AMD: ; AMD
+ cmp ecx, 15H ; family 15h = Bulldozer, Piledriver
+ ja S92 ; assume future AMD families are faster
+ ; family 16H = Jaguar. 256 bit write is slightly faster
+ ; model 1 = Bulldozer is a little slower on 256 bit write
+ ; model 2 = Piledriver is terribly slow on 256 bit write
+ ; assume future models 3-4 are like Bulldozer
+ cmp edx, 4
+ jbe S90 ; models 0-4: return 0
+ jmp S91 ; later models: don't know
+
+S_VIA: jmp S91 ; don't know
+
+S90: xor eax, eax ; return 0
+ ret
+
+S91: mov eax, 1 ; return 1
+ ret
+
+S92: mov eax, 2 ; return 2
+ ret
+
+; Store256BitIsFaster ENDP
diff --git a/ydb/apps/ydb/CMakeLists.darwin-x86_64.txt b/ydb/apps/ydb/CMakeLists.darwin-x86_64.txt index 87e7b9d72a..832d38005c 100644 --- a/ydb/apps/ydb/CMakeLists.darwin-x86_64.txt +++ b/ydb/apps/ydb/CMakeLists.darwin-x86_64.txt @@ -19,6 +19,7 @@ target_link_libraries(ydb PUBLIC contrib-libs-cxxsupp yutil library-cpp-cpuid_check + contrib-libs-asmlib commands library-cpp-resource ) diff --git a/ydb/apps/ydb/CMakeLists.linux-aarch64.txt b/ydb/apps/ydb/CMakeLists.linux-aarch64.txt index 2ab9df1629..c3c69ca975 100644 --- a/ydb/apps/ydb/CMakeLists.linux-aarch64.txt +++ b/ydb/apps/ydb/CMakeLists.linux-aarch64.txt @@ -19,6 +19,7 @@ target_link_libraries(ydb PUBLIC contrib-libs-linux-headers contrib-libs-cxxsupp yutil + contrib-libs-asmlib commands library-cpp-resource ) diff --git a/ydb/apps/ydb/CMakeLists.linux-x86_64.txt b/ydb/apps/ydb/CMakeLists.linux-x86_64.txt index 9721d4dbf4..86e78d9ecb 100644 --- a/ydb/apps/ydb/CMakeLists.linux-x86_64.txt +++ b/ydb/apps/ydb/CMakeLists.linux-x86_64.txt @@ -20,6 +20,7 @@ target_link_libraries(ydb PUBLIC contrib-libs-cxxsupp yutil library-cpp-cpuid_check + contrib-libs-asmlib commands library-cpp-resource ) diff --git a/ydb/apps/ydb/CMakeLists.windows-x86_64.txt b/ydb/apps/ydb/CMakeLists.windows-x86_64.txt index 9bba9d0f4b..1919d40b1d 100644 --- a/ydb/apps/ydb/CMakeLists.windows-x86_64.txt +++ b/ydb/apps/ydb/CMakeLists.windows-x86_64.txt @@ -19,6 +19,7 @@ target_link_libraries(ydb PUBLIC contrib-libs-cxxsupp yutil library-cpp-cpuid_check + contrib-libs-asmlib commands library-cpp-resource ) |