author     pnv1 <pnv@ydb.tech>  2023-04-27 19:15:07 +0300
committer  pnv1 <pnv@ydb.tech>  2023-04-27 19:15:07 +0300
commit     a66c59109292f9e0fb44ede41adfdebe569e4df3 (patch)
tree       906b3d10274afd16e8e70c61ff416bff9075422e /contrib
parent     9ca91b40d6f45546e20a646d15590c0cc6cc9778 (diff)
download   ydb-a66c59109292f9e0fb44ede41adfdebe569e4df3.tar.gz
Switch to old asmlib to be able to build ydb cli without sse4
Diffstat (limited to 'contrib')
42 files changed, 9198 insertions, 0 deletions
diff --git a/contrib/libs/CMakeLists.darwin-x86_64.txt b/contrib/libs/CMakeLists.darwin-x86_64.txt index 8f27501754..dbdaed7276 100644 --- a/contrib/libs/CMakeLists.darwin-x86_64.txt +++ b/contrib/libs/CMakeLists.darwin-x86_64.txt @@ -8,6 +8,8 @@ add_subdirectory(antlr3_cpp_runtime) add_subdirectory(apache) +add_subdirectory(asmglibc) +add_subdirectory(asmlib) add_subdirectory(aws-sdk-cpp) add_subdirectory(base64) add_subdirectory(brotli) diff --git a/contrib/libs/CMakeLists.linux-aarch64.txt b/contrib/libs/CMakeLists.linux-aarch64.txt index 80cc88b2df..c67d278e53 100644 --- a/contrib/libs/CMakeLists.linux-aarch64.txt +++ b/contrib/libs/CMakeLists.linux-aarch64.txt @@ -8,6 +8,7 @@ add_subdirectory(antlr3_cpp_runtime) add_subdirectory(apache) +add_subdirectory(asmlib) add_subdirectory(aws-sdk-cpp) add_subdirectory(base64) add_subdirectory(brotli) diff --git a/contrib/libs/CMakeLists.linux-x86_64.txt b/contrib/libs/CMakeLists.linux-x86_64.txt index 797ac4fd05..185d96e891 100644 --- a/contrib/libs/CMakeLists.linux-x86_64.txt +++ b/contrib/libs/CMakeLists.linux-x86_64.txt @@ -8,6 +8,7 @@ add_subdirectory(antlr3_cpp_runtime) add_subdirectory(apache) +add_subdirectory(asmlib) add_subdirectory(aws-sdk-cpp) add_subdirectory(base64) add_subdirectory(brotli) diff --git a/contrib/libs/CMakeLists.windows-x86_64.txt b/contrib/libs/CMakeLists.windows-x86_64.txt index cd2534dc93..96f605c258 100644 --- a/contrib/libs/CMakeLists.windows-x86_64.txt +++ b/contrib/libs/CMakeLists.windows-x86_64.txt @@ -8,6 +8,7 @@ add_subdirectory(antlr3_cpp_runtime) add_subdirectory(apache) +add_subdirectory(asmlib) add_subdirectory(aws-sdk-cpp) add_subdirectory(base64) add_subdirectory(brotli) diff --git a/contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt b/contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..e2b4e37fbb --- /dev/null +++ b/contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,13 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(contrib-libs-asmglibc) +target_sources(contrib-libs-asmglibc PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmglibc/memchr.S +) diff --git a/contrib/libs/asmglibc/CMakeLists.txt b/contrib/libs/asmglibc/CMakeLists.txt new file mode 100644 index 0000000000..661b6431cc --- /dev/null +++ b/contrib/libs/asmglibc/CMakeLists.txt @@ -0,0 +1,11 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +endif() diff --git a/contrib/libs/asmglibc/memchr.S b/contrib/libs/asmglibc/memchr.S new file mode 100644 index 0000000000..b0a51115c4 --- /dev/null +++ b/contrib/libs/asmglibc/memchr.S @@ -0,0 +1,330 @@ +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. + Contributed by Intel Corporation. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" + +#ifdef USE_AS_WMEMCHR +# define MEMCHR wmemchr +# define PCMPEQ pcmpeqd +#else +# define MEMCHR memchr +# define PCMPEQ pcmpeqb +#endif + +/* fast SSE2 version with using pmaxub and 64 byte loop */ + + .text +ENTRY(MEMCHR) + movd %esi, %xmm1 + mov %edi, %ecx + +#ifdef USE_AS_WMEMCHR + test %rdx, %rdx + jz L(return_null) + shl $2, %rdx +#else + punpcklbw %xmm1, %xmm1 + test %rdx, %rdx + jz L(return_null) + punpcklbw %xmm1, %xmm1 +#endif + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %ecx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches_1) + sub $16, %rdx + jbe L(return_null) + add $16, %rdi + and $15, %ecx + and $-16, %rdi + add %rcx, %rdx + sub $64, %rdx + jbe L(exit_loop) + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + PCMPEQ %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + + sub %rax, %rdx + jbe L(return_null) + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void + possible addition overflow. 
*/ + neg %rcx + add $16, %rcx + sub %rcx, %rdx + jbe L(return_null) + add $16, %rdi + sub $64, %rdx + jbe L(exit_loop) + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + PCMPEQ %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + sub $64, %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + mov %rdi, %rcx + and $-64, %rdi + and $63, %ecx + add %rcx, %rdx + + .p2align 4 +L(align64_loop): + sub $64, %rdx + jbe L(exit_loop) + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + PCMPEQ %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + + PCMPEQ 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(exit_loop): + add $32, %edx + jle L(exit_loop_32) + + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) + sub $16, %edx + jle L(return_null) + + PCMPEQ 48(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + add $32, %edx + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + sub $16, %edx + jbe L(return_null) + + PCMPEQ 16(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + add %rdi, %rax + ret + + .p2align 4 +L(matches16_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 16(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches32_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 32(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches48_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 48(%rdi, 
%rax), %rax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret +END(MEMCHR) + +#ifndef USE_AS_WMEMCHR +strong_alias (memchr, __memchr) +libc_hidden_builtin_def(memchr) +#endif
\ No newline at end of file diff --git a/contrib/libs/asmglibc/sysdep.h b/contrib/libs/asmglibc/sysdep.h new file mode 100644 index 0000000000..1cfb71673e --- /dev/null +++ b/contrib/libs/asmglibc/sysdep.h @@ -0,0 +1,12 @@ +#if defined(__APPLE__) + #define ENTRY(X) .globl _## X; .align 1<<3; _ ## X: + #define END(X) + #define L(X) L ## X +#else + #define ENTRY(X) .globl X; .type X,@function; .align 1<<4; X: .cfi_startproc; + #define END(X) .cfi_endproc; .size X,.-X; + #define L(X) .L ## X +#endif + +#define libc_hidden_builtin_def(X) +#define strong_alias(X, Y) diff --git a/contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt b/contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..56e892f3a2 --- /dev/null +++ b/contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,192 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(contrib-libs-asmlib) +target_link_libraries(contrib-libs-asmlib PUBLIC + contrib-libs-asmglibc +) +target_sources(contrib-libs-asmlib PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/debugbreak64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cachesize64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedi64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/rdtsc64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcat64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/unalignedisfaster64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcpy64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/substring64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strlen64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cputype64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcmp64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memmove64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/stricmp64.asm + -I + ${CMAKE_BINARY_DIR} + -I + 
${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedv64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/physseed64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cpuid64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/round64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcpy64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/popcount64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dispatchpatch64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/procname64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memset64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) diff --git a/contrib/libs/asmlib/CMakeLists.linux-aarch64.txt b/contrib/libs/asmlib/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..d29b43c90a --- /dev/null +++ b/contrib/libs/asmlib/CMakeLists.linux-aarch64.txt @@ -0,0 +1,16 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(contrib-libs-asmlib) +target_link_libraries(contrib-libs-asmlib PUBLIC + contrib-libs-linux-headers +) +target_sources(contrib-libs-asmlib PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c +) diff --git a/contrib/libs/asmlib/CMakeLists.linux-x86_64.txt b/contrib/libs/asmlib/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..e4b9975e9f --- /dev/null +++ b/contrib/libs/asmlib/CMakeLists.linux-x86_64.txt @@ -0,0 +1,216 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(contrib-libs-asmlib) +target_link_libraries(contrib-libs-asmlib PUBLIC + contrib-libs-linux-headers +) +target_sources(contrib-libs-asmlib PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/sfmt64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mother64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mersenne64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/debugbreak64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cachesize64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedi64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/rdtsc64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcat64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/unalignedisfaster64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcpy64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/substring64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strlen64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cputype64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcmp64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memmove64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/stricmp64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedv64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/physseed64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cpuid64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/round64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + 
${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcpy64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/popcount64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dispatchpatch64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/procname64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memset64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) diff --git a/contrib/libs/asmlib/CMakeLists.txt b/contrib/libs/asmlib/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/contrib/libs/asmlib/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/contrib/libs/asmlib/CMakeLists.windows-x86_64.txt b/contrib/libs/asmlib/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..6e1a2adde6 --- /dev/null +++ b/contrib/libs/asmlib/CMakeLists.windows-x86_64.txt @@ -0,0 +1,213 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(contrib-libs-asmlib) +target_sources(contrib-libs-asmlib PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/sfmt64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mother64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mersenne64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/debugbreak64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cachesize64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedi64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/rdtsc64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcat64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/unalignedisfaster64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcpy64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/substring64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strlen64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cputype64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcmp64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memmove64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/stricmp64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedv64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/physseed64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cpuid64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/round64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcpy64.asm + -I + ${CMAKE_BINARY_DIR} + -I + 
${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/popcount64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dispatchpatch64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/procname64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) +target_yasm_source(contrib-libs-asmlib + PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memset64.asm + -I + ${CMAKE_BINARY_DIR} + -I + ${CMAKE_SOURCE_DIR} +) diff --git a/contrib/libs/asmlib/cachesize64.asm b/contrib/libs/asmlib/cachesize64.asm new file mode 100644 index 0000000000..c0bce8cf74 --- /dev/null +++ b/contrib/libs/asmlib/cachesize64.asm @@ -0,0 +1,335 @@ +%include "defs.asm" + +;************************* cachesize64.asm *************************************
+; Author: Agner Fog
+; Date created: 2011-07-11
+; Last modified: 2013-08-14
+; Description:
+; Determines the size of the data caches
+;
+; extern "C" site_t DataCacheSize(int level);
+; Input:
+; level: n = 1 - 4: level n data cache
+; 0 = largest level data cache
+; Return value: size in bytes of data cache
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2011-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
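
For illustration, a minimal C caller for DataCacheSize could look like the sketch below. This is an editor's example, not part of the commit; it assumes the size_t prototype documented above and that the asmlib objects are linked in.

#include <stdio.h>
#include <stddef.h>

extern size_t DataCacheSize(int level);     /* prototype as documented above */

int main(void) {
    printf("L1 data cache:      %zu bytes\n", DataCacheSize(1));
    printf("largest data cache: %zu bytes\n", DataCacheSize(0));  /* level 0 = largest level */
    return 0;
}
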
+
+default rel
+
+global DataCacheSize: function
+
+; Imported from cputype64.asm
+extern CpuType ; near. Determine CPU vendor
+
+struc data_layout
+ok: resd 2
+level1: resq 1
+level2: resq 1
+level3: resq 1
+level4: resq 1
+descriptortable: resd 60
+endstruc
+
+struc descriptor_record ; record for table of cache descriptors
+d_key: resb 1 ; key from cpuid instruction
+d_level: resb 1 ; cache level
+d_sizem: resb 1 ; size multiplier
+d_2pow: resb 1 ; power of 2. size = d_sizem << d_2pow
+endstruc
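
As a concrete reading of this record layout, the size encoded by one descriptor is d_sizem << d_2pow bytes; a small C sketch (illustration only, not code from the commit):

#include <stdint.h>

struct descriptor_record {      /* mirrors the NASM struc above */
    uint8_t d_key;              /* descriptor byte returned by cpuid */
    uint8_t d_level;            /* cache level 1-3 */
    uint8_t d_sizem;            /* size multiplier */
    uint8_t d_2pow;             /* size = d_sizem << d_2pow */
};

static uint64_t descriptor_size(const struct descriptor_record *r) {
    return (uint64_t)r->d_sizem << r->d_2pow;  /* e.g. key 2Ch: 1 << 15 = 32 kb */
}
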
+
+SECTION .data
+
+dataref: ; reference point
+ok_: DD 0, 0 ; 1 when values are determined
+level1_: DQ 0 ; level 1 data cache size
+level2_: DQ 0 ; level 2 data cache size
+level3_: DQ 0 ; level 3 data cache size
+level4_: DQ 0 ; level 4 data cache size
+numlevels equ 4 ; max level
+
+; From "Intel Processor Identification and the CPUID Instruction, Application note 485
+descriptortable_: ; table of Intel cache descriptors
+db 0Ah, 1, 1, 13 ; 8 kb L1 data cache
+db 0Ch, 1, 1, 14 ; 16 kb L1 data cache
+db 0Dh, 1, 1, 14 ; 16 kb L1 data cache
+db 21h, 2, 1, 18 ; 256 kb L2 data cache
+db 22h, 3, 1, 19 ; 512 kb L3 data cache
+db 23h, 3, 1, 20 ; 1 Mb L3 data cache
+db 25h, 3, 1, 21 ; 2 Mb L3 data cache
+db 29h, 3, 1, 22 ; 4 Mb L3 data cache
+db 2Ch, 1, 1, 15 ; 32 kb L1 data cache
+db 39h, 2, 1, 17 ; 128 kb L2 data cache
+db 3Ah, 2, 3, 16 ; 192 kb L2 data cache
+db 3Bh, 2, 1, 17 ; 128 kb L2 data cache
+db 3Ch, 2, 1, 18 ; 256 kb L2 data cache
+db 3Dh, 2, 3, 17 ; 384 kb L2 data cache
+db 3Eh, 2, 1, 19 ; 512 kb L2 data cache
+db 41h, 2, 1, 17 ; 128 kb L2 data cache
+db 42h, 2, 1, 18 ; 256 kb L2 data cache
+db 43h, 2, 1, 19 ; 512 kb L2 data cache
+db 44h, 2, 1, 20 ; 1 Mb L2 data cache
+db 45h, 2, 1, 21 ; 2 Mb L2 data cache
+db 46h, 3, 1, 22 ; 4 Mb L3 data cache
+db 47h, 3, 1, 23 ; 8 Mb L3 data cache
+db 48h, 2, 3, 20 ; 3 Mb L2 data cache
+db 49h, 2, 1, 22 ; 4 Mb L2 or L3 data cache
+db 4Ah, 3, 3, 21 ; 6 Mb L3 data cache
+db 4Bh, 3, 1, 23 ; 8 Mb L3 data cache
+db 4Ch, 3, 3, 22 ; 12 Mb L3 data cache
+db 4Dh, 3, 1, 24 ; 16 Mb L3 data cache
+db 4Eh, 2, 3, 21 ; 6 Mb L2 data cache
+db 60h, 1, 1, 14 ; 16 kb L1 data cache
+db 66h, 1, 1, 13 ; 8 kb L1 data cache
+db 67h, 1, 1, 14 ; 16 kb L1 data cache
+db 68h, 1, 1, 15 ; 32 kb L1 data cache
+db 78h, 2, 1, 20 ; 1 Mb L2 data cache
+db 79h, 2, 1, 17 ; 128 kb L2 data cache
+db 7Ah, 2, 1, 18 ; 256 kb L2 data cache
+db 7Bh, 2, 1, 19 ; 512 kb L2 data cache
+db 7Ch, 2, 1, 20 ; 1 Mb L2 data cache
+db 7Dh, 2, 1, 21 ; 2 Mb L2 data cache
+db 7Fh, 2, 1, 19 ; 512 kb L2 data cache
+db 82h, 2, 1, 18 ; 256 kb L2 data cache
+db 83h, 2, 1, 19 ; 512 kb L2 data cache
+db 84h, 2, 1, 20 ; 1 Mb L2 data cache
+db 85h, 2, 1, 21 ; 2 Mb L2 data cache
+db 86h, 2, 1, 19 ; 512 kb L2 data cache
+db 87h, 2, 1, 20 ; 1 Mb L2 data cache
+db 0D0h, 3, 1, 19 ; 512 kb L3 data cache
+db 0D1h, 3, 1, 20 ; 1 Mb L3 data cache
+db 0D2h, 3, 1, 21 ; 2 Mb L3 data cache
+db 0D6h, 3, 1, 20 ; 1 Mb L3 data cache
+db 0D7h, 3, 1, 21 ; 2 Mb L3 data cache
+db 0D8h, 3, 1, 22 ; 4 Mb L3 data cache
+db 0DCh, 3, 3, 19 ; 1.5 Mb L3 data cache
+db 0DDh, 3, 3, 20 ; 3 Mb L3 data cache
+db 0DEh, 3, 3, 21 ; 6 Mb L3 data cache
+db 0E2h, 3, 1, 21 ; 2 Mb L3 data cache
+db 0E3h, 3, 1, 22 ; 4 Mb L3 data cache
+db 0E4h, 3, 1, 23 ; 8 Mb L3 data cache
+db 0EAh, 3, 3, 22 ; 12 Mb L3 data cache
+db 0EBh, 3, 9, 21 ; 18 Mb L3 data cache
+db 0ECh, 3, 3, 23 ; 24 Mb L3 data cache
+descriptortablelength equ ($ - descriptortable_) / descriptor_record_size
+
+
+SECTION .text
+
+; extern "C" site_t DataCacheSize(int level);
+
+; Function entry:
+DataCacheSize:
+ push rbx
+ push r14
+%ifdef WINDOWS
+ push rsi
+ push rdi
+ mov r14d, ecx ; level
+%else ; UNIX
+ mov r14d, edi ; level
+%endif
+ ; check if called before
+ lea r9, [dataref]
+ cmp dword [r9+ok], 1 ; ok
+ je D800
+
+ ; find cpu vendor
+ push 0
+%ifdef WINDOWS
+ mov rcx, rsp
+ xor edx, edx
+ xor r8d, r8d
+%else ; UNIX
+ mov rdi, rsp
+ xor esi, esi
+ xor edx, edx
+%endif
+ call CpuType
+ lea r9, [dataref]
+ pop rax ; eax = vendor
+ dec eax
+ jz Intel
+ dec eax
+ jz AMD
+ dec eax
+ jz VIA
+ ; unknown vendor, try all methods
+ call IntelNewMethod
+ jnc D800 ; not carry = success
+ call AMDMethod
+ jnc D800 ; not carry = success
+ call IntelOldMethod
+ jmp D800 ; return whether success or not
+
+Intel: call IntelNewMethod
+ jnc D800 ; not carry = success
+ call IntelOldMethod
+ jmp D800 ; return whether success or not
+
+AMD: ; AMD and VIA use same method
+VIA: call AMDMethod
+
+D800: ; cache data known, get desired return value
+ xor eax, eax
+ cmp r14d, numlevels
+ ja D900
+ cmp r14d, 0
+ je D820
+ ; level = 1 .. numlevels
+ mov rax, [r9 + r14*8] ; size of selected cache
+ jmp D850
+D820: ; level = 0. Get size of largest level cache
+ mov rax, [r9 + level3] ; level3
+ test rax, rax
+ jnz D850
+ mov rax, [r9 + level2] ; level2
+ test rax, rax
+ jnz D850
+ mov eax, [r9 + level1] ; level1
+D850: mov dword [r9 + ok], 1 ; remember called, whether success or not
+D900:
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop r14
+ pop rbx
+ ret
+
+
+; Determine cache sizes by CPUID function 4
+; input: esi = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
+IntelNewMethod:
+ xor eax, eax
+ cpuid ; get number of CPUID functions
+ cmp eax, 4
+ jb I900 ; fail
+ xor esi, esi ; loop counter
+I100: mov eax, 4
+ mov ecx, esi
+ cpuid ; get cache parameters
+ mov edx, eax
+ and edx, 11111b ; cache type
+ jz I500 ; no more caches
+ cmp edx, 2
+ je I200 ; code cache, ignore
+ inc ecx ; sets
+ mov edx, ebx
+ shr edx, 22
+ inc edx ; ways
+ imul ecx, edx
+ mov edx, ebx
+ shr edx, 12
+ and edx, 1111111111b
+ inc edx ; partitions
+ imul ecx, edx
+ and ebx, 111111111111b
+ inc ebx ; line size
+ imul rcx, rbx ; calculated cache size (64 bit)
+ shr eax, 5
+ and eax, 111b ; cache level
+ cmp eax, numlevels
+ jna I180
+ mov eax, numlevels ; limit higher levels
+I180: mov [r9+rax*8], rcx ; store size of data cache level eax
+I200: inc esi
+ cmp esi, 100h ; avoid infinite loop
+ jb I100 ; next cache
+I500: ; loop finished
+ ; check if OK
+ mov eax, [r9+level1] ; level1
+ cmp eax, 1024
+I900: ret ; carry flag set if fail
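
The loop above is the usual CPUID leaf 4 enumeration: size = ways * partitions * line size * sets, each field plus one. A rough C equivalent, sketched with GCC/Clang's __get_cpuid_count from <cpuid.h> (an assumption of this illustration; the assembly itself issues cpuid directly):

#include <cpuid.h>
#include <stdint.h>

static uint64_t data_cache_size_leaf4(int wanted_level) {
    for (unsigned sub = 0; sub < 0x100; sub++) {            /* same loop bound as above */
        unsigned eax, ebx, ecx, edx;
        if (!__get_cpuid_count(4, sub, &eax, &ebx, &ecx, &edx))
            return 0;
        unsigned type = eax & 0x1f;                          /* 0 = no more caches */
        if (type == 0)
            break;
        if (type == 2)
            continue;                                        /* instruction cache: skip */
        unsigned level      = (eax >> 5) & 0x7;
        uint64_t ways       = ((ebx >> 22) & 0x3ff) + 1;
        uint64_t partitions = ((ebx >> 12) & 0x3ff) + 1;
        uint64_t line_size  = (ebx & 0xfff) + 1;
        uint64_t sets       = (uint64_t)ecx + 1;
        if ((int)level == wanted_level)
            return ways * partitions * line_size * sets;
    }
    return 0;
}
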
+
+; Determine cache sizes by CPUID function 2
+; input: esi = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
+IntelOldMethod:
+ xor eax, eax
+ cpuid ; get number of CPUID functions
+ cmp eax, 2
+ jb J900 ; fail
+ mov eax, 2
+ xor ecx, ecx
+ cpuid ; get 16 descriptor bytes in eax, ebx, ecx, edx
+ mov al, 0 ; al does not contain a descriptor
+ sub rsp, 16
+ mov [rsp], eax ; save all descriptors
+ mov [rsp+4], ebx
+ mov [rsp+8], ecx
+ mov [rsp+12], edx
+ mov edx, 15 ; loop counter
+ ; loop to read 16 descriptor bytes
+J100: mov al, byte [rsp+rdx]
+ ; find in table
+ mov ebx, descriptortablelength-1 ; loop counter
+ ; loop to search in descriptortable
+J200: cmp al, [r9 + descriptortable + rbx*4 + d_key]
+ jne J300
+ ; descriptor found
+ movzx eax, byte [r9 + descriptortable + rbx*4 + d_sizem]
+ mov cl, [r9 + descriptortable + rbx*4 + d_2pow]
+ shl eax, cl ; compute size
+ movzx ecx, byte [r9 + descriptortable + rbx*4 + d_level]
+ ; check that level = 1-3
+ cmp ecx, 3
+ ja J300
+ mov [r9+rcx*8], rax ; store size eax of data cache level ecx
+J300: dec ebx
+ jns J200 ; inner loop
+ dec edx
+ jns J100 ; outer loop
+ add rsp, 16 ; remove from stack
+ ; check if OK
+ mov eax, [r9 + level1]
+ cmp eax, 1024
+J900: ret ; carry flag set if fail
+
+
+; Determine cache sizes by CPUID function 80000005H - 80000006H
+; input: esi = pointer to dataref
+; output: values returned in dataref
+; carry flag = 0 on success
+AMDMethod:
+ mov eax, 80000000H
+ cpuid ; get number of CPUID functions
+ cmp eax, 6
+ jb K900 ; fail
+ mov eax, 80000005H
+ cpuid ; get L1 cache size
+ shr ecx, 24 ; L1 data cache size in kbytes
+ shl ecx, 10 ; L1 data cache size in bytes
+ mov [r9 + level1], ecx ; store L1 data cache size
+ mov eax, 80000006H
+ cpuid ; get L2 and L3 cache sizes
+ shr ecx, 16 ; L2 data cache size in kbytes
+ shl ecx, 10 ; L2 data cache size in bytes
+ mov [r9 + level2], ecx ; store L2 data cache size
+ mov ecx, edx
+ shr ecx, 18 ; L3 data cache size / 512 kbytes
+ shl rcx, 19 ; L3 data cache size in bytes
+%if 0 ; AMD manual is unclear:
+ ; do we have to increase the value if the number of ways is not a power of 2?
+ shr edx, 12
+ and edx, 1111b ; L3 associativity
+ cmp edx, 3
+ jb K100
+ test edx, 1
+ jz K100
+ ; number of ways is not a power of 2, multiply by 1.5 ?
+ mov rax, rcx
+ shr rax, 1
+ add rcx, rax
+%endif
+K100: mov [r9 + level3], rcx ; store L3 data cache size
+ ; check if OK
+ mov eax, [r9 + level1]
+ cmp eax, 1024
+K900: ret ; carry flag set if fail
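
For reference, the AMD/VIA path boils down to three bit fields; a C sketch using GCC/Clang's __get_cpuid (illustration only, and it ignores the commented-out associativity adjustment above):

#include <cpuid.h>
#include <stdint.h>

static void amd_cache_sizes(uint64_t *l1, uint64_t *l2, uint64_t *l3) {
    unsigned eax, ebx, ecx, edx;
    *l1 = *l2 = *l3 = 0;
    if (__get_cpuid(0x80000005u, &eax, &ebx, &ecx, &edx))
        *l1 = (uint64_t)(ecx >> 24) << 10;       /* ECX[31:24] = L1D size in kb */
    if (__get_cpuid(0x80000006u, &eax, &ebx, &ecx, &edx)) {
        *l2 = (uint64_t)(ecx >> 16) << 10;       /* ECX[31:16] = L2 size in kb */
        *l3 = (uint64_t)(edx >> 18) << 19;       /* EDX[31:18] = L3 size in 512 kb units */
    }
}
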
diff --git a/contrib/libs/asmlib/cpuid64.asm b/contrib/libs/asmlib/cpuid64.asm new file mode 100644 index 0000000000..95f1b5a22d --- /dev/null +++ b/contrib/libs/asmlib/cpuid64.asm @@ -0,0 +1,55 @@ +%include "defs.asm" + +;************************* cpuid64.asm *********************************
+; Author: Agner Fog
+; Date created: 2008-12-14
+; Last modified: 2011-07-01
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Description:
+; This function calls the CPUID instruction.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global cpuid_ex: function
+
+SECTION .text align=16
+
+; ********** cpuid_ex function **********
+; C++ prototype:
+; extern "C" void cpuid_ex (int abcd[4], int a, int c);
+; Input: a = eax, c = ecx
+; Output: abcd[0] = eax, abcd[1] = ebx, abcd[2] = ecx, abcd[3] = edx
+
+
+cpuid_ex:
+
+%IFDEF WINDOWS
+; parameters: rcx = abcd, edx = a, r8d = c
+ push rbx
+ xchg rcx, r8
+ mov eax, edx
+ cpuid ; input eax, ecx. output eax, ebx, ecx, edx
+ mov [r8], eax
+ mov [r8+4], ebx
+ mov [r8+8], ecx
+ mov [r8+12], edx
+ pop rbx
+%ENDIF
+%IFDEF UNIX
+; parameters: rdi = abcd, esi = a, edx = c
+ push rbx
+ mov eax, esi
+ mov ecx, edx
+ cpuid ; input eax, ecx. output eax, ebx, ecx, edx
+ mov [rdi], eax
+ mov [rdi+4], ebx
+ mov [rdi+8], ecx
+ mov [rdi+12], edx
+ pop rbx
+%ENDIF
+ ret
+;cpuid_ex END
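
A hypothetical C caller for cpuid_ex (not part of the diff; it just follows the prototype documented above):

#include <stdio.h>

extern void cpuid_ex(int abcd[4], int a, int c);

int main(void) {
    int regs[4];
    cpuid_ex(regs, 0, 0);                        /* leaf 0: max leaf and vendor string */
    printf("max standard leaf = %d\n", regs[0]);
    printf("vendor = %.4s%.4s%.4s\n",            /* EBX, EDX, ECX order */
           (char *)&regs[1], (char *)&regs[3], (char *)&regs[2]);
    return 0;
}
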
diff --git a/contrib/libs/asmlib/cputype64.asm b/contrib/libs/asmlib/cputype64.asm new file mode 100644 index 0000000000..633ebee86a --- /dev/null +++ b/contrib/libs/asmlib/cputype64.asm @@ -0,0 +1,127 @@ +%include "defs.asm" + +;************************* cputype64.asm **********************************
+; Author: Agner Fog
+; Date created: 2011-07-09
+; Last modified: 2011-07-09
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" void CpuType(int * vendor, int * family, int * model);
+;
+; Description:
+; This function finds the vendor, family and model number of the CPU
+; and returns the values through the pointers. If a pointer is zero
+; then the value is not returned.
+;
+; Vendor:
+; 0 = unknown
+; 1 = Intel
+; 2 = AMD
+; 3 = VIA/Centaur
+; 4 = Cyrix
+; 5 = NexGen
+;
+; Family: This is the sum of the family and extended family fields of the cpuid
+; Model: This is the model + (extended model << 8)
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" void CpuType(int * vendor, int * family, int * model);
+
+global CpuType: function
+
+
+SECTION .text
+
+CpuType:
+ push rbx
+%ifdef UNIX
+ mov r8, rdx
+%endif
+%ifdef WINDOWS
+ push rsi
+ push rdi
+ mov rdi, rcx
+ mov rsi, rdx
+%endif
+
+; parameters
+; vendor rdi
+; family rsi
+; model r8
+
+ xor r9d, r9d ; vendor
+ xor r10d, r10d ; family
+ xor r11d, r11d ; model
+
+ xor eax, eax
+ cpuid ; get vendor
+ ; ecx = last 4 characters of vendor string
+ ; ebx = first 4 characters of vendor string
+ cmp ecx, 'ntel' ; 'GenuineIntel'
+ je C110
+ cmp ecx, 'cAMD' ; 'AuthenticAMD'
+ je C120
+ cmp ebx, 'Cent' ; 'CentaurHauls'
+ je C130
+ cmp ebx, 'VIA ' ; 'VIA VIA VIA '
+ je C130
+ cmp ebx, 'Cyri' ; 'CyrixInstead'
+ je C140
+ cmp ebx, 'NexG' ; 'NexGenDriven'
+ je C150
+ jmp C200 ; other
+C110: or r9d, 1
+ jmp C200
+C120: or r9d, 2
+ jmp C200
+C130: or r9d, 3
+ jmp C200
+C140: or r9d, 4
+ jmp C200
+C150: or r9d, 5
+ ;jmp C200
+C200:
+
+ ; Get family and model
+ mov eax, 1
+ cpuid
+ mov ebx, eax
+ mov r10d, eax
+ shr ebx, 8
+ and ebx, 0FH ; Family
+ shr r10d, 20
+ and r10d, 0FFH ; Extended family
+ add r10d, ebx ; Family + extended family
+
+ mov r11d, eax
+ shr r11d, 4
+ and r11d, 0FH ; Model
+ shr eax, 12
+ and eax, 0F0H ; Extended model
+ or r11d, eax ; extended model | Model
+
+C300: ; return r9d = vendor, r10d = family, r11d = model
+ test rdi, rdi
+ jz C310
+ mov [rdi], r9d
+C310: test rsi, rsi
+ jz C320
+ mov [rsi], r10d
+C320: test r8, r8
+ jz C330
+ mov [r8], r11d
+C330: xor eax, eax
+ ; return
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop rbx
+ ret
+;CpuType ENDP
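
A hypothetical C caller for CpuType (editor's illustration; vendor codes as listed in the header: 1 = Intel, 2 = AMD, 3 = VIA/Centaur, 4 = Cyrix, 5 = NexGen):

#include <stdio.h>

extern void CpuType(int *vendor, int *family, int *model);

int main(void) {
    int vendor, family, model;
    CpuType(&vendor, &family, &model);
    printf("vendor=%d family=%d model=0x%x\n", vendor, family, model);
    return 0;
}
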
diff --git a/contrib/libs/asmlib/debugbreak64.asm b/contrib/libs/asmlib/debugbreak64.asm new file mode 100644 index 0000000000..ed2971cd24 --- /dev/null +++ b/contrib/libs/asmlib/debugbreak64.asm @@ -0,0 +1,33 @@ +%include "defs.asm" + +;************************* debugbreak64.asm **********************************
+; Author: Agner Fog
+; Date created: 2011-07-09
+; Last modified: 2011-07-09
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" void A_DebugBreak(void);
+;
+; Description:
+; Makes a debug breakpoint. Works only when running under a debugger
+;
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" void A_DebugBreak(void);
+
+global A_DebugBreak: function
+
+
+SECTION .text
+
+A_DebugBreak:
+ int3
+ nop
+ ret
+;A_DebugBreak ENDP
diff --git a/contrib/libs/asmlib/defs.asm b/contrib/libs/asmlib/defs.asm new file mode 100644 index 0000000000..db313e6cf1 --- /dev/null +++ b/contrib/libs/asmlib/defs.asm @@ -0,0 +1,22 @@ +%ifdef UNIX + %ifdef DARWIN + %define EXP(x) _ %+ x + %else + %define EXP(x) x + %endif +%else + %define EXP(x) _ %+ x + %define WINDOWS +%endif + +%define ALLOW_OVERRIDE 1 + +%ifdef WINDOWS + %define WEAK_SYM(x) global x +%else + %ifdef DARWIN + %define WEAK_SYM(x) global x + %else + %define WEAK_SYM(x) weak x + %endif +%endif diff --git a/contrib/libs/asmlib/dispatchpatch64.asm b/contrib/libs/asmlib/dispatchpatch64.asm new file mode 100644 index 0000000000..205fac543d --- /dev/null +++ b/contrib/libs/asmlib/dispatchpatch64.asm @@ -0,0 +1,303 @@ +%include "defs.asm" + +;*********************** dispatchpatch64.asm ********************************
+; Author: Agner Fog
+; Date created: 2007-07-20
+; Last modified: 2013-08-21
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" int __intel_cpu_indicator = 0;
+; extern "C" void __intel_cpu_indicator_init()
+;
+; Description:
+; Example of how to replace Intel CPU dispatcher in order to improve
+; compatibility of Intel function libraries with non-Intel processors.
+; Only works with static link libraries (*.lib, *.a), not dynamic libraries
+; (*.dll, *.so). Linking in this as an object file will override the functions
+; with the same name in the library.
+;
+; Copyright (c) 2007-2013 GNU LGPL License v. 3.0 www.gnu.org/licenses/lgpl.html
+;******************************************************************************
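
Conceptually, the replacement dispatcher below does no more than the following C (illustration only; the real override has to stay in assembly because it must preserve all registers and be linked in as an object with the exact library symbol names, so stand-in names with a _c suffix are used here):

extern int InstructionSet(void);                 /* from instrset64.asm */

int intel_cpu_indicator_c = 0;                   /* stand-in for __intel_cpu_indicator */

static const int itable_c[] = {                  /* same values as the itable below */
    1, 8, 8, 0x80, 0x200, 0x800, 0x1000, 0x1000, 0x2000, 0x2000,
    0x8000, 0x8000, 0x20000, 0x20000, 0x400000, 0x800000
};

void intel_cpu_indicator_init_c(void) {          /* stand-in for __intel_cpu_indicator_init */
    int level = InstructionSet();
    int max = (int)(sizeof itable_c / sizeof itable_c[0]) - 1;
    if (level > max)
        level = max;                             /* limit to table length */
    intel_cpu_indicator_c = itable_c[level];
}
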
+
+; extern InstructionSet: function
+%include "instrset64.asm" ; include code for InstructionSet function
+
+; InstructionSet function return value:
+; 4 or above = SSE2 supported
+; 5 or above = SSE3 supported
+; 6 or above = Supplementary SSE3
+; 8 or above = SSE4.1 supported
+; 9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+; 13 or above = AVX2 supported
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+; 15 or above = HLE + RTM supported
+
+
+global __intel_cpu_indicator
+global __intel_cpu_indicator_init
+
+
+SECTION .data
+intel_cpu_indicator@: ; local name
+__intel_cpu_indicator: dd 0
+
+; table of indicator values
+itable DD 1 ; 0: generic version, 80386 instruction set
+ DD 8, 8 ; 1, 2: MMX
+ DD 0x80 ; 3: SSE
+ DD 0x200 ; 4: SSE2
+ DD 0x800 ; 5: SSE3
+ DD 0x1000, 0x1000 ; 6, 7: SSSE3
+ DD 0x2000, 0x2000 ; 8, 9: SSE4.1
+ DD 0x8000, 0x8000 ; 10, 11: SSE4.2 and popcnt
+ DD 0x20000, 0x20000 ; 12, 13: AVX, pclmul, aes
+ DD 0x400000 ; 14: AVX2, F16C, BMI1, BMI2, LZCNT, FMA3
+ DD 0x800000 ; 15: HLE, RTM
+itablelen equ ($ - itable) / 4 ; length of table
+
+SECTION .text
+
+__intel_cpu_indicator_init:
+ push rax ; registers must be pushed
+ push rcx
+ push rdx
+ push r8
+ push r9
+ push r10
+ push r11
+ push rsi
+ push rdi
+ call InstructionSet
+ cmp eax, itablelen
+ jb L100
+ mov eax, itablelen - 1 ; limit to table length
+L100: lea rdx, [rel itable]
+ mov eax, [rdx + 4*rax]
+ mov [rel intel_cpu_indicator@], eax ; store in __intel_cpu_indicator
+ pop rdi
+ pop rsi
+ pop r11
+ pop r10
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+ pop rax
+ ret
+
+;__intel_cpu_indicator_init ENDP
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for Math Kernel Library (MKL),
+; version 10.2 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+WEAK_SYM(mkl_serv_cpu_detect)
+
+SECTION .data
+; table of indicator values
+; Note: the table is different in 32 bit and 64 bit mode
+
+mkltab DD 0, 0, 0, 0 ; 0-3: generic version, 80386 instruction set
+ DD 0 ; 4: SSE2
+ DD 1 ; 5: SSE3
+ DD 2, 2, 2, 2 ; 6-9: SSSE3
+ DD 3 ; 10: SSE4.2
+ DD 4, 4, 4 ; 11-13: AVX
+ DD 5 ; 14: AVX2, FMA3, BMI1, BMI2, LZCNT, PCLMUL
+mkltablen equ ($ - mkltab) / 4 ; length of table
+
+SECTION .text
+
+mkl_serv_cpu_detect:
+ push rcx ; Perhaps not needed
+ push rdx
+ push r8
+ push r9
+%ifdef WINDOWS
+ push rsi
+ push rdi
+%endif
+ call InstructionSet
+ cmp eax, mkltablen
+ jb M100
+ mov eax, mkltablen - 1 ; limit to table length
+M100:
+ lea rdx, [rel mkltab]
+ mov eax, [rdx + 4*rax]
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+ ret
+; end mkl_serv_cpu_detect
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for Vector Math Library (VML)
+; version 10.0 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+WEAK_SYM(mkl_vml_serv_cpu_detect)
+
+SECTION .data
+; table of indicator values
+; Note: the table is different in 32 bit and 64 bit mode
+
+vmltab DD 0, 0, 0, 0 ; 0-3: generic version, 80386 instruction set
+ DD 1, 1 ; 4-5: SSE2
+ DD 2, 2 ; 6-7: SSSE3
+ DD 3, 3 ; 8-9: SSE4.1
+ DD 4 ; 10: SSE4.2
+ DD 5, 5, 5 ; 11-13: AVX
+; DD 6 ??
+vmltablen equ ($ - vmltab) / 4 ; length of table
+
+SECTION .text
+
+mkl_vml_serv_cpu_detect:
+ push rcx ; Perhaps not needed
+ push rdx
+ push r8
+ push r9
+%ifdef WINDOWS
+ push rsi
+ push rdi
+%endif
+ call InstructionSet
+ cmp eax, vmltablen
+ jb V100
+ mov eax, vmltablen - 1 ; limit to table length
+V100:
+ lea rdx, [rel vmltab]
+ mov eax, [rdx + 4*rax]
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+ ret
+; end mkl_vml_serv_cpu_detect
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for __intel_cpu_feature_indicator
+; version 13 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global __intel_cpu_features_init
+global __intel_cpu_feature_indicator
+global __intel_cpu_fms_indicator
+global __intel_cpu_features_init_x
+global __intel_cpu_feature_indicator_x
+global __intel_cpu_fms_indicator_x
+
+SECTION .data
+; table of indicator values
+
+intel_cpu_feature_indicator@:
+__intel_cpu_feature_indicator:
+__intel_cpu_feature_indicator_x DD 0, 0
+intel_cpu_fms_indicator@:
+__intel_cpu_fms_indicator:
+__intel_cpu_fms_indicator_x: DD 0, 0
+
+
+feattab DD 1 ; 0 default
+ DD 0BH ; 1 MMX
+ DD 0FH ; 2 conditional move and FCOMI supported
+ DD 3FH ; 3 SSE
+ DD 7FH ; 4 SSE2
+ DD 0FFH ; 5 SSE3
+ DD 1FFH, 1FFH ; 6-7 Supplementary SSE3
+ DD 3FFH ; 8 SSE4.1
+ DD 0BFFH ; 9 POPCNT
+ DD 0FFFH ; 10 SSE4.2
+ DD 10FFFH ; 11 AVX
+ DD 16FFFH ; 12 PCLMUL and AES
+ DD 816FFFH ; 13 AVX2
+ DD 9DEFFFH ; 14 FMA3, F16C, BMI1, BMI2, LZCNT
+ DD 0FDEFFFH ; 15 HLE, RTM
+
+feattablen equ ($ - feattab) / 4 ; length of table
+
+SECTION .text
+
+__intel_cpu_features_init:
+__intel_cpu_features_init_x:
+ push rbx
+ push rcx ; Perhaps not needed
+ push rdx
+ push r8
+ push r9
+%ifdef WINDOWS
+ push rsi
+ push rdi
+%endif
+ call InstructionSet
+ cmp eax, feattablen
+ jb F100
+ mov eax, feattablen - 1 ; limit to table length
+F100:
+ lea rdx, [rel feattab]
+ mov ebx, [rdx + 4*rax] ; look up in table
+ push rbx
+ mov eax, 1
+ cpuid
+ pop rbx
+ bt ecx, 22 ; MOVBE
+ jnc F200
+ or ebx, 1000H
+F200: mov [intel_cpu_feature_indicator@], rbx
+
+ ; get family and model
+ mov edx, eax
+ and eax, 0FH ; stepping bit 0-3
+ mov ecx, edx
+ shr ecx, 4
+ and ecx, 0FH ; model
+ mov ebx, edx
+ shr ebx, 12
+ and ebx, 0F0H ; x model
+ or ecx, ebx ; full model
+ mov ah, cl ; model bit 8 - 15
+ mov ecx, edx
+ shr ecx, 8
+ and ecx, 0FH ; family
+ mov ebx, edx
+ shr ebx, 20
+ and ebx, 0FFH ; x family
+ add ecx, ebx ; full family
+ shl ecx, 16
+ or eax, ecx ; full family bit 16 - 23
+ mov [intel_cpu_fms_indicator@], eax
+
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+ pop rbx
+ ret
+; end __intel_cpu_features_init
+
+
+
+
diff --git a/contrib/libs/asmlib/divfixedi64.asm b/contrib/libs/asmlib/divfixedi64.asm new file mode 100644 index 0000000000..bf8ab137a9 --- /dev/null +++ b/contrib/libs/asmlib/divfixedi64.asm @@ -0,0 +1,173 @@ +%include "defs.asm" + +;************************* divfixedi64.asm *********************************
+; Author: Agner Fog
+; Date created: 2011-07-22
+; Last modified: 2011-07-22
+;
+; Function prototypes:
+; void setdivisori32(int buffer[2], int d);
+; int dividefixedi32(const int buffer[2], int x);
+; void setdivisoru32(uint32_t buffer[2], uint32_t d);
+; uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
+;
+; Description:
+; Functions for fast repeated integer division by the same divisor, signed
+; and unsigned 32-bit integer versions. The divisor must be positive.
+;
+; The setdivisor functions calculate the reciprocal divisor and shift counts,
+; the dividefixed functions do the division by multiplication and shift.
+;
+; The methods used are described by:
+; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
+;
+; Mathematical formula, unsigned division:
+; x = dividend
+; d = divisor
+; n = integer size, bits
+; L = ceil(log2(d))
+; m = 1 + 2^n * (2^L-d) / d [2^L should overflow to 0 if L = n]
+; sh1 = min(L,1)
+; sh2 = max(L-1,0)
+; t = m*x >> n [high part of unsigned multiplication]
+; x/d = (((x-t) >> sh1) + t) >> sh2
+;
+; Mathematical formula, signed division:
+; x = dividend
+; d = abs(divisor)
+; n = integer size, bits
+; L = ceil(log2(d))
+; L = max(L,1)
+; m = 1 + 2^(n+L-1)/d - 2^n [division should overflow to 0 if d = 1]
+; sh1 = L-1
+; q = x + (m*x >> n) [high part of signed multiplication]
+; q = (q >> sh1) - (x<0 ? -1 : 0)
+; if (divisor < 0) q = -q [negative divisor not supported in present implementation]
+; x/d = q
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
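
The unsigned formulas above translate directly into C; a sketch of the method (an illustration of the algorithm, not of this file's exact register-level behavior):

#include <stdint.h>

typedef struct { uint32_t m; int sh1, sh2; } udiv32_t;

static udiv32_t setdivisoru32_c(uint32_t d) {              /* requires d > 0 */
    int L = 0;
    while ((1ull << L) < d)
        L++;                                               /* L = ceil(log2(d)) */
    udiv32_t r;
    r.m   = (uint32_t)(1 + ((1ull << 32) * ((1ull << L) - d)) / d);
    r.sh1 = L < 1 ? L : 1;                                 /* min(L,1) */
    r.sh2 = L > 1 ? L - 1 : 0;                             /* max(L-1,0) */
    return r;
}

static uint32_t dividefixedu32_c(udiv32_t b, uint32_t x) {
    uint32_t t = (uint32_t)(((uint64_t)b.m * x) >> 32);    /* high part of m*x */
    return (((x - t) >> b.sh1) + t) >> b.sh2;              /* x/d */
}
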
+
+%IFDEF WINDOWS
+%define par1 rcx ; function parameter 1
+%define par2 edx ; function parameter 2
+%define buf r9 ; copy of function parameter 1: buffer
+%define rx r8
+%define rxd r8d ; d or x
+%ELSE ; UNIX
+%define par1 rdi ; function parameter 1
+%define par2 esi ; function parameter 2
+%define buf rdi ; function parameter 1: buffer
+%define rx rsi
+%define rxd esi ; d or x
+%ENDIF
+
+
+section .text
+
+; extern "C" void setdivisori32(int buffer[2], int d);
+; 32 bit signed
+
+global setdivisori32: function
+setdivisori32:
+%IFDEF WINDOWS
+ mov rxd, edx ; x
+ mov buf, rcx ; buffer
+%ENDIF
+ dec rxd ; rxd = r8d or esi
+ mov ecx, -1 ; value for bsr if rxd = 0 (assuming bsr leaves the destination unchanged when the source is 0, which holds on Intel, AMD and VIA processors)
+ bsr ecx, rxd ; floor(log2(d-1))
+ inc rxd
+ js H120 ; d < 0. Generate error
+ inc ecx ; L = ceil(log2(d))
+ sub ecx, 1 ; shift count = L - 1
+ adc ecx, 0 ; avoid negative shift count
+ xor eax, eax
+ mov edx, 1
+ cmp rxd, edx
+ je H110 ; avoid overflow when d = 1
+ shl edx, cl
+ div rxd
+H110: inc eax
+ mov [buf], eax ; multiplier
+ mov [buf+4], ecx ; shift count
+ ret
+
+H120: ; d <= 0 not supported. Generate error
+ mov edx, 1
+ div edx ; will overflow
+ ud2
+
+
+; extern "C" int dividefixedi32(int buffer[2], int x);
+global dividefixedi32: function
+dividefixedi32:
+%IFDEF WINDOWS
+ mov eax, edx
+ mov rxd, edx ; x
+ mov buf, rcx ; buffer
+%ELSE
+ mov eax, esi
+%ENDIF
+ imul dword [buf] ; m
+ lea eax, [rdx+rx] ; rx = r8 or rsi
+ mov ecx, [buf+4] ; shift count
+ sar eax, cl
+ sar rxd, 31 ; sign(x)
+ sub eax, rxd
+ ret
+
+
+;extern "C" void setdivisoru32(int buffer[2], int d);
+; 32 bit unsigned
+
+global setdivisoru32: function
+setdivisoru32:
+%IFDEF WINDOWS
+ mov rxd, edx ; x
+ mov buf, rcx ; buffer
+%ENDIF
+ dec rxd ; rxd = r8d or esi
+ mov ecx, -1 ; value for bsr if r8d = 0
+ bsr ecx, rxd ; floor(log2(d-1))
+ inc rxd
+ inc ecx ; L = ceil(log2(d))
+ mov edx, 1
+ shl rdx, cl ; 2^L (64 bit shift because cl may be 32)
+ sub edx, rxd
+ xor eax, eax
+ div rxd
+ inc eax
+ mov [buf], eax ; multiplier
+ sub ecx, 1
+ setae dl
+ movzx edx, dl ; shift1
+ seta al
+ neg al
+ and al,cl
+ movzx eax, al ; shift 2
+ shl eax, 8
+ or eax, edx
+ mov [buf+4], eax ; shift 1 and shift 2
+ ret
+
+;extern "C" int dividefixedu32(int buffer[2], int x);
+global dividefixedu32: function ; unsigned
+dividefixedu32:
+%IFDEF WINDOWS
+ mov eax, edx
+ mov rxd, edx ; x
+ mov buf, rcx ; buffer
+%ELSE
+ mov eax, esi
+%ENDIF
+ mul dword [buf] ; m
+ sub rxd, edx ; x-t
+ mov ecx, [buf+4] ; shift 1 and shift 2
+ shr rxd, cl
+ lea eax, [rx+rdx]
+ shr ecx, 8
+ shr eax, cl
+ ret
diff --git a/contrib/libs/asmlib/divfixedv64.asm b/contrib/libs/asmlib/divfixedv64.asm new file mode 100644 index 0000000000..a4f0e177ec --- /dev/null +++ b/contrib/libs/asmlib/divfixedv64.asm @@ -0,0 +1,498 @@ +%include "defs.asm" + +;************************* divfixedv64.asm *********************************
+; Author: Agner Fog
+; Date created: 2011-07-25
+; Last modified: 2012-03-10
+;
+; Function prototypes:
+; void setdivisorV8i16(__m128i buf[2], int16_t d);
+; void setdivisorV8u16(__m128i buf[2], uint16_t d);
+; void setdivisorV4i32(__m128i buf[2], int32_t d);
+; void setdivisorV4u32(__m128i buf[2], uint32_t d);
+;
+; __m128i dividefixedV8i16(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV4i32(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
+;
+; Alternative versions for VectorClass.h:
+; (These versions pack all parameters into a single register)
+; __m128i setdivisor8s(int16_t d);
+; __m128i setdivisor8us(uint16_t d);
+; __m128i setdivisor4i(int32_t d);
+; __m128i setdivisor4ui(uint32_t d);
+;
+; Description:
+; Functions for integer vector division by the same divisor, signed
+; and unsigned 16-bit and 32-bit integer versions.
+;
+; The setdivisor functions calculate the reciprocal divisor and shift counts,
+; the dividefixed functions do the division by multiplication and shift of the
+; vector elements of packed 16-bit or 32-bit signed or unsigned integers.
+;
+; The divisor must be positive. A zero divisor generates a divide by zero error.
+; A negative divisor generates a division overflow error. To divide by a negative
+; divisor, change the sign of the divisor and the result.
+;
+; The methods used are described in this article:
+; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
+;
+; Mathematical formula, unsigned division:
+; x = dividend
+; d = divisor
+; n = integer size, bits
+; L = ceil(log2(d))
+; m = 1 + 2^n * (2^L-d) / d [2^L should overflow to 0 if L = n]
+; sh1 = min(L,1)
+; sh2 = max(L-1,0)
+; t = m*x >> n [high part of unsigned multiplication]
+; x/d = (((x-t) >> sh1) + t) >> sh2
+;
+; Mathematical formula, signed division:
+; x = dividend
+; d = abs(divisor)
+; n = integer size, bits
+; L = ceil(log2(d))
+; L = max(L,1)
+; m = 1 + 2^(n+L-1)/d - 2^n [division should overflow to 0 if d = 1]
+; sh1 = L-1
+; q = x + (m*x >> n) [high part of signed multiplication]
+; q = (q >> sh1) - (x<0 ? -1 : 0)
+; if (divisor < 0) q = -q [negative divisor not supported in present implementation]
+; x/d = q
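+;
+; A minimal C sketch of the signed method for n = 32 (illustration added here,
+; not part of the original asmlib sources; helper names are made up; requires
+; d > 0 and arithmetic >> on negative ints, as x86 compilers provide):
+;
+;   void setdiv_i32(int32_t d, int32_t *m, uint32_t *sh1) {
+;       uint32_t L = 0;
+;       while ((1u << L) < (uint32_t)d) L++;          // L = ceil(log2(d))
+;       if (L < 1) L = 1;                             // L = max(L,1)
+;       // 1 + 2^(32+L-1)/d - 2^32: the subtraction is the 32-bit wraparound,
+;       // and the quotient wraps to 0 when d == 1 as required
+;       *m   = (int32_t)(uint32_t)(1 + (1ull << (31 + L)) / (uint32_t)d);
+;       *sh1 = L - 1;
+;   }
+;   int32_t div_i32(int32_t m, uint32_t sh1, int32_t x) {
+;       int32_t q = x + (int32_t)(((int64_t)m * x) >> 32);    // x + high part of m*x
+;       return (q >> sh1) - (x < 0 ? -1 : 0);
+;   }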
+;
+; Copyright (c) 2011 - 2012 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+%IFDEF WINDOWS
+%define par1 rcx ; function parameter 1
+%define par1d ecx
+%define par1w cx
+%define par2 rdx ; function parameter 2
+%define par2d edx
+%define par2w dx
+%define buf r8 ; pointer to buffer
+%ENDIF
+%IFDEF UNIX
+%define par1 rdi ; function parameter 1
+%define par1d edi
+%define par1w di
+%define par2 rsi ; function parameter 2
+%define par2d esi
+%define par2w si
+%define buf rdi ; pointer to buffer
+%ENDIF
+
+
+; Imported from instrset64.asm:
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+section .text align = 16
+
+;******************************************************************************
+; 16 bit signed integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor8s(int16_t d);
+; vector of 8 x 16 bit signed integers
+
+global setdivisor8s: function
+setdivisor8s:
+ push rbx
+ movsx ebx, par1w ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ js H120 ; Generate error if d < 0. (error for d=0 will come in the div instruction)
+ inc ecx ; L = ceil(log2(d))
+ sub ecx, 1 ; shift count = L - 1
+ adc ecx, 0 ; avoid negative shift count
+ xor eax, eax
+ mov edx, 1
+ cmp ebx, edx
+ je H110 ; avoid division overflow when d = 1
+ shl edx, cl
+ div bx ; 2^(16+L-1)/d
+H110: inc eax
+ movd xmm0, eax ; multiplier
+ pshuflw xmm0, xmm0, 0 ; broadcast into lower 4 words
+ movd xmm1, ecx ; shift count
+ punpcklqdq xmm0, xmm1 ; insert shift count into upper half
+ pop rbx
+ ret
+H120: ; d < 0 not supported. Generate error
+ mov edx, 1
+ div edx
+ ud2
+; setdivisor8s end
+
+
+; extern "C" void setdivisorV8i16(__m128i buf[2], int16_t d);
+; vector of 8 x 16 bit signed integers
+
+global setdivisorV8i16: function
+setdivisorV8i16:
+ push par1 ; buf
+ mov par1d, par2d ; d
+ call setdivisor8s
+ pop rax ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [rax], xmm0 ; multiplier
+ movdqa [rax+16], xmm1 ; shift count is still in xmm1
+ ret
+; setdivisorV8i16 end
+
+
+; extern "C" int dividefixedV8i16(const __m128i buf[2], __m128i x);
+global dividefixedV8i16: function
+
+dividefixedV8i16:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF WINDOWS
+ movdqa xmm0, [par2] ; x
+%ENDIF
+ movdqa xmm1, xmm0 ; x
+ pmulhw xmm0, [par1] ; multiply high signed words
+ paddw xmm0, xmm1
+ movd xmm2, [par1+16] ; shift count
+ psraw xmm0, xmm2 ; shift right arithmetic
+ psraw xmm1, 15 ; sign of x
+ psubw xmm0, xmm1
+ ret
+;dividefixedV8i16 end
+
+
+
+;******************************************************************************
+; 16 bit unsigned integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor8us(uint16_t d);
+; vector of 8 x 16 bit unsigned integers
+
+align 16
+global setdivisor8us: function
+setdivisor8us:
+ push rbx
+ movzx ebx, par1w ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ inc ecx ; L = ceil(log2(d))
+ mov edx, 1
+ shl edx, cl ; 2^L [32-bit shift to allow overflow]
+ sub edx, ebx
+ xor eax, eax
+ div bx
+ inc eax
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 0 ; broadcast into lower 4 words
+ sub ecx, 1
+ setae dl
+ movzx edx, dl ; shift 1
+ seta al
+ neg al
+ and al,cl
+ movzx eax, al ; shift 2
+ movd xmm1, edx ; shift 1
+ movd xmm2, eax ; shift 2
+ punpckldq xmm1, xmm2 ; combine into two dwords
+ punpcklqdq xmm0, xmm1 ; multipliers, shift1, shift2
+ pop rbx
+ ret
+; setdivisor8us end
+
+
+;extern "C" void setdivisorV8u16(__m128i buf[2], uint16_t d);
+; 8 x 16 bit unsigned
+
+global setdivisorV8u16: function
+setdivisorV8u16:
+ push par1 ; buf
+ mov par1d, par2d ; d
+ call setdivisor8us
+ pop rax ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [rax], xmm0 ; multiplier
+ movdqa [rax+16], xmm1 ; shift counts are still in xmm1
+ ret
+; setdivisorV8u16 end
+
+
+;extern "C" __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
+global dividefixedV8u16: function
+
+align 16
+dividefixedV8u16:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF WINDOWS
+ movdqa xmm0, [par2] ; x
+%ENDIF
+ movdqa xmm1, xmm0 ; x
+ pmulhuw xmm0, [par1] ; multiply high unsigned words
+ psubw xmm1, xmm0
+ movd xmm2, [par1+16] ; shift1
+ psrlw xmm1, xmm2
+ paddw xmm0, xmm1
+ movd xmm2, [par1+20] ; shift2
+ psrlw xmm0, xmm2
+ ret
+;dividefixedV8u16 end
+
+
+
+;******************************************************************************
+; 32 bit signed integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor4i(int32_t d);
+; vector of 4 x 32 bit signed integers
+
+align 16
+global setdivisor4i: function
+setdivisor4i:
+ push rbx
+ mov ebx, par1d ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ js K120 ; Generate error if d < 0. (error for d=0 will come in the div instruction)
+ inc ecx ; L = ceil(log2(d))
+ sub ecx, 1 ; shift count = L - 1
+ adc ecx, 0 ; avoid negative shift count
+ xor eax, eax
+ mov edx, 1
+ cmp ebx, edx
+ je K110 ; avoid division overflow when d = 1
+ shl edx, cl
+ div ebx ; 2^(32+L-1)/d
+K110: inc eax
+ movd xmm0, eax ; multiplier
+ pshufd xmm0, xmm0, 0 ; broadcast into 4 dwords
+ movd xmm1, ecx ; shift count
+ punpcklqdq xmm0, xmm1 ; insert shift count into upper half
+ pop rbx
+ ret
+
+K120: ; d < 0 not supported. Generate error
+ mov edx, 1
+ div edx
+ ud2
+; setdivisor4i end
+
+
+; extern "C" void setdivisorV4i32(__m128i buf[2], int32_t d);
+; vector of 4 x 32 bit signed integers
+
+global setdivisorV4i32: function
+setdivisorV4i32:
+ push par1 ; buf
+ mov par1d, par2d ; d
+ call setdivisor4i
+ pop rax ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [rax], xmm0 ; multiplier
+ movdqa [rax+16], xmm1 ; shift count is still in xmm1
+ ret
+; setdivisorV4i32 end
+
+
+; extern "C" int dividefixedV4i32(const __m128i buf[2], __m128i x);
+global dividefixedV4i32: function
+
+; Direct entries to CPU-specific versions
+global dividefixedV4i32SSE2: function
+global dividefixedV4i32SSE41: function
+
+align 8
+dividefixedV4i32: ; function dispatching
+ jmp near [dividefixedV4i32Dispatch] ; Go to appropriate version, depending on instruction set
+
+align 16
+dividefixedV4i32SSE41:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF WINDOWS
+ movdqa xmm0,[par2] ; x
+%ENDIF
+ movdqa xmm1, xmm0 ; x
+ movdqa xmm2, xmm0 ; x
+ movdqa xmm3, [par1] ; multiplier
+ pmuldq xmm0, xmm3 ; 32 x 32 -> 64 bit signed multiplication of x[0] and x[2]
+ psrlq xmm0, 32 ; high dword of result 0 and 2
+ psrlq xmm1, 32 ; get x[1] and x[3] into position for multiplication
+ pmuldq xmm1, xmm3 ; 32 x 32 -> 64 bit signed multiplication of x[1] and x[3]
+ pcmpeqd xmm3, xmm3
+ psllq xmm3, 32 ; generate mask of dword 1 and 3
+ pand xmm1, xmm3 ; high dword of result 1 and 3
+ por xmm0, xmm1 ; combine all four results into one vector
+ paddd xmm0, xmm2
+ movd xmm3, [par1+16] ; shift count
+ psrad xmm0, xmm3 ; shift right arithmetic
+ psrad xmm2, 31 ; sign of x
+ psubd xmm0, xmm2
+ ret
+;dividefixedV4i32SSE41 end
+
+dividefixedV4i32SSE2:
+; I have tried changing the sign and using pmuludq, but got a rounding error (gives 9/10 = 1).
+; This solution, with 4 separate multiplications, is probably faster anyway despite store forwarding stall
+ push rbp
+ mov rbp, rsp
+%IFDEF WINDOWS
+ movdqa xmm0,[par2] ; x
+ mov buf, par1
+%ENDIF
+ sub rsp, 16 ; allocate stack space
+ and rsp, -16 ; stack should be aligned already. align anyway to be safe
+ movdqa [rsp], xmm0 ; store x
+ movdqa xmm2, xmm0 ; x
+ mov ecx, [buf] ; multiplier
+ ; do four signed high multiplications
+ mov eax, [rsp]
+ imul ecx
+ mov [rsp], edx
+ mov eax, [rsp+4]
+ imul ecx
+ mov [rsp+4], edx
+ mov eax, [rsp+8]
+ imul ecx
+ mov [rsp+8], edx
+ mov eax, [rsp+12]
+ imul ecx
+ mov [rsp+12], edx
+ movdqa xmm0, [rsp] ; x*m vector
+ paddd xmm0, xmm2
+ movd xmm3, [buf+16] ; shift count
+ psrad xmm0, xmm3 ; shift right arithmetic
+ psrad xmm2, 31 ; sign of x
+ psubd xmm0, xmm2
+ mov rsp, rbp
+ pop rbp
+ ret
+;dividefixedV4i32SSE2 end
+
+
+; ********************************************************************************
+; CPU dispatching for dividefixedV4i32. This is executed only once
+; ********************************************************************************
+
+dividefixedV4i32CPUDispatch:
+ ; get supported instruction set
+ push par1
+ push par2
+ call InstructionSet
+ pop par2
+ pop par1
+ ; Point to generic version
+ lea r8, [dividefixedV4i32SSE2]
+ cmp eax, 8 ; check if PMULDQ supported
+ jb Q100
+ ; SSE4.1 supported
+ ; Point to SSE4.1 version of dividefixedV4i32
+ lea r8, [dividefixedV4i32SSE41]
+Q100: mov [dividefixedV4i32Dispatch], r8
+ ; Continue in appropriate version
+ jmp r8
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+dividefixedV4i32Dispatch Dq dividefixedV4i32CPUDispatch
+
+section .text
+
+
+;******************************************************************************
+; 32 bit unsigned integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor4ui(uint32_t d);
+; vector of 4 x 32 bit unsigned integers
+
+align 16
+global setdivisor4ui: function
+setdivisor4ui:
+ push rbx
+ mov ebx, par1d ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ inc ecx ; L = ceil(log2(d))
+ mov edx, 1
+ shl rdx, cl ; 2^L [64 bit shift to allow overflow]
+ sub edx, ebx
+ xor eax, eax
+ div ebx
+ inc eax
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 ; broadcast into 4 dwords
+ sub ecx, 1
+ setae dl
+ movzx edx, dl ; shift1
+ seta al
+ neg al
+ and al,cl
+ movzx eax, al
+ movd xmm1, edx ; shift 1
+ movd xmm2, eax ; shift 2
+ punpckldq xmm1, xmm2 ; combine into two dwords
+ punpcklqdq xmm0, xmm1 ; multipliers, shift1, shift2
+ pop rbx
+ ret
+; setdivisor4ui end
+
+;extern "C" void setdivisorV4u32(__m128i buf[2], uint32_t d);
+; 4 x 32 bit unsigned
+
+global setdivisorV4u32: function
+setdivisorV4u32:
+ push par1 ; buf
+ mov par1d, par2d ; d
+ call setdivisor4ui
+ pop rax ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [rax], xmm0 ; multiplier
+ movdqa [rax+16], xmm1 ; shift counts are still in xmm1
+ ret
+; setdivisorV4u32 end
+
+;extern "C" __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
+global dividefixedV4u32: function
+
+align 16
+dividefixedV4u32:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF WINDOWS
+ movdqa xmm0,[par2] ; x
+%ENDIF
+ movdqa xmm1, xmm0 ; x
+ movdqa xmm2, xmm0 ; x
+ movdqa xmm3, [par1] ; multiplier
+ pmuludq xmm0, xmm3 ; 32 x 32 -> 64 bit unsigned multiplication of x[0] and x[2]
+ psrlq xmm0, 32 ; high dword of result 0 and 2
+ psrlq xmm1, 32 ; get x[1] and x[3] into position for multiplication
+ pmuludq xmm1, xmm3 ; 32 x 32 -> 64 bit unsigned multiplication of x[1] and x[3]
+ pcmpeqd xmm3, xmm3
+ psllq xmm3, 32 ; generate mask of dword 1 and 3
+ pand xmm1, xmm3 ; high dword of result 1 and 3
+ por xmm0, xmm1 ; combine all four results into one vector
+ psubd xmm2, xmm0
+ movd xmm3, [par1+16] ; shift1
+ psrld xmm2, xmm3
+ paddd xmm0, xmm2
+ movd xmm3, [par1+20] ; shift2
+ psrld xmm0, xmm3
+ ret
+;dividefixedV4u32 end
diff --git a/contrib/libs/asmlib/dummy.c b/contrib/libs/asmlib/dummy.c
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/contrib/libs/asmlib/dummy.c
diff --git a/contrib/libs/asmlib/instrset64.asm b/contrib/libs/asmlib/instrset64.asm
new file mode 100644
index 0000000000..c8cdd34a19
--- /dev/null
+++ b/contrib/libs/asmlib/instrset64.asm
@@ -0,0 +1,184 @@
+%include "defs.asm"
+
+;************************* instrset64.asm **********************************
+; Author: Agner Fog
+; Date created: 2003-12-12
+; Last modified: 2013-09-11
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+;
+; Description:
+; This function returns an integer indicating which instruction set is
+; supported by the microprocessor and operating system. A program can
+; call this function to determine if a particular set of instructions can
+; be used.
+;
+; The method used here for detecting whether XMM instructions are enabled by
+; the operating system is different from the method recommended by Intel.
+; The method used here has the advantage that it is independent of the
+; ability of the operating system to catch invalid opcode exceptions. The
+; method used here has been thoroughly tested on many different versions of
+; Intel and AMD microprocessors, and is believed to work reliably. For further
+; discussion of this method, see my manual "Optimizing subroutines in assembly
+; language" (www.agner.org/optimize/).
+;
+; Copyright (c) 2003-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; ********** InstructionSet function **********
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+;
+; return value:
+; 0 = 80386 instruction set only
+; 1 or above = MMX instructions supported
+; 2 or above = conditional move and FCOMI supported
+; 3 or above = SSE (XMM) supported by processor and operating system
+; 4 or above = SSE2 supported
+; 5 or above = SSE3 supported
+; 6 or above = Supplementary SSE3
+; 8 or above = SSE4.1 supported
+; 9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+; 13 or above = AVX2 supported
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+; 15 or above = HLE + RTM supported
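+;
+; Example use of the return value (illustration added here, not part of the
+; original asmlib sources):
+;
+;   int iset = InstructionSet();
+;   if (iset >= 11)      { /* AVX available and enabled by the OS */ }
+;   else if (iset >= 8)  { /* SSE4.1 path */ }
+;   else                 { /* generic SSE2 path; iset >= 4 always in 64-bit mode */ }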
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global InstructionSet: function
+global IInstrSet
+
+
+SECTION .data
+align 16
+
+IInstrSet@: ; local name to avoid problems in shared objects
+IInstrSet: dd -1 ; this global variable is valid after first call
+
+
+SECTION .text align=16
+
+; ********** InstructionSet function **********
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+
+; return value:
+; 4 or above = SSE2 supported
+; 5 or above = SSE3 supported
+; 6 or above = Supplementary SSE3 supported
+; 8 or above = SSE4.1 supported
+; 9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+
+
+InstructionSet:
+ ; Check if this function has been called before
+ mov eax, [IInstrSet@]
+ test eax, eax
+ js FirstTime ; Negative means first time
+ ; Early return. Has been called before
+ ret ; Return value is in eax
+
+FirstTime:
+ push rbx
+
+ mov eax, 1
+ cpuid ; get features into edx and ecx
+
+ mov eax, 4 ; at least SSE2 supported in 64 bit mode
+ test ecx, 1 ; SSE3 support by microprocessor
+ jz ISEND
+ inc eax ; 5
+
+ bt ecx, 9 ; Suppl-SSE3 support by microprocessor
+ jnc ISEND
+ inc eax ; 6
+
+ bt ecx, 19 ; SSE4.1 support by microprocessor
+ jnc ISEND
+ mov al, 8 ; 8
+
+ bt ecx, 23 ; POPCNT support by microprocessor
+ jnc ISEND
+ inc eax ; 9
+
+ bt ecx, 20 ; SSE4.2 support by microprocessor
+ jnc ISEND
+ inc eax ; 10
+
+ ; check OS support for YMM registers (AVX)
+ bt ecx, 27 ; OSXSAVE: XGETBV supported
+ jnc ISEND
+ push rax
+ push rcx
+ push rdx
+ xor ecx, ecx
+ db 0FH, 01H, 0D0H ; XGETBV
+ and eax, 6
+ cmp eax, 6 ; AVX support by OS
+ pop rdx
+ pop rcx
+ pop rax
+ jne ISEND
+
+ bt ecx, 28 ; AVX support by microprocessor
+ jnc ISEND
+ inc eax ; 11
+
+ bt ecx, 1 ; PCLMUL support
+ jnc ISEND
+ bt ecx, 25 ; AES support
+ jnc ISEND
+ inc eax ; 12
+
+ push rax
+ push rcx
+ mov eax, 7
+ xor ecx, ecx
+ cpuid ; check for AVX2
+ bt ebx, 5
+ pop rcx
+ pop rax
+ jnc ISEND
+ inc eax ; 13
+
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+ bt ecx, 12 ; FMA3
+ jnc ISEND
+ bt ecx, 29 ; F16C
+ jnc ISEND
+ bt ebx, 3 ; BMI1
+ jnc ISEND
+ bt ebx, 8 ; BMI2
+ jnc ISEND
+
+ push rax
+ push rbx
+ push rcx
+ mov eax, 80000001H
+ cpuid
+ bt ecx, 5 ; LZCNT
+ pop rcx
+ pop rbx
+ pop rax
+ jnc ISEND
+
+ inc eax ; 14
+
+ISEND: mov [IInstrSet@], eax ; save value in global variable
+
+ pop rbx
+ ret ; return value is in eax
+
+;InstructionSet ENDP
diff --git a/contrib/libs/asmlib/memcmp64.asm b/contrib/libs/asmlib/memcmp64.asm
new file mode 100644
index 0000000000..b8a8ab5fbc
--- /dev/null
+++ b/contrib/libs/asmlib/memcmp64.asm
@@ -0,0 +1,295 @@
+%include "defs.asm"
+
+;************************* memcmp64.asm *************************************
+; Author: Agner Fog
+; Date created: 2013-10-03
+; Last modified: 2013-10-03
+; Description:
+; Faster version of the standard memcmp function:
+;
+; int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+;
+; Compares two memory blocks of size count.
+; The return value is zero if the two memory blocks ptr1 and ptr2 are equal
+; The return value is positive if the first differing byte of ptr1 is bigger
+; than ptr2 when compared as unsigned bytes.
+; The return value is negative if the first differing byte of ptr1 is smaller
+; than ptr2 when compared as unsigned bytes.
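+;
+; Example (illustration added here, not part of the original asmlib sources):
+;
+;   const unsigned char a[] = { 1, 2, 3, 0x80 };
+;   const unsigned char b[] = { 1, 2, 3, 0x7F };
+;   int r = A_memcmp(a, b, sizeof a);   // r > 0: first differing byte 0x80 > 0x7F unsigned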
+;
+; Overriding standard function memcmp:
+; The alias ?OVR_memcmp is changed to _memcmp in the object file if
+; it is desired to override the standard library function memcmp.
+;
+; Optimization:
+; Uses XMM registers if SSE2 is available, uses YMM registers if AVX2.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global A_memcmp: function ; Function memcmp
+global EXP(memcmp): function ; ?OVR_ removed if standard function memcmp overridden
+; Direct entries to CPU-specific versions
+global memcmpSSE2: function ; SSE2 version
+global memcmpAVX2: function ; AVX2 version
+
+; Imported from instrset64.asm
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+default rel
+
+; define registers used for parameters
+%IFDEF WINDOWS
+%define par1 rcx ; function parameter 1
+%define par2 rdx ; function parameter 2
+%define par3 r8 ; function parameter 3
+%define par4 r9 ; scratch register
+%define par4d r9d ; scratch register
+%ENDIF
+%IFDEF UNIX
+%define par1 rdi ; function parameter 1
+%define par2 rsi ; function parameter 2
+%define par3 rdx ; function parameter 3
+%define par4 rcx ; scratch register
+%define par4d ecx ; scratch register
+%ENDIF
+
+
+
+SECTION .text align=16
+
+; extern "C" int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+; Function entry:
+A_memcmp:
+EXP(memcmp):
+ jmp qword [memcmpDispatch] ; Go to appropriate version, depending on instruction set
+
+
+align 16
+memcmpAVX2: ; AVX2 version. Use ymm register
+memcmpAVX2@: ; internal reference
+
+ add par1, par3 ; use negative index from end of memory block
+ add par2, par3
+ neg par3
+ jz A900
+ mov par4d, 0FFFFH
+ cmp par3, -32
+ ja A100
+
+A000: ; loop comparing 32 bytes
+ vmovdqu ymm1, [par1+par3]
+ vpcmpeqb ymm0, ymm1, [par2+par3] ; compare 32 bytes
+ vpmovmskb eax, ymm0 ; get byte mask
+ xor eax, -1 ; not eax would not set flags
+ jnz A700 ; difference found
+ add par3, 32
+ jz A900 ; finished, equal
+ cmp par3, -32
+ jna A000 ; next 32 bytes
+ vzeroupper ; end ymm state
+
+A100: ; less than 32 bytes left
+ cmp par3, -16
+ ja A200
+ movdqu xmm1, [par1+par3]
+ movdqu xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 16 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; invert lower 16 bits
+ jnz A701 ; difference found
+ add par3, 16
+ jz A901 ; finished, equal
+
+A200: ; less than 16 bytes left
+ cmp par3, -8
+ ja A300
+ ; compare 8 bytes
+ movq xmm1, [par1+par3]
+ movq xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 8 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d
+ jnz A701 ; difference found
+ add par3, 8
+ jz A901
+
+A300: ; less than 8 bytes left
+ cmp par3, -4
+ ja A400
+ ; compare 4 bytes
+ movd xmm1, [par1+par3]
+ movd xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 4 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; not ax
+ jnz A701 ; difference found
+ add par3, 4
+ jz A901
+
+A400: ; less than 4 bytes left
+ cmp par3, -2
+ ja A500
+ movzx eax, word [par1+par3]
+ movzx par4d, word [par2+par3]
+ sub eax, par4d
+ jnz A800 ; difference in byte 0 or 1
+ add par3, 2
+ jz A901
+
+A500: ; less than 2 bytes left
+ test par3, par3
+ jz A901 ; no bytes left
+
+A600: ; one byte left
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+A700: ; difference found. find position
+ vzeroupper
+A701:
+ bsf eax, eax
+ add par3, rax
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+A800: ; difference in byte 0 or 1
+ neg al
+ sbb par3, -1 ; add 1 to par3 if al == 0
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+A900: ; equal
+ vzeroupper
+A901: xor eax, eax
+ ret
+
+
+memcmpSSE2: ; SSE2 version. Use xmm register
+memcmpSSE2@: ; internal reference
+
+ add par1, par3 ; use negative index from end of memory block
+ add par2, par3
+ neg par3
+ jz S900
+ mov par4d, 0FFFFH
+ cmp par3, -16
+ ja S200
+
+S100: ; loop comparing 16 bytes
+ movdqu xmm1, [par1+par3]
+ movdqu xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 16 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; not ax
+ jnz S700 ; difference found
+ add par3, 16
+ jz S900 ; finished, equal
+ cmp par3, -16
+ jna S100 ; next 16 bytes
+
+S200: ; less than 16 bytes left
+ cmp par3, -8
+ ja S300
+ ; compare 8 bytes
+ movq xmm1, [par1+par3]
+ movq xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 8 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; not ax
+ jnz S700 ; difference found
+ add par3, 8
+ jz S900
+
+S300: ; less than 8 bytes left
+ cmp par3, -4
+ ja S400
+ ; compare 4 bytes
+ movd xmm1, [par1+par3]
+ movd xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 4 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; not ax
+ jnz S700 ; difference found
+ add par3, 4
+ jz S900
+
+S400: ; less than 4 bytes left
+ cmp par3, -2
+ ja S500
+ movzx eax, word [par1+par3]
+ movzx par4d, word [par2+par3]
+ sub eax, par4d
+ jnz S800 ; difference in byte 0 or 1
+ add par3, 2
+ jz S900
+
+S500: ; less than 2 bytes left
+ test par3, par3
+ jz S900 ; no bytes left
+
+ ; one byte left
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+S700: ; difference found. find position
+ bsf eax, eax
+ add par3, rax
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+S800: ; difference in byte 0 or 1
+ neg al
+ sbb par3, -1 ; add 1 to par3 if al == 0
+S820: movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+S900: ; equal
+ xor eax, eax
+ ret
+
+
+; CPU dispatching for memcmp. This is executed only once
+memcmpCPUDispatch:
+ push par1
+ push par2
+ push par3
+ call InstructionSet ; get supported instruction set
+ ; SSE2 always supported
+ lea par4, [memcmpSSE2@]
+ cmp eax, 13 ; check AVX2
+ jb Q100
+ ; AVX2 supported
+ lea par4, [memcmpAVX2@]
+Q100: ; save pointer
+ mov qword [memcmpDispatch], par4
+; Continue in appropriate version of memcmp
+ pop par3
+ pop par2
+ pop par1
+ jmp par4
+
+
+SECTION .data
+align 16
+
+
+; Pointer to appropriate version.
+; This initially points to memcmpCPUDispatch. memcmpCPUDispatch will
+; change this to the appropriate version of memcmp, so that
+; memcmpCPUDispatch is only executed once:
+memcmpDispatch DQ memcmpCPUDispatch
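+
+; In C terms the dispatch-once pattern is roughly (illustration added here, not
+; part of the original asmlib sources; memcmpSSE2, memcmpAVX2 and InstructionSet
+; are the functions defined in this library):
+;
+;   typedef int (*memcmp_fn)(const void *, const void *, size_t);
+;   static int first_call(const void *a, const void *b, size_t n);
+;   static memcmp_fn dispatch = first_call;            // like memcmpDispatch above
+;
+;   static int first_call(const void *a, const void *b, size_t n) {
+;       dispatch = (InstructionSet() >= 13) ? memcmpAVX2 : memcmpSSE2;
+;       return dispatch(a, b, n);                      // run the version chosen once
+;   }
+;   int A_memcmp(const void *a, const void *b, size_t n) { return dispatch(a, b, n); }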
+
diff --git a/contrib/libs/asmlib/memcpy64.asm b/contrib/libs/asmlib/memcpy64.asm
new file mode 100644
index 0000000000..d590990b99
--- /dev/null
+++ b/contrib/libs/asmlib/memcpy64.asm
@@ -0,0 +1,1332 @@
+%include "defs.asm"
+
+;************************* memcpy64.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2016-11-12 (patched version with AVX512 support removed)
+;
+; Description:
+; Faster version of the standard memcpy function:
+; void * A_memcpy(void *dest, const void *src, size_t count);
+; Copies 'count' bytes from 'src' to 'dest'
+;
+; Overriding standard function memcpy:
+; The alias ?OVR_memcpy is changed to _memcpy in the object file if
+; it is desired to override the standard library function memcpy.
+;
+; The function uses non-temporal writes to bypass the cache when the size is
+; bigger than half the size of the largest-level cache. This limit can be
+; read with GetMemcpyCacheLimit and changed with SetMemcpyCacheLimit
+; C++ prototypes:
+; extern "C" size_t GetMemcpyCacheLimit(); // in memcpy64.asm
+; extern "C" void SetMemcpyCacheLimit(); // in memmove64.asm
+; extern "C" void SetMemcpyCacheLimit1(); // used internally
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching is included for the SSE2, Suppl-SSE3 and AVX instruction sets.
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_memcpy: function ; Function A_memcpy
+global EXP(memcpy): function ; ?OVR removed if standard function memcpy overridden
+global memcpySSE2: function ; Version for processors with only SSE2
+global memcpySSSE3: function ; Version for processors with SSSE3
+global memcpyU: function ; Version for processors with fast unaligned read
+global memcpyU256: function ; Version for processors with fast 256-bit read/write
+
+global GetMemcpyCacheLimit: function ; Get the size limit for bypassing cache when copying with memcpy and memmove
+global SetMemcpyCacheLimit1: function ; Set the size limit for bypassing cache when copying with memcpy
+
+
+; Imported from instrset64.asm
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster64.asm:
+extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
+extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Imported from cachesize32.asm:
+extern DataCacheSize ; Gets size of data cache
+
+
+; Define prolog for this function
+%MACRO PROLOGM 0
+%IFDEF WINDOWS
+ push rsi
+ push rdi
+ mov rdi, rcx ; dest
+ mov r9, rcx ; dest
+ mov rsi, rdx ; src
+ mov rcx, r8 ; count
+%ELSE ; Unix
+ mov rcx, rdx ; count
+ mov r9, rdi ; dest
+%ENDIF
+%ENDM
+
+; Define return from this function
+%MACRO RETURNM 0
+%IFDEF WINDOWS
+ pop rdi
+ pop rsi
+%ENDIF
+ mov rax, r9 ; Return value = dest
+ ret
+%ENDM
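+
+; After PROLOGM, both calling conventions are normalized to:
+;   rdi = dest, rsi = src, rcx = count, r9 = original dest (RETURNM returns it in rax)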
+
+
+SECTION .text align=16
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
+; Function entry:
+A_memcpy:
+EXP(memcpy):
+ jmp qword [memcpyDispatch] ; Go to appropriate version, depending on instruction set
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memcpyU256: ; global label
+memcpyU256@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 1FH
+ jz B3100 ; Skip if dest aligned by 32
+
+ ; edx = size of first partial block, 1 - 31 bytes
+ test dl, 3
+ jz B3030
+ test dl, 1
+ jz B3020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B3020: test dl, 2
+ jz B3030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B3030: test dl, 4
+ jz B3040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B3040: test dl, 8
+ jz B3050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B3050: test dl, 16
+ jz B3060
+ ; move 16 bytes
+ movups xmm0, [rsi]
+ movaps [rdi], xmm0
+ add rsi, 16
+ add rdi, 16
+B3060: sub rcx, rdx
+
+B3100: ; Now dest is aligned by 32. Any partial block has been moved
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov rdx, rcx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub rdx, rcx ; Remaining data after loop
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja I3100 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+H3100: ; copy -rcx bytes in blocks of 32 bytes.
+
+ ; Check for false memory dependence: The CPU may falsely assume
+ ; a partial overlap between the written destination and the following
+ ; read source if source is unaligned and
+ ; (src-dest) modulo 4096 is close to 4096
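+ ; (in C terms: if (((src - dest) & 0xFFF) > 0x1000 - 0x200) go to J3100,
+ ;  which copies backwards unless src and dest actually overlap)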
+ test sil, 1FH
+ jz H3110 ; aligned
+ mov eax, esi
+ sub eax, edi
+ and eax, 0FFFH ; modulo 4096
+ cmp eax, 1000H - 200H
+ ja J3100
+
+align 16
+H3110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ vmovups ymm0, [rsi+rcx]
+ vmovaps [rdi+rcx], ymm0
+ add rcx, 20H
+ jnz H3110
+ sfence
+ vzeroupper ; end of AVX mode
+
+H3120: ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz H3500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg H3200
+ ; move 16 bytes
+ movups xmm0, [rsi+rdx]
+ movaps [rdi+rdx], xmm0
+ add rdx, 10H
+H3200: cmp edx, -8
+ jg H3210
+ ; move 8 bytes
+ movq xmm0, qword [rsi+rdx]
+ movq qword [rdi+rdx], xmm0
+ add rdx, 8
+ jz H500 ; Early skip if count divisible by 8
+H3210: cmp edx, -4
+ jg H3220
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+H3220: cmp edx, -2
+ jg H3230
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+H3230: cmp edx, -1
+ jg H3500
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+H3500: ; finished
+ RETURNM
+
+I3100: ; non-temporal move
+ neg rcx ; Negative index from the end
+
+align 16
+I3110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ vmovups ymm0, [rsi+rcx]
+ vmovntps [rdi+rcx], ymm0
+ add rcx, 20H
+ jnz I3110
+ sfence
+ vzeroupper ; end of AVX mode
+ jmp H3120 ; Move the remaining edx bytes (0 - 31)
+
+
+align 16
+J3100: ; There is a false memory dependence.
+ ; check if src and dest overlap, if not then it is safe
+ ; to copy backwards to avoid false memory dependence
+%if 1
+ ; Use this version if you want consistent behavior in the case
+ ; where dest > src and overlap. However, this case is undefined
+ ; anyway because part of src is overwritten before copying
+ push rdx
+ mov rax, rsi
+ sub rax, rdi
+ cqo
+ xor rax, rdx
+ sub rax, rdx ; abs(src-dest)
+ neg rcx ; size
+ pop rdx ; restore rdx
+ cmp rax, rcx
+ jnb J3110
+ neg rcx ; restore rcx
+ jmp H3110 ; overlap between src and dest. Can't copy backwards
+%else
+ ; save time by not checking the case that is undefined anyway
+ mov rax, rsi
+ sub rax, rdi
+ neg rcx ; size
+ cmp rax, rcx
+ jnb J3110 ; OK to copy backwards
+ ; must copy forwards
+ neg rcx ; restore ecx
+ jmp H3110 ; copy forwards
+
+%endif
+
+J3110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
+ push rsi
+ push rdi
+ sub rsi, rcx
+ sub rdi, rcx
+J3120: ; loop backwards
+ vmovups ymm0, [rsi+rcx-20H]
+ vmovaps [rdi+rcx-20H], ymm0
+ sub rcx, 20H
+ jnz J3120
+ sfence
+ vzeroupper
+ pop rdi
+ pop rsi
+ jmp H3120
+
+align 16
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ ; multiple CPU versions (SSSE3 and above)
+A1000: add rsi, rcx ; end of src
+ add rdi, rcx ; end of dest
+ neg rcx ; negative index from the end
+ cmp ecx, -20H
+ jg A1100
+ ; move 32 bytes
+ ; movdqu is faster than 64-bit moves on processors with SSSE3
+ movups xmm0, [rsi+rcx]
+ movups xmm1, [rsi+rcx+10H]
+ movups [rdi+rcx], xmm0
+ movups [rdi+rcx+10H], xmm1
+ add rcx, 20H
+A1100: cmp ecx, -10H
+ jg A1200
+ ; move 16 bytes
+ movups xmm0, [rsi+rcx]
+ movups [rdi+rcx], xmm0
+ add rcx, 10H
+A1200: cmp ecx, -8
+ jg A1300
+ ; move 8 bytes
+ mov rax, qword [rsi+rcx]
+ mov qword [rdi+rcx], rax
+ add rcx, 8
+A1300: cmp ecx, -4
+ jg A1400
+ ; move 4 bytes
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ add rcx, 4
+ jz A1900 ; early out if count divisible by 4
+A1400: cmp ecx, -2
+ jg A1500
+ ; move 2 bytes
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+ add rcx, 2
+A1500: cmp ecx, -1
+ jg A1900
+ ; move 1 byte
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+A1900: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with fast unaligned read and fast 16 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memcpyU: ; global label
+memcpyU@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B2100 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ test dl, 3
+ jz B2030
+ test dl, 1
+ jz B2020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B2020: test dl, 2
+ jz B2030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B2030: test dl, 4
+ jz B2040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B2040: test dl, 8
+ jz B2050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B2050: sub rcx, rdx
+B2100: ; Now dest is aligned by 16. Any partial block has been moved
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov rdx, rcx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub rdx, rcx ; Remaining data after loop
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja I100 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+H100: ; copy -rcx bytes in blocks of 32 bytes.
+
+ ; Check for false memory dependence: The CPU may falsely assume
+ ; a partial overlap between the written destination and the following
+ ; read source if source is unaligned and
+ ; (src-dest) modulo 4096 is close to 4096
+ test sil, 0FH
+ jz H110 ; aligned
+ mov eax, esi
+ sub eax, edi
+ and eax, 0FFFH ; modulo 4096
+ cmp eax, 1000H - 200H
+ ja J100
+
+H110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ movups xmm0, [rsi+rcx]
+ movups xmm1, [rsi+rcx+10H]
+ movaps [rdi+rcx], xmm0
+ movaps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz H110
+
+H120: ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz H500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg H200
+ ; move 16 bytes
+ movups xmm0, [rsi+rdx]
+ movaps [rdi+rdx], xmm0
+ add rdx, 10H
+H200: cmp edx, -8
+ jg H210
+ ; move 8 bytes
+ movq xmm0, qword [rsi+rdx]
+ movq qword [rdi+rdx], xmm0
+ add rdx, 8
+ jz H500 ; Early skip if count divisible by 8
+H210: cmp edx, -4
+ jg H220
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+H220: cmp edx, -2
+ jg H230
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+H230: cmp edx, -1
+ jg H500
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+H500: ; finished
+ RETURNM
+
+I100: ; non-temporal move
+ neg rcx ; Negative index from the end
+
+align 16
+I110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ movups xmm0, [rsi+rcx]
+ movups xmm1, [rsi+rcx+10H]
+ movntps [rdi+rcx], xmm0
+ movntps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz I110
+ sfence
+ jmp H120 ; Move the remaining edx bytes (0 - 31):
+
+
+align 16
+J100: ; There is a false memory dependence.
+ ; check if src and dest overlap, if not then it is safe
+ ; to copy backwards to avoid false memory dependence
+%if 1
+ ; Use this version if you want consistent behavior in the case
+ ; where dest > src and overlap. However, this case is undefined
+ ; anyway because part of src is overwritten before copying
+ push rdx
+ mov rax, rsi
+ sub rax, rdi
+ cqo
+ xor rax, rdx
+ sub rax, rdx ; abs(src-dest)
+ neg rcx ; size
+ pop rdx ; restore rdx
+ cmp rax, rcx
+ jnb J110
+ neg rcx ; restore rcx
+ jmp H110 ; overlap between src and dest. Can't copy backwards
+%else
+ ; save time by not checking the case that is undefined anyway
+ mov rax, rsi
+ sub rax, rdi
+ neg rcx ; size
+ cmp rax, rcx
+ jnb J110 ; OK to copy backwards
+ ; must copy forwards
+ neg rcx ; restore ecx
+ jmp H110 ; copy forwards
+
+%endif
+
+J110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
+ push rsi
+ push rdi
+ sub rsi, rcx
+ sub rdi, rcx
+J120: ; loop backwards
+ movups xmm1, [rsi+rcx-20H]
+ movups xmm0, [rsi+rcx-10H]
+ movaps [rdi+rcx-20H], xmm1
+ movaps [rdi+rcx-10H], xmm0
+ sub rcx, 20H
+ jnz J120
+ pop rdi
+ pop rsi
+ jmp H120
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memcpySSSE3: ; global label
+memcpySSSE3@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B1200 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ test dl, 3
+ jz B1030
+ test dl, 1
+ jz B1020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B1020: test dl, 2
+ jz B1030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B1030: test dl, 4
+ jz B1040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B1040: test dl, 8
+ jz B1050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B1050: sub rcx, rdx
+B1200: ; Now dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of src modulo 16 at this point:
+ mov eax, esi
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count (lower 32 bits)
+ and rcx, -20H ; Round down count to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub edx, ecx ; Remaining data after loop (0-31)
+ sub rsi, rax ; Nearest preceding aligned block of src
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B1400 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchSSSE3]
+ jmp near [r8+rax*8]
+
+B1400: neg rcx
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchNT]
+ jmp near [r8+rax*8]
+
+
+align 16
+C100: ; Code for aligned src. SSE2 and SSSE3 versions
+ ; The nice case, src and dest have same alignment.
+
+ ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm0, [rsi+rcx]
+ movaps xmm1, [rsi+rcx+10H]
+ movaps [rdi+rcx], xmm0
+ movaps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz C100
+
+ ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz C500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg C200
+ ; move 16 bytes
+ movaps xmm0, [rsi+rdx]
+ movaps [rdi+rdx], xmm0
+ add rdx, 10H
+C200: cmp edx, -8
+ jg C210
+ ; move 8 bytes
+ mov rax, [rsi+rdx]
+ mov [rdi+rdx], rax
+ add rdx, 8
+ jz C500 ; Early skip if count divisible by 8
+C210: cmp edx, -4
+ jg C220
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+C220: cmp edx, -2
+ jg C230
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+C230: cmp edx, -1
+ jg C500
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+C500: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+memcpySSE2: ; global label
+memcpySSE2@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jae B0100 ; Use simpler code if count < 64
+
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ add rsi, rcx ; end of src
+ add rdi, rcx ; end of dest
+ neg rcx ; negative index from the end
+ cmp ecx, -20H
+ jg A100
+ ; move 32 bytes
+ ; mov r64 is faster than movdqu on Intel Pentium M and Core 1
+ ; movdqu is fast on Nehalem and later
+ mov rax, [rsi+rcx]
+ mov rdx, [rsi+rcx+8]
+ mov [rdi+rcx], rax
+ mov [rdi+rcx+8], rdx
+ mov rax, qword [rsi+rcx+10H]
+ mov rdx, qword [rsi+rcx+18H]
+ mov qword [rdi+rcx+10H], rax
+ mov qword [rdi+rcx+18H], rdx
+ add rcx, 20H
+A100: cmp ecx, -10H
+ jg A200
+ ; move 16 bytes
+ mov rax, [rsi+rcx]
+ mov rdx, [rsi+rcx+8]
+ mov [rdi+rcx], rax
+ mov [rdi+rcx+8], rdx
+ add rcx, 10H
+A200: cmp ecx, -8
+ jg A300
+ ; move 8 bytes
+ mov rax, qword [rsi+rcx]
+ mov qword [rdi+rcx], rax
+ add rcx, 8
+A300: cmp ecx, -4
+ jg A400
+ ; move 4 bytes
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ add rcx, 4
+ jz A900 ; early out if count divisible by 4
+A400: cmp ecx, -2
+ jg A500
+ ; move 2 bytes
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+ add rcx, 2
+A500: cmp ecx, -1
+ jg A900
+ ; move 1 byte
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+A900: ; finished
+ RETURNM
+
+B0100: ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B0200 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ test dl, 3
+ jz B0030
+ test dl, 1
+ jz B0020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B0020: test dl, 2
+ jz B0030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B0030: test dl, 4
+ jz B0040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B0040: test dl, 8
+ jz B0050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B0050: sub rcx, rdx
+B0200: ; Now dest is aligned by 16. Any partial block has been moved
+
+ ; This part will not always work if count < 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B300 ; Skip if dest aligned by 16
+
+ ; rdx = size of first partial block, 1 - 15 bytes
+ add rsi, rdx
+ add rdi, rdx
+ sub rcx, rdx
+ neg rdx
+ cmp edx, -8
+ jg B200
+ ; move 8 bytes
+ mov rax, [rsi+rdx]
+ mov [rdi+rdx], rax
+ add rdx, 8
+B200: cmp edx, -4
+ jg B210
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+ jz B300 ; early out if aligned by 4
+B210: cmp edx, -2
+ jg B220
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+B220: cmp edx, -1
+ jg B300
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+
+B300: ; Now dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of src modulo 16 at this point:
+ mov eax, esi
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count (lower 32 bits)
+ and rcx, -20H ; Round down count to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub edx, ecx ; Remaining data after loop (0-31)
+ sub rsi, rax ; Nearest preceding aligned block of src
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B400 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchSSE2]
+ jmp near [r8+rax*8]
+
+B400: neg rcx
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchNT]
+ jmp near [r8+rax*8]
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
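+;
+; In C terms one iteration of the macro below does roughly this, for constant
+; misalignment u = 1..15 (little-endian 128-bit view; illustration added here,
+; not part of the original asmlib sources):
+;
+;   // srcal = src rounded down to a 16-byte boundary, u = src - srcal
+;   __uint128_t lo  = *(const __uint128_t *)(srcal + i);        // movdqa, aligned read
+;   __uint128_t hi  = *(const __uint128_t *)(srcal + i + 16);
+;   __uint128_t out = (lo >> (8*u)) | (hi << (8*(16 - u)));     // psrldq / pslldq / por
+;   *(__uint128_t *)(dest + i) = out;                           // aligned write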
+
+%MACRO MOVE_UNALIGNED_SSE2 2 ; u, nt
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
+ movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movdqa xmm2, [rsi+rcx+20H]
+ movdqa xmm3, xmm1 ; Copy because used twice
+ psrldq xmm0, %1 ; shift right
+ pslldq xmm1, 16-%1 ; shift left
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx], xmm0 ; non-temporal save
+ %ENDIF
+ movdqa xmm0, xmm2 ; Save for next iteration
+ psrldq xmm3, %1 ; shift right
+ pslldq xmm2, 16-%1 ; shift left
+ por xmm3, xmm2 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx+10H], xmm3 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx+10H], xmm3 ; non-temporal save
+ %ENDIF
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movdqa xmm1, [rsi+rdx+10H]
+ psrldq xmm0, %1 ; shift right
+ pslldq xmm1, 16-%1 ; shift left
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rdx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rdx], xmm0 ; non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %2 == 1
+ sfence
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_UNALIGNED_SSE2_4 1 ; nt
+; Special case for u = 4
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movss xmm0, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
+ shufps xmm0, xmm0, 00111001B ; Rotate
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx+20H]
+ movss xmm1, xmm0
+ shufps xmm1, xmm1, 00111001B
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
+ %ENDIF
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
+ movss xmm0, xmm1
+ shufps xmm0, xmm0, 00111001B
+ %IF %1 == 0
+ movaps [rdi+rdx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rdx], xmm0 ; Non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_UNALIGNED_SSE2_8 1 ; nt
+; Special case for u = 8
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movsd xmm0, xmm1 ; Moves 8 bytes, leaves remaining bytes unchanged
+ shufps xmm0, xmm0, 01001110B ; Rotate
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx+20H]
+ movsd xmm1, xmm0
+ shufps xmm1, xmm1, 01001110B
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
+ %ENDIF
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
+ movsd xmm0, xmm1
+ shufps xmm0, xmm0, 01001110B
+ %IF %1 == 0
+ movaps [rdi+rdx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rdx], xmm0 ; Non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_UNALIGNED_SSE2_12 1 ; nt
+; Special case for u = 12
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+ shufps xmm0, xmm0, 10010011B
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movaps xmm2, [rsi+rcx+20H]
+ shufps xmm1, xmm1, 10010011B
+ shufps xmm2, xmm2, 10010011B
+ movaps xmm3, xmm2
+ movss xmm2, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
+ movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ movaps [rdi+rcx+10H], xmm2 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ movntps [rdi+rcx+10H], xmm2 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, xmm3 ; Save for next iteration
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
+ shufps xmm1, xmm1, 10010011B
+ movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
+ %IF %1 == 0
+ movaps [rdi+rdx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rdx], xmm1 ; Non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO MOVE_UNALIGNED_SSSE3 1 ; u
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
+ movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movdqa xmm2, [rsi+rcx+10H] ; Read next two blocks
+ movdqa xmm3, [rsi+rcx+20H]
+ movdqa xmm1, xmm0 ; Save xmm0
+ movdqa xmm0, xmm3 ; Save for next iteration
+ palignr xmm3, xmm2, %1 ; Combine parts into aligned block
+ palignr xmm2, xmm1, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx], xmm2 ; Save aligned
+ movdqa [rdi+rcx+10H], xmm3 ; Save aligned
+ add rcx, 20H
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movdqa xmm2, [rsi+rdx+10H]
+ palignr xmm2, xmm0, %1
+ movdqa [rdi+rdx], xmm2
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes
+ jmp C200
+%ENDMACRO
+
+
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSE2 below
+; (alignments and fillers are inserted manually to minimize the number
+; of 16-bytes boundaries inside loops)
+
+align 16
+D104: MOVE_UNALIGNED_SSE2_4 0
+times 4 nop
+D108: MOVE_UNALIGNED_SSE2_8 0
+times 4 nop
+D10C: MOVE_UNALIGNED_SSE2_12 0
+times 1 nop
+D101: MOVE_UNALIGNED_SSE2 1, 0
+D102: MOVE_UNALIGNED_SSE2 2, 0
+D103: MOVE_UNALIGNED_SSE2 3, 0
+D105: MOVE_UNALIGNED_SSE2 5, 0
+D106: MOVE_UNALIGNED_SSE2 6, 0
+D107: MOVE_UNALIGNED_SSE2 7, 0
+D109: MOVE_UNALIGNED_SSE2 9, 0
+times 1 nop
+D10A: MOVE_UNALIGNED_SSE2 0AH, 0
+D10B: MOVE_UNALIGNED_SSE2 0BH, 0
+D10D: MOVE_UNALIGNED_SSE2 0DH, 0
+D10E: MOVE_UNALIGNED_SSE2 0EH, 0
+D10F: MOVE_UNALIGNED_SSE2 0FH, 0
+
+; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSSE3 below
+
+align 16
+E104: MOVE_UNALIGNED_SSSE3 4
+E108: MOVE_UNALIGNED_SSSE3 8
+E10C: MOVE_UNALIGNED_SSSE3 0CH
+E101: MOVE_UNALIGNED_SSSE3 1
+E102: MOVE_UNALIGNED_SSSE3 2
+E103: MOVE_UNALIGNED_SSSE3 3
+E105: MOVE_UNALIGNED_SSSE3 5
+E106: MOVE_UNALIGNED_SSSE3 6
+E107: MOVE_UNALIGNED_SSSE3 7
+E109: MOVE_UNALIGNED_SSSE3 9
+times 1 nop
+E10A: MOVE_UNALIGNED_SSSE3 0AH
+E10B: MOVE_UNALIGNED_SSSE3 0BH
+E10D: MOVE_UNALIGNED_SSSE3 0DH
+E10E: MOVE_UNALIGNED_SSSE3 0EH
+E10F: MOVE_UNALIGNED_SSSE3 0FH
+
+; Codes for non-temporal move. Aligned case first
+
+align 16
+F100: ; Non-temporal move, src and dest have same alignment.
+ ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm0, [rsi+rcx] ; Read
+ movaps xmm1, [rsi+rcx+10H]
+ movntps [rdi+rcx], xmm0 ; Write non-temporal (bypass cache)
+ movntps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz F100 ; Loop through negative rcx up to zero
+
+ ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz C500 ; Skip if no more data
+ ; Check if we can move one more 16-bytes block
+ cmp edx, -10H
+ jg C200
+ ; move 16 bytes, aligned
+ movaps xmm0, [rsi+rdx]
+ movntps [rdi+rdx], xmm0
+ add rdx, 10H
+ sfence
+ ; move the remaining 0 - 15 bytes
+ jmp C200
+
+; Make 15 instances of MOVE_UNALIGNED_SSE2 macro for each value of
+; the alignment u.
+; These are pointed to by the jump table AlignmentDispatchNT below
+
+;align 16
+F104: MOVE_UNALIGNED_SSE2_4 1
+F108: MOVE_UNALIGNED_SSE2_8 1
+F10C: MOVE_UNALIGNED_SSE2_12 1
+F101: MOVE_UNALIGNED_SSE2 1, 1
+F102: MOVE_UNALIGNED_SSE2 2, 1
+F103: MOVE_UNALIGNED_SSE2 3, 1
+F105: MOVE_UNALIGNED_SSE2 5, 1
+F106: MOVE_UNALIGNED_SSE2 6, 1
+F107: MOVE_UNALIGNED_SSE2 7, 1
+F109: MOVE_UNALIGNED_SSE2 9, 1
+F10A: MOVE_UNALIGNED_SSE2 0AH, 1
+F10B: MOVE_UNALIGNED_SSE2 0BH, 1
+F10D: MOVE_UNALIGNED_SSE2 0DH, 1
+F10E: MOVE_UNALIGNED_SSE2 0EH, 1
+F10F: MOVE_UNALIGNED_SSE2 0FH, 1
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+memcpyCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
+ ; This part is executed only once
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ push r8
+ ; set CacheBypassLimit to half the size of the largest level cache
+ call GetMemcpyCacheLimit@
+ mov eax, 1
+ cpuid ; Get feature flags
+ lea rbx, [memcpySSE2@]
+ bt ecx, 9 ; Test bit for SupplSSE3
+ jnc Q100
+ lea rbx, [memcpySSSE3@]
+ call UnalignedIsFaster ; Test if unaligned read is faster than aligned read and shift
+ test eax, eax
+ jz Q100
+ lea rbx, [memcpyU@]
+ call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
+ test eax, eax
+ jz Q100
+ lea rbx, [memcpyU256@]
+Q100:
+ ; Insert appropriate pointer
+ mov [memcpyDispatch], rbx
+ mov rax, rbx
+ pop r8
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ ; Jump according to the replaced function pointer
+ jmp rax
+
+; extern "C" size_t GetMemcpyCacheLimit();
+GetMemcpyCacheLimit:
+GetMemcpyCacheLimit@: ; local limit
+ mov rax, [CacheBypassLimit]
+ test rax, rax
+ jnz U200
+ ; Get half the size of the largest level cache
+%ifdef WINDOWS
+ xor ecx, ecx ; 0 means largest level cache
+%else
+ xor edi, edi ; 0 means largest level cache
+%endif
+ call DataCacheSize ; get cache size
+ shr rax, 1 ; half the size
+ jnz U100
+ mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
+U100: mov [CacheBypassLimit], rax
+U200: ret
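GetMemcpyCacheLimit lazily computes the bypass threshold as half the largest data cache, falling back to 4 MB when the size cannot be determined. A sketch under that reading; data_cache_size is a stand-in for the DataCacheSize routine imported from cachesize64.asm.

    #include <cstddef>

    static std::size_t data_cache_size(int level) { (void)level; return 0; }  // stand-in for DataCacheSize

    static std::size_t cache_bypass_limit = 0;  // mirrors "CacheBypassLimit: DQ 0"

    static std::size_t get_memcpy_cache_limit()
    {
        if (cache_bypass_limit == 0) {
            std::size_t half = data_cache_size(0) / 2;    // 0 selects the largest level cache
            cache_bypass_limit = half ? half : 0x400000;  // 4 MB fallback if the size is unknown
        }
        return cache_bypass_limit;
    }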
+
+; Note: SetMemcpyCacheLimit is defined in memmove64.asm, calling SetMemcpyCacheLimit1
+SetMemcpyCacheLimit1:
+%ifdef WINDOWS
+ mov rax, rcx
+%else
+ mov rax, rdi
+%endif
+ test rax, rax
+ jnz U400
+ ; zero, means default
+ mov [CacheBypassLimit], rax
+ call GetMemcpyCacheLimit@
+U400: mov [CacheBypassLimit], rax
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; getDispatch, for testing only
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+getDispatch:
+mov rax,[memcpyDispatch]
+ret
+
+global getDispatch
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; The CPU dispatcher replaces AlignmentDispatch with
+; AlignmentDispatchSSE2 or AlignmentDispatchSSSE3 if Suppl-SSE3
+; is supported.
+
+; Code pointer for each alignment for SSE2 instruction set
+AlignmentDispatchSSE2:
+DQ C100, D101, D102, D103, D104, D105, D106, D107
+DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
+
+; Code pointer for each alignment for Suppl-SSE3 instruction set
+AlignmentDispatchSSSE3:
+DQ C100, E101, E102, E103, E104, E105, E106, E107
+DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
+
+; Code pointer for each alignment for non-temporal store
+AlignmentDispatchNT:
+DQ F100, F101, F102, F103, F104, F105, F106, F107
+DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
+
+; Pointer to appropriate version.
+; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
+; change this to the appropriate version of memcpy, so that
+; memcpyCPUDispatch is only executed once:
+memcpyDispatch DQ memcpyCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > CacheBypassLimit
+; The optimal value of CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache:
+CacheBypassLimit: DQ 0
diff --git a/contrib/libs/asmlib/memmove64.asm b/contrib/libs/asmlib/memmove64.asm new file mode 100644 index 0000000000..1c61032541 --- /dev/null +++ b/contrib/libs/asmlib/memmove64.asm @@ -0,0 +1,1090 @@ +%include "defs.asm"
+
+;************************* memmove64.asm ***********************************
+; Author: Agner Fog
+; Date created: 2008-07-18
+; Last modified: 2016-11-16 (patched version with AVX512 support removed)
+; Description:
+; Faster version of the standard memmove function:
+; void * A_memmove(void *dest, const void *src, size_t count);
+; Moves 'count' bytes from 'src' to 'dest'. src and dest may overlap.
+;
+; Overriding standard function memmove:
+; The alias ?OVR_memmove is changed to _memmove in the object file if
+; it is desired to override the standard library function memmove.
+;
+; CPU dispatching included for different CPUs
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_memmove: function ; Function A_memmove
+global EXP(memmove): function ; ?OVR removed if standard function memmove overridden
+global memmoveSSE2: function ; Version for processors with only SSE2
+global memmoveSSSE3: function ; Version for processors with SSSE3
+global memmoveU: function ; Version for processors with fast unaligned read
+global memmoveU256: function ; Version for processors with fast 256-bit read/write
+global SetMemcpyCacheLimit ; Change limit for bypassing cache
+
+; Imported from memcpy64.asm:
+extern A_memcpy ; function entry
+extern memcpySSE2 ; CPU specific function entry
+extern memcpySSSE3 ; CPU specific function entry
+extern memcpyU ; CPU specific function entry
+extern memcpyU256 ; CPU specific function entry
+
+; Imported from instrset64.asm
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster64.asm:
+extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
+extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Imported from memcpy64.asm
+extern GetMemcpyCacheLimit ; Get the size limit for bypassing cache when copying with memcpy and memmove
+extern SetMemcpyCacheLimit1 ; Set the size limit for bypassing cache when copying with memcpy
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Prolog macro. Determine if we should move forwards or backwards
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Define prolog for this function
+; Parameter 1 is forward function label
+%MACRO PROLOGM 1
+%IFDEF WINDOWS
+ ; Check if dest overlaps src
+ mov rax, rcx
+ sub rax, rdx
+ cmp rax, r8
+ ; We can avoid testing for dest < src by using unsigned compare:
+ ; (Assume that the memory block cannot span across address 0)
+ ; Must move backwards if unsigned(dest-src) < count
+ jae %1 ; Jump to memcpy if we can move forwards
+ push rsi
+ push rdi
+ mov rdi, rcx ; dest
+ mov r9, rcx ; dest
+ mov rsi, rdx ; src
+ mov rcx, r8 ; count
+%ELSE ; Unix
+ ; Check if dest overlaps src
+ mov rax, rdi
+ sub rax, rsi
+ cmp rax, rdx
+ ; Must move backwards if unsigned(dest-src) < count
+ jae %1 ; Jump to memcpy if we can move forwards
+ mov rcx, rdx ; count
+ mov r9, rdi ; dest
+%ENDIF
+%ENDM
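The PROLOGM macro decides the copy direction with one unsigned comparison: if unsigned(dest - src) >= count the blocks cannot overlap destructively for a forward copy and control jumps to memcpy, otherwise the move runs backwards. The check spelled out in C++, assuming, as the comment does, that the block never spans address 0:

    #include <cstddef>
    #include <cstdint>

    // True when a forward copy (memcpy) is safe, false when the move must run backwards.
    static bool can_copy_forward(const void* dest, const void* src, std::size_t count)
    {
        std::uintptr_t diff = reinterpret_cast<std::uintptr_t>(dest)
                            - reinterpret_cast<std::uintptr_t>(src);  // wraps modulo 2^64
        return diff >= count;  // "jae %1" in PROLOGM: unsigned(dest - src) >= count
    }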
+
+
+; Define return from this function
+%MACRO RETURNM 0
+%IFDEF WINDOWS
+ pop rdi
+ pop rsi
+%ENDIF
+ mov rax, r9 ; Return value = dest
+ ret
+%ENDMACRO
+
+
+SECTION .text align=16
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memmove(void * dest, const void * src, size_t count);
+; Function entry:
+A_memmove:
+EXP(memmove):
+ jmp qword [memmoveDispatch] ; Go to appropriate version, depending on instruction set
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memmoveU256: ; Version for processors with fast 256-bit read/write
+memmoveU256@: ; local label
+ PROLOGM memcpyU256
+
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [rdi+rcx] ; end of dest
+ and edx, 1FH
+ jz B4300 ; Skip if end of dest aligned by 32
+
+ ; edx = size of last partial block, 1 - 31 bytes
+ test dl, 3
+ jz B4210
+ test dl, 1
+ jz B4201 ; B4200 if we haven't tested edx,3
+ ; move 1 byte
+ dec rcx
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+B4200: test dl, 2
+ jz B4210
+B4201: ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+B4210: test dl, 4
+ jz B4220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+B4220: test dl, 8
+ jz B4230
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+B4230: test dl, 16
+ jz B4300
+ ; move 16 bytes
+ sub rcx, 16
+ movups xmm0, [rsi+rcx]
+ movaps [rdi+rcx], xmm0
+
+B4300: ; Now end of dest is aligned by 32. Any partial block has been moved
+ mov rdx, rcx
+ and ecx, 1FH ; remaining size after 32 bytes blocks moved
+ and rdx, -20H ; number of 32 bytes blocks
+ jz H4100
+ add rsi, rcx
+ add rdi, rcx
+
+ ; Check if count very big
+ cmp rdx, [CacheBypassLimit]
+ ja H4800 ; Use non-temporal store if count > CacheBypassLimit
+
+align 16
+H4000: ; 32 bytes move loop
+ vmovups ymm0, [rsi+rdx-20H]
+ vmovaps [rdi+rdx-20H], ymm0
+ sub rdx, 20H
+ jnz H4000
+ vzeroupper
+
+H4090: sub rsi, rcx
+ sub rdi, rcx
+
+H4100: ; remaining 0-31 bytes
+ test ecx, ecx
+ jz H4600
+ test cl, 10H
+ jz H4200
+ ; move 16 bytes
+ sub ecx, 10H
+ movups xmm0, [rsi+rcx]
+ movaps [rdi+rcx], xmm0
+ jz H4600 ; early out if count divisible by 16
+H4200: test cl, 8
+ jz H4300
+ ; move 8 bytes
+ sub ecx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+H4300: test cl, 4
+ jz H4400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz H4600 ; early out if count divisible by 4
+H4400: test cl, 2
+ jz H4500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+H4500: test cl, 1
+ jz H4600
+ ; move 1 byte
+ movzx eax, byte [rsi] ; rcx-1 = 0
+ mov [rdi], al
+H4600: ; finished
+ RETURNM
+
+align 16
+H4800: ; 32 bytes move loop, bypass cache
+ vmovups ymm0, [rsi+rdx-20H]
+ vmovntps [rdi+rdx-20H], ymm0
+ sub rdx, 20H
+ jnz H4800
+ sfence
+ vzeroupper
+ jmp H4090
+
+A1000: ; count < 64. Move 32-16-8-4-2-1 bytes
+ test cl, 20H
+ jz A1100
+ ; move 32 bytes
+ ; movups is faster on processors with SSSE3
+ sub ecx, 20H
+ movups xmm0, [rsi+rcx+10H]
+ movups xmm1, [rsi+rcx]
+ movups [rdi+rcx+10H], xmm0
+ movups [rdi+rcx], xmm1
+A1100: test cl, 10H
+ jz A1200
+ ; move 16 bytes
+ sub ecx, 10H
+ movups xmm0, [rsi+rcx]
+ movups [rdi+rcx], xmm0
+A1200: test cl, 8
+ jz A1300
+ ; move 8 bytes
+ sub ecx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+A1300: test cl, 4
+ jz A1400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz A1900 ; early out if count divisible by 4
+A1400: test cl, 2
+ jz A1500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+A1500: test cl, 1
+ jz A1900
+ ; move 1 byte
+ movzx eax, byte [rsi] ; rcx-1 = 0
+ mov [rdi], al
+A1900: ; finished
+ RETURNM
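The A1000 tail path inspects one bit of the count at a time and moves the matching power-of-two chunk from the top of the block downwards, so any count below 64 needs at most six tests and no loop. A rough C++ equivalent; a temporary buffer stands in for the XMM/GPR registers used by the assembly:

    #include <cstddef>
    #include <cstring>

    // Backward move of count < 64 bytes: one power-of-two chunk per set bit of count.
    static void move_small_backward(unsigned char* dest, const unsigned char* src, std::size_t count)
    {
        unsigned char tmp[32];
        for (std::size_t chunk = 32; chunk >= 1; chunk /= 2) {
            if (count & chunk) {
                count -= chunk;
                std::memcpy(tmp, src + count, chunk);   // load the chunk first ("into registers")
                std::memcpy(dest + count, tmp, chunk);  // then store it at the same offset
            }
        }
    }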
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with fast unaligned read and fast 16 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memmoveU: ; Version for processors with fast unaligned read
+memmoveU@: ; local label
+ PROLOGM memcpyU
+
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [rdi+rcx] ; end of dest
+ and edx, 0FH
+ jz B3300 ; Skip if end of dest aligned by 16
+
+ ; edx = size of last partial block, 1 - 15 bytes
+ test dl, 3
+ jz B3210
+ test dl, 1
+ jz B3201 ; B3200 if we haven't tested edx,3
+ ; move 1 byte
+ dec rcx
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+B3200: test dl, 2
+ jz B3210
+B3201: ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+B3210: test dl, 4
+ jz B3220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+B3220: test dl, 8
+ jz B3300
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+
+B3300: ; Now end of dest is aligned by 16. Any partial block has been moved
+ mov rdx, rcx
+ and ecx, 1FH ; remaining size after 32 bytes blocks moved
+ and rdx, -20H ; number of 32 bytes blocks
+ jz H1100
+ add rsi, rcx
+ add rdi, rcx
+
+ ; Check if count very big
+ cmp rdx, [CacheBypassLimit]
+ ja H1800 ; Use non-temporal store if count > CacheBypassLimit
+
+align 16 ; minimize 16-bytes boundaries in H1000 loop
+H1000: ; 32 bytes move loop
+ movups xmm1, [rsi+rdx-20H]
+ movups xmm0, [rsi+rdx-10H]
+ movaps [rdi+rdx-20H], xmm1
+ movaps [rdi+rdx-10H], xmm0
+ sub rdx, 20H
+ jnz H1000
+
+H1090: sub rsi, rcx
+ sub rdi, rcx
+
+H1100: ; remaining 0-31 bytes
+ test ecx, ecx
+ jz H1600
+ test cl, 10H
+ jz H1200
+ ; move 16 bytes
+ sub ecx, 10H
+ movups xmm0, [rsi+rcx]
+ movaps [rdi+rcx], xmm0
+ jz H1600 ; early out if count divisible by 16
+H1200: test cl, 8
+ jz H1300
+ ; move 8 bytes
+ sub ecx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+H1300: test cl, 4
+ jz H1400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz H1600 ; early out if count divisible by 4
+H1400: test cl, 2
+ jz H1500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+H1500: test cl, 1
+ jz H1600
+ ; move 1 byte
+ movzx eax, byte [rsi] ; rcx-1 = 0
+ mov [rdi], al
+H1600: ; finished
+ RETURNM
+
+align 16
+H1800: ; 32 bytes move loop, bypass cache
+ movups xmm1, [rsi+rdx-20H]
+ movups xmm0, [rsi+rdx-10H]
+ movntps [rdi+rdx-20H], xmm1
+ movntps [rdi+rdx-10H], xmm0
+ sub rdx, 20H
+ jnz H1800
+ sfence
+ jmp H1090
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memmoveSSSE3: ; SSSE3 version begins here
+memmoveSSSE3@: ; local label
+ PROLOGM memcpySSSE3
+
+ ; Cannot use memcpy. Must move backwards because of overlap between src and dest
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+ ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [rdi+rcx] ; end of dest
+ and edx, 0FH
+ jz B1300 ; Skip if end of dest aligned by 16
+
+ ; edx = size of last partial block, 1 - 15 bytes
+ test dl, 3
+ jz B1210
+ test dl, 1
+ jz B1201 ; B1200 if we haven't tested edx,3
+ ; move 1 byte
+ dec rcx
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+B1200: test dl, 2
+ jz B1210
+B1201: ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+B1210: test dl, 4
+ jz B1220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+B1220: test dl, 8
+ jz B1300
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+
+B1300: ; Now end of dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of end of src modulo 16 at this point:
+ lea eax, [rsi+rcx]
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ sub edx, ecx ; Remaining data after loop
+ sub rsi, rax ; Nearest preceding aligned block of src
+ ; Add the same to rsi and rdi as we have subtracted from rcx
+ add rsi, rdx
+ add rdi, rdx
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B1400 ; Use non-temporal store if count > CacheBypassLimit
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [MAlignmentDispatchSSSE3]
+ jmp near [r8+rax*8]
+
+B1400: ; Dispatch to different codes depending on src alignment
+ lea r8, [MAlignmentDispatchNT]
+ jmp near [r8+rax*8]
+
+
+align 16
+C100: ; Code for aligned src. SSE2 and later CPUs
+ ; The nice case, src and dest have same alignment.
+
+ ; Loop. rcx has positive index from the beginning, counting down to zero
+ movaps xmm0, [rsi+rcx-10H]
+ movaps xmm1, [rsi+rcx-20H]
+ movaps [rdi+rcx-10H], xmm0
+ movaps [rdi+rcx-20H], xmm1
+ sub rcx, 20H
+ jnz C100
+
+ ; Move the remaining edx bytes (0 - 31):
+ ; move 16-8-4-2-1 bytes, aligned
+ test edx, edx
+ jz C500 ; Early out if no more data
+ test dl, 10H
+ jz C200
+ ; move 16 bytes
+ sub rcx, 10H
+ movaps xmm0, [rsi+rcx]
+ movaps [rdi+rcx], xmm0
+
+C200: ; Other branches come in here, rcx may contain arbitrary offset
+ test edx, edx
+ jz C500 ; Early out if no more data
+ test dl, 8
+ jz C210
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+C210: test dl, 4
+ jz C220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz C500 ; Early out if count divisible by 4
+C220: test dl, 2
+ jz C230
+ ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+C230: test dl, 1
+ jz C500
+ ; move 1 byte
+ movzx eax, byte [rsi+rcx-1] ; rcx-1 is not always 0 here
+ mov [rdi+rcx-1], al
+C500: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+memmoveSSE2: ; SSE2 version begins here
+memmoveSSE2@: ; local label
+ PROLOGM memcpySSE2
+
+ ; Cannot use memcpy. Must move backwards because of overlap between src and dest
+ cmp rcx, 40H
+ jae B0100 ; Jump to B0100 if count >= 64; simpler code below handles count < 64
+
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ test cl, 20H
+ jz A100
+ ; move 32 bytes
+ ; mov is faster than movdqu on SSE2 processors,
+ ; movdqu is faster on later processors
+ sub ecx, 20H
+ mov rax, [rsi+rcx+18H]
+ mov rdx, [rsi+rcx+10H]
+ mov [rdi+rcx+18H], rax
+ mov [rdi+rcx+10H], rdx
+ mov rax, [rsi+rcx+8]
+ mov rdx, [rsi+rcx]
+ mov [rdi+rcx+8], rax
+ mov [rdi+rcx], rdx
+A100: test cl, 10H
+ jz A200
+ ; move 16 bytes
+ sub ecx, 10H
+ mov rax, [rsi+rcx+8]
+ mov rdx, [rsi+rcx]
+ mov [rdi+rcx+8], rax
+ mov [rdi+rcx], rdx
+A200: test cl, 8
+ jz A300
+ ; move 8 bytes
+ sub ecx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+A300: test cl, 4
+ jz A400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz A900 ; early out if count divisible by 4
+A400: test cl, 2
+ jz A500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+A500: test cl, 1
+ jz A900
+ ; move 1 byte
+ movzx eax, byte [rsi] ; rcx-1 = 0
+ mov [rdi], al
+A900: ; finished
+ RETURNM
+
+B0100: ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [rdi+rcx] ; end of dest
+ and edx, 0FH
+ jz B0300 ; Skip if end of dest aligned by 16
+
+ ; edx = size of last partial block, 1 - 15 bytes
+ test dl, 3
+ jz B0210
+ test dl, 1
+ jz B0201 ; B0200 if we haven't tested edx,3
+ ; move 1 byte
+ dec rcx
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+B0200: test dl, 2
+ jz B0210
+B0201: ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+B0210: test dl, 4
+ jz B0220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+B0220: test dl, 8
+ jz B0300
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+
+B0300: ; Now end of dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of end of src modulo 16 at this point:
+ lea eax, [rsi+rcx]
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ sub edx, ecx ; Remaining data after loop
+ sub rsi, rax ; Nearest preceding aligned block of src
+ ; Add the same to rsi and rdi as we have subtracted from rcx
+ add rsi, rdx
+ add rdi, rdx
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B0400 ; Use non-temporal store if count > CacheBypassLimit
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [MAlignmentDispatchSSE2]
+ jmp near [r8+rax*8]
+
+B0400: ; Dispatch to different codes depending on src alignment
+ lea r8, [MAlignmentDispatchNT]
+ jmp near [r8+rax*8]
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2 2 ; u, nt
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = count rounded down to nearest divisible by 32
+; edx = remaining bytes to move after loop
+ movdqa xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H
+ movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movdqa xmm2, [rsi+rcx]
+ movdqa xmm3, xmm1 ; Copy because used twice
+ pslldq xmm0, 16-%1 ; shift left
+ psrldq xmm1, %1 ; shift right
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx+10H], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx+10H], xmm0 ; Save aligned
+ %ENDIF
+ movdqa xmm0, xmm2 ; Save for next iteration
+ pslldq xmm3, 16-%1 ; shift left
+ psrldq xmm2, %1 ; shift right
+ por xmm3, xmm2 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx], xmm3 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx], xmm3 ; Save aligned
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movdqa xmm1, [rsi+rcx]
+ pslldq xmm0, 16-%1 ; shift left
+ psrldq xmm1, %1 ; shift right
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx], xmm0 ; Save aligned
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %2 == 1
+ sfence
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
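Each instance of MOVE_REVERSE_UNALIGNED_SSE2 stitches one aligned destination block out of two aligned source reads: pslldq/psrldq move the pieces into place and por joins them, and because the byte-shift immediates must be constants the macro is expanded once per misalignment u. A sketch of the combine for a fixed u = 5 using SSE2 intrinsics; the register reuse across loop iterations is omitted:

    #include <emmintrin.h>  // SSE2 intrinsics

    // One instance of the shift+OR combine, for the fixed misalignment u = 5:
    // returns the 16-byte run that starts 5 bytes into aligned block 'lo' and
    // ends 5 bytes into the next aligned block 'hi'.
    static __m128i combine_u5(__m128i lo, __m128i hi)
    {
        __m128i low_part  = _mm_srli_si128(lo, 5);       // psrldq: bytes 5..15 of lo -> 0..10
        __m128i high_part = _mm_slli_si128(hi, 16 - 5);  // pslldq: bytes 0..4 of hi -> 11..15
        return _mm_or_si128(low_part, high_part);        // por: block ready for an aligned store
    }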
+
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_4 1 ; nt
+; Special case: u = 4
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movaps xmm2, [rsi+rcx]
+ movaps xmm3, xmm0
+ movaps xmm0, xmm2
+ movss xmm2, xmm1
+ shufps xmm2, xmm2, 00111001B ; Rotate right
+ movss xmm1, xmm3
+ shufps xmm1, xmm1, 00111001B ; Rotate right
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm1 ; Save aligned
+ movaps [rdi+rcx], xmm2 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
+ movntps [rdi+rcx], xmm2 ; Non-temporal save
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movaps xmm1, [rsi+rcx]
+ movss xmm1, xmm0
+ shufps xmm1, xmm1, 00111001B ; Rotate right
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_8 1 ; nt
+; Special case: u = 8
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+ shufps xmm0, xmm0, 01001110B ; Rotate
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ shufps xmm1, xmm1, 01001110B ; Rotate
+ movsd xmm0, xmm1
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx]
+ shufps xmm0, xmm0, 01001110B ; Rotate
+ movsd xmm1, xmm0
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movaps xmm1, [rsi+rcx]
+ shufps xmm1, xmm1, 01001110B ; Rotate
+ movsd xmm0, xmm1
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_12 1 ; nt
+; Special case: u = 12
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+ shufps xmm0, xmm0, 10010011B ; Rotate right
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ shufps xmm1, xmm1, 10010011B ; Rotate left
+ movss xmm0, xmm1
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx]
+ shufps xmm0, xmm0, 10010011B ; Rotate left
+ movss xmm1, xmm0
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movaps xmm1, [rsi+rcx]
+ shufps xmm1, xmm1, 10010011B ; Rotate left
+ movss xmm0, xmm1
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Code for unaligned src, Suppl.SSE3 instruction set.
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSSE3 1; u
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = count rounded down to nearest divisible by 32
+; edx = remaining bytes to move after loop
+ movdqa xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ movdqa xmm1, [rsi+rcx-10H] ; Read next two blocks
+ palignr xmm0, xmm1, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx-10H], xmm0 ; Save aligned
+ movdqa xmm0, [rsi+rcx-20H]
+ palignr xmm1, xmm0, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx-20H], xmm1 ; Save aligned
+ sub rcx, 20H
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movdqa xmm1, [rsi+rcx] ; Read next two blocks
+ palignr xmm0, xmm1, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx], xmm0 ; Save aligned
+
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes
+ jmp C200
+%ENDMACRO
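The Suppl-SSE3 variant does the same combine with a single palignr, which concatenates two registers and extracts a 16-byte window at a constant byte offset. The equivalent intrinsic call for u = 5, as a sketch:

    #include <tmmintrin.h>  // SSSE3 intrinsics

    // Same combine as the SSE2 version above, but with one instruction for u = 5.
    static __m128i combine_u5_ssse3(__m128i lo, __m128i hi)
    {
        return _mm_alignr_epi8(hi, lo, 5);  // palignr: 16-byte window starting 5 bytes into 'lo'
    }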
+
+
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchSSE2 below
+; (aligns and fillers are inserted manually to minimize the
+; number of 16-bytes boundaries inside loops)
+
+align 16
+D104: MOVE_REVERSE_UNALIGNED_SSE2_4 0
+D108: MOVE_REVERSE_UNALIGNED_SSE2_8 0
+D10C: MOVE_REVERSE_UNALIGNED_SSE2_12 0
+D101: MOVE_REVERSE_UNALIGNED_SSE2 1, 0
+D102: MOVE_REVERSE_UNALIGNED_SSE2 2, 0
+D103: MOVE_REVERSE_UNALIGNED_SSE2 3, 0
+D105: MOVE_REVERSE_UNALIGNED_SSE2 5, 0
+D106: MOVE_REVERSE_UNALIGNED_SSE2 6, 0
+D107: MOVE_REVERSE_UNALIGNED_SSE2 7, 0
+D109: MOVE_REVERSE_UNALIGNED_SSE2 9, 0
+D10A: MOVE_REVERSE_UNALIGNED_SSE2 0AH, 0
+D10B: MOVE_REVERSE_UNALIGNED_SSE2 0BH, 0
+D10D: MOVE_REVERSE_UNALIGNED_SSE2 0DH, 0
+D10E: MOVE_REVERSE_UNALIGNED_SSE2 0EH, 0
+D10F: MOVE_REVERSE_UNALIGNED_SSE2 0FH, 0
+
+; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchSSSE3 below
+
+align 16
+E104: MOVE_REVERSE_UNALIGNED_SSSE3 4
+E108: MOVE_REVERSE_UNALIGNED_SSSE3 8
+E10C: MOVE_REVERSE_UNALIGNED_SSSE3 0CH
+E101: MOVE_REVERSE_UNALIGNED_SSSE3 1
+E102: MOVE_REVERSE_UNALIGNED_SSSE3 2
+E103: MOVE_REVERSE_UNALIGNED_SSSE3 3
+E105: MOVE_REVERSE_UNALIGNED_SSSE3 5
+E106: MOVE_REVERSE_UNALIGNED_SSSE3 6
+E107: MOVE_REVERSE_UNALIGNED_SSSE3 7
+E109: MOVE_REVERSE_UNALIGNED_SSSE3 9
+E10A: MOVE_REVERSE_UNALIGNED_SSSE3 0AH
+E10B: MOVE_REVERSE_UNALIGNED_SSSE3 0BH
+E10D: MOVE_REVERSE_UNALIGNED_SSSE3 0DH
+E10E: MOVE_REVERSE_UNALIGNED_SSSE3 0EH
+E10F: MOVE_REVERSE_UNALIGNED_SSSE3 0FH
+
+align 16
+F100: ; Non-temporal move, src and dest have same alignment.
+ ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H
+ movaps xmm0, [rsi+rcx+10H]
+ movaps xmm1, [rsi+rcx]
+ movntps [rdi+rcx+10H], xmm0
+ movntps [rdi+rcx], xmm1
+ jnz F100
+
+ ; Move the remaining edx bytes (0 - 31):
+ ; move 16-8-4-2-1 bytes, aligned
+ test dl, 10H
+ jz C200
+ ; move 16 bytes
+ sub rcx, 10H
+ movaps xmm0, [rsi+rcx]
+ movntps [rdi+rcx], xmm0
+ sfence
+ ; move the remaining 0 - 15 bytes
+ jmp C200
+
+; Non-temporal move, src and dest have different alignment.
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchNT below
+
+align 16
+F101: MOVE_REVERSE_UNALIGNED_SSE2 1, 1
+F102: MOVE_REVERSE_UNALIGNED_SSE2 2, 1
+F103: MOVE_REVERSE_UNALIGNED_SSE2 3, 1
+F104: MOVE_REVERSE_UNALIGNED_SSE2_4 1
+F105: MOVE_REVERSE_UNALIGNED_SSE2 5, 1
+F106: MOVE_REVERSE_UNALIGNED_SSE2 6, 1
+F107: MOVE_REVERSE_UNALIGNED_SSE2 7, 1
+F108: MOVE_REVERSE_UNALIGNED_SSE2_8 1
+F109: MOVE_REVERSE_UNALIGNED_SSE2 9, 1
+F10A: MOVE_REVERSE_UNALIGNED_SSE2 0AH, 1
+F10B: MOVE_REVERSE_UNALIGNED_SSE2 0BH, 1
+F10C: MOVE_REVERSE_UNALIGNED_SSE2_12 1
+F10D: MOVE_REVERSE_UNALIGNED_SSE2 0DH, 1
+F10E: MOVE_REVERSE_UNALIGNED_SSE2 0EH, 1
+F10F: MOVE_REVERSE_UNALIGNED_SSE2 0FH, 1
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+memmoveCPUDispatch: ; CPU dispatcher, check for Suppl-SSE3 instruction set
+ ; This part is executed only once
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ push r8
+
+ ; set CacheBypassLimit to half the size of the largest level cache
+%ifdef WINDOWS
+ xor ecx, ecx ; 0 means default
+%else
+ xor edi, edi
+%endif
+ call SetMemcpyCacheLimit@
+ mov eax, 1
+ cpuid ; Get feature flags
+ lea rbx, [memmoveSSE2@]
+ bt ecx, 9 ; Test bit for SupplSSE3
+ jnc Q100
+ lea rbx, [memmoveSSSE3@]
+ call UnalignedIsFaster
+ test eax, eax
+ jz Q100
+ lea rbx, [memmoveU@]
+ call Store256BitIsFaster
+ test eax, eax
+ jz Q100
+ lea rbx, [memmoveU256@]
+
+Q100: ; Insert appropriate pointer
+ mov [memmoveDispatch], rbx
+ mov rax, rbx
+ pop r8
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ ; Jump according to the replaced function pointer
+ jmp rax
+
+; Note: Must call SetMemcpyCacheLimit1 defined in memcpy64.asm
+SetMemcpyCacheLimit:
+SetMemcpyCacheLimit@:
+ call SetMemcpyCacheLimit1
+ mov [CacheBypassLimit], rax
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; The CPU dispatcher replaces MAlignmentDispatch with
+; MAlignmentDispatchSSE2 or MAlignmentDispatchSSSE3 if Suppl-SSE3
+; is supported.
+
+; Code pointer for each alignment for SSE2 instruction set
+MAlignmentDispatchSSE2:
+DQ C100, D101, D102, D103, D104, D105, D106, D107
+DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
+
+; Code pointer for each alignment for Suppl-SSE3 instruction set
+MAlignmentDispatchSSSE3:
+DQ C100, E101, E102, E103, E104, E105, E106, E107
+DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
+
+; Code pointer for each alignment for non-temporal store
+MAlignmentDispatchNT:
+DQ F100, F101, F102, F103, F104, F105, F106, F107
+DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
+
+memmoveDispatch: DQ memmoveCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > CacheBypassLimit
+; The optimal value of CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache:
+CacheBypassLimit: DD 0
diff --git a/contrib/libs/asmlib/memset64.asm b/contrib/libs/asmlib/memset64.asm new file mode 100644 index 0000000000..52d647984d --- /dev/null +++ b/contrib/libs/asmlib/memset64.asm @@ -0,0 +1,372 @@ +%include "defs.asm"
+
+;************************* memset64.asm *************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2016-11-12 (patched version with AVX512 support removed)
+; Description:
+; Faster version of the standard memset function:
+; void * A_memset(void * dest, int c, size_t count);
+; Sets 'count' bytes from 'dest' to the 8-bit value 'c'
+;
+; Overriding standard function memset:
+; The alias ?OVR_memset is changed to _memset in the object file if
+; it is desired to override the standard library function memset.
+;
+; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
+; extern "C" void SetMemsetCacheLimit(); // Change limit in GetMemsetCacheLimit
+;
+; Optimization:
+; Uses XMM registers to set 16 bytes at a time, aligned.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_memset: function ; Function memset
+global EXP(memset): function ; ?OVR removed if standard function memset overridden
+global memsetSSE2: function ; SSE2 version
+global memsetAVX: function ; version for CPUs with fast 256-bit store
+global GetMemsetCacheLimit: function ; Data blocks bigger than this will be stored uncached by memset
+global SetMemsetCacheLimit: function ; Change limit in GetMemsetCacheLimit
+
+; Imported from cachesize64.asm:
+extern DataCacheSize ; Get size of data cache
+
+; Imported from unalignedisfaster64.asm:
+extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Define prolog for this function
+%MACRO PROLOGM 0
+%IFDEF WINDOWS
+%define Rdest rcx ; dest
+ movzx eax, dl ; c
+ mov rdx, r8 ; count
+%define Rcount rdx ; count
+%define Rdest2 r9 ; copy of dest
+%define Rcount2 r8 ; copy of count
+
+%ELSE ; Unix
+%define Rdest rdi ; dest
+ movzx eax, sil ; c
+%define Rcount rdx ; count
+%define Rdest2 rcx ; copy of dest
+%define Rcount2 rsi ; copy of count
+ mov Rcount2, Rcount ; copy count
+%ENDIF
+%ENDMACRO
+
+
+SECTION .text align=16
+
+; extern "C" void * memset(void * dest, int c, size_t count);
+; Function entry:
+A_memset:
+EXP(memset):
+ jmp [memsetDispatch] ; CPU dispatch table
+
+memsetAVX: ; AVX version. Use ymm register
+memsetAVX@: ; local label
+ PROLOGM
+ imul eax, 01010101H ; Broadcast c into all bytes of eax
+ mov Rdest2, Rdest ; save dest
+ cmp Rcount, 16
+ ja B100
+B050: lea r10, [MemsetJTab] ; SSE2 version comes in here
+ jmp qword [r10+Rcount*8] ; jump table for small counts
+
+; Separate code for each count from 0 to 16:
+M16: mov [Rdest+12], eax
+M12: mov [Rdest+8], eax
+M08: mov [Rdest+4], eax
+M04: mov [Rdest], eax
+M00: mov rax, Rdest2 ; return dest
+ ret
+
+M15: mov [Rdest+11], eax
+M11: mov [Rdest+7], eax
+M07: mov [Rdest+3], eax
+M03: mov [Rdest+1], ax
+M01: mov [Rdest], al
+ mov rax, Rdest2 ; return dest
+ ret
+
+M14: mov [Rdest+10], eax
+M10: mov [Rdest+6], eax
+M06: mov [Rdest+2], eax
+M02: mov [Rdest], ax
+ mov rax, Rdest2 ; return dest
+ ret
+
+M13: mov [Rdest+9], eax
+M09: mov [Rdest+5], eax
+M05: mov [Rdest+1], eax
+ mov [Rdest], al
+ mov rax, Rdest2 ; return dest
+ ret
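For counts 0-16, memset jumps through MemsetJTab into chains of possibly overlapping stores that fall through to one another, after imul eax, 01010101H has broadcast the byte value into all four bytes of eax. A C++ rendering of the same chains using switch fall-through; unaligned stores are written via memcpy to stay portable:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Set n (0..16) bytes at p to the byte value c, mirroring the MemsetJTab fall-through chains.
    static void memset_small(unsigned char* p, unsigned char c, std::size_t n)
    {
        std::uint32_t v = c * 0x01010101u;                      // imul eax, 01010101H: broadcast c
        switch (n) {
        case 16: std::memcpy(p + 12, &v, 4); [[fallthrough]];   // M16
        case 12: std::memcpy(p + 8,  &v, 4); [[fallthrough]];   // M12
        case 8:  std::memcpy(p + 4,  &v, 4); [[fallthrough]];   // M08
        case 4:  std::memcpy(p,      &v, 4); [[fallthrough]];   // M04
        case 0:  return;                                        // M00
        case 15: std::memcpy(p + 11, &v, 4); [[fallthrough]];   // M15
        case 11: std::memcpy(p + 7,  &v, 4); [[fallthrough]];   // M11
        case 7:  std::memcpy(p + 3,  &v, 4); [[fallthrough]];   // M07
        case 3:  std::memcpy(p + 1,  &v, 2); [[fallthrough]];   // M03 (word store)
        case 1:  p[0] = c; return;                              // M01
        case 14: std::memcpy(p + 10, &v, 4); [[fallthrough]];   // M14
        case 10: std::memcpy(p + 6,  &v, 4); [[fallthrough]];   // M10
        case 6:  std::memcpy(p + 2,  &v, 4); [[fallthrough]];   // M06
        case 2:  std::memcpy(p,      &v, 2); return;            // M02
        case 13: std::memcpy(p + 9,  &v, 4); [[fallthrough]];   // M13
        case 9:  std::memcpy(p + 5,  &v, 4); [[fallthrough]];   // M09
        case 5:  std::memcpy(p + 1,  &v, 4); p[0] = c; return;  // M05
        }
    }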
+
+B100: ; AVX version, Rcount > 16
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 ; Broadcast c into all bytes of xmm0
+
+ lea rax, [Rdest+Rcount] ; point to end
+
+ cmp Rcount, 20H
+ jbe K600 ; faster to use xmm registers if small
+
+ ; Store the first possibly unaligned 16 bytes
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the subsequent regular part, than to make possibly mispredicted
+ ; branches depending on the size of the first part.
+ movups oword [Rdest], xmm0
+
+ ; store another 16 bytes, aligned
+ add Rdest, 10H
+ and Rdest, -10H
+ movaps oword [Rdest], xmm0
+
+ ; go to next 32 bytes boundary
+ add Rdest, 10H
+ and Rdest, -20H
+
+ ; Check if count very big
+ cmp Rcount, [MemsetCacheLimit]
+ ja K300 ; Use non-temporal store if count > MemsetCacheLimit
+
+ ; find last 32 bytes boundary
+ mov Rcount, rax
+ and Rcount, -20H
+
+ ; - size of 32-bytes blocks
+ sub Rdest, Rcount
+ jnb K200 ; Jump if not negative
+
+ ; extend value to 256 bits
+ vinsertf128 ymm0,ymm0,xmm0,1
+
+align 16
+K100: ; Loop through 32-bytes blocks. Register use is swapped
+ ; Rcount = end of 32-bytes blocks part
+ ; Rdest = negative index from the end, counting up to zero
+ vmovaps [Rcount+Rdest], ymm0
+ add Rdest, 20H
+ jnz K100
+ vzeroupper
+
+K200: ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
+ movups [rax-20H], xmm0
+ movups [rax-10H], xmm0
+ mov rax, Rdest2 ; return dest
+ ret
+
+K300: ; Use non-temporal moves, same code as above:
+
+ ; find last 32 bytes boundary
+ mov Rcount, rax
+ and Rcount, -20H
+
+ ; - size of 32-bytes blocks
+ sub Rdest, Rcount
+ jnb K500 ; Jump if not negative
+
+ ; extend value to 256 bits
+ vinsertf128 ymm0,ymm0,xmm0,1
+
+align 16
+K400: ; Loop through 32-bytes blocks. Register use is swapped
+ ; Rcount = end of 32-bytes blocks part
+ ; Rdest = negative index from the end, counting up to zero
+ vmovntps [Rcount+Rdest], ymm0
+ add Rdest, 20H
+ jnz K400
+ sfence
+ vzeroupper
+
+K500: ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
+ movups [rax-20H], xmm0
+ movups [rax-10H], xmm0
+ mov rax, Rdest2 ; return dest
+ ret
+
+K600: ; 16 < count <= 32
+ movups [Rdest], xmm0
+ movups [rax-10H], xmm0
+ mov rax, Rdest2 ; return dest
+ ret
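K600 covers 16 < count <= 32 with just two 16-byte stores, one at the start and one ending exactly at dest+count; any overlap between them is harmless because both write the same value. The same trick in C++, as a sketch:

    #include <cstddef>
    #include <cstring>

    // Fill n bytes (17 <= n <= 32) with two possibly overlapping 16-byte stores, as in K600.
    static void memset_17_to_32(unsigned char* dest, unsigned char c, std::size_t n)
    {
        unsigned char block[16];
        std::memset(block, c, sizeof block);    // stands in for xmm0 filled with c
        std::memcpy(dest,          block, 16);  // first 16 bytes
        std::memcpy(dest + n - 16, block, 16);  // last 16 bytes, may overlap the first store
    }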
+
+
+memsetSSE2: ; SSE2 version. Counts <= 16 are handled by the jump table at B050
+memsetSSE2@: ; local label
+ PROLOGM
+ imul eax, 01010101H ; Broadcast c into all bytes of eax
+ mov Rdest2, Rdest ; save dest
+ cmp Rcount, 16
+ jna B050
+
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 ; Broadcast c into all bytes of xmm0
+
+ ; Store the first unaligned part.
+ ; The size of this part is 1 - 16 bytes.
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the subsequent regular part, than to make possibly mispredicted
+ ; branches depending on the size of the first part.
+ movq qword [Rdest], xmm0
+ movq qword [Rdest+8], xmm0
+
+ ; Check if count very big
+M150: mov rax, [MemsetCacheLimit]
+ cmp Rcount, rax
+ ja M500 ; Use non-temporal store if count > MemsetCacheLimit
+
+ ; Point to end of regular part:
+ ; Round down dest+count to nearest preceding 16-bytes boundary
+ lea Rcount, [Rdest+Rcount-1]
+ and Rcount, -10H
+
+ ; Point to start of regular part:
+ ; Round up dest to next 16-bytes boundary
+ add Rdest, 10H
+ and Rdest, -10H
+
+ ; -(size of regular part)
+ sub Rdest, Rcount
+ jnb M300 ; Jump if not negative
+
+align 16
+M200: ; Loop through regular part
+ ; Rcount = end of regular part
+ ; Rdest = negative index from the end, counting up to zero
+ movdqa [Rcount+Rdest], xmm0
+ add Rdest, 10H
+ jnz M200
+
+M300: ; Do the last irregular part
+ ; The size of this part is 1 - 16 bytes.
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the preceding regular part, than to make possibly mispredicted
+ ; branches depending on the size of the last part.
+ mov rax, Rdest2 ; dest
+ movq qword [rax+Rcount2-10H], xmm0
+ movq qword [rax+Rcount2-8], xmm0
+ ret
+
+
+M500: ; Use non-temporal moves, same code as above:
+ ; End of regular part:
+ ; Round down dest+count to nearest preceding 16-bytes boundary
+ lea Rcount, [Rdest+Rcount-1]
+ and Rcount, -10H
+
+ ; Start of regular part:
+ ; Round up dest to next 16-bytes boundary
+ add Rdest, 10H
+ and Rdest, -10H
+
+ ; -(size of regular part)
+ sub Rdest, Rcount
+ jnb M700 ; Jump if not negative
+
+align 16
+M600: ; Loop through regular part
+ ; Rcount = end of regular part
+ ; Rdest = negative index from the end, counting up to zero
+ movntdq [Rcount+Rdest], xmm0
+ add Rdest, 10H
+ jnz M600
+ sfence
+
+M700: ; Do the last irregular part
+ ; The size of this part is 1 - 16 bytes.
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the preceding regular part, than to make possibly mispredicted
+ ; branches depending on the size of the last part.
+ mov rax, Rdest2 ; dest
+ movq qword [rax+Rcount2-10H], xmm0
+ movq qword [rax+Rcount2-8], xmm0
+ ret
+
+
+memsetCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
+ ; This part is executed only once
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ push r8
+ ; set CacheBypassLimit to half the size of the largest level cache
+ call GetMemsetCacheLimit@
+ lea rbx, [memsetSSE2@]
+ call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
+ test eax, eax
+ jz Q100
+ lea rbx, [memsetAVX@]
+Q100:
+ ; Insert appropriate pointer
+ mov [memsetDispatch], rbx
+ mov rax, rbx
+ pop r8
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ ; Jump according to the replaced function pointer
+ jmp rax
+
+
+; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
+GetMemsetCacheLimit:
+GetMemsetCacheLimit@:
+ mov rax, [MemsetCacheLimit]
+ test rax, rax
+ jnz U200
+ ; Get half the size of the largest level cache
+%ifdef WINDOWS
+ xor ecx, ecx ; 0 means largest level cache
+%else
+ xor edi, edi ; 0 means largest level cache
+%endif
+ call DataCacheSize ; get cache size
+ shr eax, 1 ; half the size
+ jnz U100
+ mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
+U100: mov [MemsetCacheLimit], eax
+U200: ret
+
+; extern "C" void SetMemsetCacheLimit(); // Change limit in GetMemsetCacheLimit
+SetMemsetCacheLimit:
+%ifdef WINDOWS
+ mov rax, rcx
+%else
+ mov rax, rdi
+%endif
+ test rax, rax
+ jnz U400
+ ; zero, means default
+ mov [MemsetCacheLimit], rax
+ call GetMemsetCacheLimit@
+U400: mov [MemsetCacheLimit], rax
+ ret
+
+
+SECTION .data
+align 16
+; Jump table for count from 0 to 16:
+MemsetJTab:DQ M00, M01, M02, M03, M04, M05, M06, M07
+ DQ M08, M09, M10, M11, M12, M13, M14, M15, M16
+
+; Pointer to appropriate version.
+; This initially points to memsetCPUDispatch. memsetCPUDispatch will
+; change this to the appropriate version of memset, so that
+; memsetCPUDispatch is only executed once:
+memsetDispatch: DQ memsetCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > MemsetCacheLimit
+; The optimal value of MemsetCacheLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache
+MemsetCacheLimit: DQ 0
diff --git a/contrib/libs/asmlib/mersenne64.asm b/contrib/libs/asmlib/mersenne64.asm new file mode 100644 index 0000000000..758075d61d --- /dev/null +++ b/contrib/libs/asmlib/mersenne64.asm @@ -0,0 +1,616 @@ +%include "defs.asm" + +; ----------------------------- MERSENNE64.ASM ---------------------------
+; Author: Agner Fog
+; Date created: 1998
+; Last modified: 2013-09-13
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+; Description:
+; Random Number generator 'Mersenne Twister' type MT11213A (or MT19937)
+;
+;
+; This random number generator is described in the article by
+; M. Matsumoto & T. Nishimura, in:
+; ACM Transactions on Modeling and Computer Simulation,
+; vol. 8, no. 1, 1998, pp. 3-30. See also:
+; http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+;
+; Initialization:
+; MersRandomInit must be called before the first call to any of the other
+; random number functions. The seed is any 32-bit integer.
+; You may use MersRandomInitByArray instead if you want more
+; than 32 bits for seed. length is the number of integers in seeds[].
+; length must be > 0, there is no upper limit for length.
+;
+; Generating random numbers:
+; MersRandom returns a floating point number in the interval 0 <= x < 1 with
+; a resolution of 32 bits.
+; MersIRandom returns an integer in the interval defined by min and max with
+; a resolution of 32 bits.
+; MersIRandomX returns an integer in the interval defined by min and max with
+; exactly equal probabilities of all values in the interval.
+; MersBRandom returns 32 random bits.
+;
+; Error conditions:
+; If MersRandomInit or MersRandomInitByArray has not been called then MersRandom
+; and MersBRandom keep returning 0, and MersIRandom and MersIRandomX return min.
+; MersIRandom and MersIRandomX return a large negative number if max < min.
+;
+; C++ prototypes in randoma.h:
+; Thread-safe versions:
+; extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
+; extern "C" void MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length); // Seed by more than 32 bits
+; extern "C" int MersIRandom (void * Pthis, int min, int max); // Output random integer
+; extern "C" int MersIRandomX(void * Pthis, int min, int max); // Output random integer, exact
+; extern "C" double MersRandom(void * Pthis); // Output random float
+; extern "C" unsigned int MersBRandom(void * Pthis); // Output random bits
+;
+; Single-threaded versions:
+; extern "C" void MersenneRandomInit(int seed); // Re-seed
+; extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length); // Seed by more than 32 bits
+; extern "C" int MersenneIRandom (int min, int max); // Output random integer
+; extern "C" int MersenneIRandomX(int min, int max); // Output random integer, exact
+; extern "C" double MersenneRandom(); // Output random float
+; extern "C" unsigned int MersenneBRandom(); // Output random bits
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+global MersenneRandomInit, MersenneRandomInitD, MersRandomInit
+global MersenneRandomInitByArray, MersenneRandomInitByArrayD, MersRandomInitByArray
+global MersenneBRandom, MersenneBRandomD, MersBRandom
+global MersenneRandom, MersenneRandomD, MersRandom
+global MersenneIRandom, MersenneIRandomD, MersIRandom
+global MersenneIRandomX, MersenneIRandomXD, MersIRandomX
+
+
+section .data
+align 16
+
+; Data for single instance of random number generator
+MersenneInstance: ISTRUC CRandomMersenneA
+IEND
+; Size of structure
+MersenneSize equ $ - MersenneInstance
+
+
+SECTION .CODE ALIGN=16
+
+MersenneRandomInit: ; PROC
+%IFDEF UNIX
+ mov edx, edi ; seed
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersRandomInit
+%ENDIF
+%IFDEF WINDOWS
+MersenneRandomInitD: ; alias
+ mov edx, ecx ; seed
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ ;jmp ?Windows_MersRandomInit
+%ENDIF
+;MersenneRandomInit ENDP
+
+
+; Thread-safe version:
+; extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
+MersRandomInit: ; PROC
+%IFDEF UNIX
+ ; translate calling convention
+ mov edx, esi ; seed
+ mov rcx, rdi ; Pthis
+%ENDIF
+ ; parameters: rcx = Pthis, edx = seed
+ and rcx, -16 ; align buffer
+ ?Windows_MersRandomInit:
+ call Mers_init0 ; initialize mt buffer with seeds
+
+ ; Number of premade numbers that are lost in the initialization when the
+ ; SSE2 implementation makes up to 4 premade numbers at a time:
+%IF MERS_N & 3
+ PREMADELOST equ (MERS_N & 3)
+%ELSE
+ PREMADELOST equ 4
+%ENDIF
+ ; We want the C++ and the assembly implementation to give exactly the same
+ ; sequence. The C++ version discards 37 random numbers after initialization.
+ ; The assembly version generates a sequence that is PREMADELOST + 1 numbers
+ ; behind. Therefore we discard the first 37 + PREMADELOST + 1 numbers if
+ ; SSE2 is supported, otherwise 37 + 1.
+
+ push rbx
+ mov ebx, 37+PREMADELOST+1
+ ; CMP dword [rcx+CRandomMersenneA.Instset], 4 ; can we use XMM registers and SSE2 ?
+ ; jae M110
+ ; sub ebx, PREMADELOST ; SSE2 not supported
+ ; mov dword [rcx+CRandomMersenneA.PreInx], 0 ; reset index to premade list
+M110: ; loop
+M120: call ?Windows_MersBRandom
+ dec ebx
+ jnz M120
+ pop rbx
+ ret
+;MersRandomInit ENDP
+
+
+Mers_init0: ; make random seeds from eax and put them into MT buffer
+; Input parameters:
+; rcx points to CRandomMersenneA
+; edx: seed
+; rcx unchanged by procedure
+
+ push rdi
+ ; clear my buffer
+ push rcx
+ mov rdi, rcx ; Pthis
+ add rdi, 16
+ mov ecx, (MersenneSize - 16) / 4
+ xor eax, eax
+ cld
+ rep stosd
+ pop rcx ; Pthis
+ mov edi, edx ; seed
+
+ ; initialize CRandomMersenneA structure
+ mov dword [rcx+CRandomMersenneA.PreInx], 4*4
+ mov dword [rcx+CRandomMersenneA.Instset], 4
+ mov eax, MERS_B
+ mov [rcx+CRandomMersenneA.TMB], eax
+ mov [rcx+CRandomMersenneA.TMB+4], eax
+ mov [rcx+CRandomMersenneA.TMB+8], eax
+ mov [rcx+CRandomMersenneA.TMB+12], eax
+ mov eax, MERS_C
+ mov [rcx+CRandomMersenneA.TMC], eax
+ mov [rcx+CRandomMersenneA.TMC+4], eax
+ mov [rcx+CRandomMersenneA.TMC+8], eax
+ mov [rcx+CRandomMersenneA.TMC+12], eax
+ mov eax, 3FF00000H ; upper dword of 1.0, double precision
+ mov [rcx+CRandomMersenneA.one+4], eax
+ mov [rcx+CRandomMersenneA.one+12], eax
+ mov dword [rcx+CRandomMersenneA.LMASK], LOWER_MASK
+ mov dword [rcx+CRandomMersenneA.UMASK], UPPER_MASK
+ mov dword [rcx+CRandomMersenneA.MATA], MERS_A
+
+ ; put random numbers into MT buffer
+ xor eax, eax
+M210: mov [rcx+rax*4+CRandomMersenneA.MT], edi
+ mov edx, edi
+ shr edi, 30
+ xor edi, edx
+ imul edi, 1812433253
+ inc eax
+ add edi, eax
+ cmp eax, MERS_N
+ jb M210
+
+ ; Set index MTI to end of list, (scaled by 4)
+ ; Round up to multiple of 4 to avoid alignment error
+ mov dword [rcx+CRandomMersenneA.MTI], ((MERS_N+3) & (-4)) * 4
+
+ pop rdi
+ ret
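The M210 loop in Mers_init0 is the standard Mersenne Twister seeding recurrence mt[i] = 1812433253 * (mt[i-1] ^ (mt[i-1] >> 30)) + i. A direct C++ transcription; N stands for MERS_N, which is defined in randomah.asi (624 for the MT19937 variant):

    #include <cstddef>
    #include <cstdint>

    // Standard MT seeding recurrence, as performed by the M210 loop in Mers_init0.
    template <std::size_t N>
    void mt_seed(std::uint32_t (&mt)[N], std::uint32_t seed)
    {
        mt[0] = seed;
        for (std::size_t i = 1; i < N; ++i)
            mt[i] = 1812433253u * (mt[i - 1] ^ (mt[i - 1] >> 30))
                  + static_cast<std::uint32_t>(i);
    }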
+
+
+; Single threaded version:
+; extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length);
+
+MersenneRandomInitByArray: ; PROC ; entry for Linux call
+%IFDEF UNIX
+ mov r8d, esi ; length
+ mov rdx, rdi ; seeds
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersRandomInitByArray
+%ENDIF
+%IFDEF WINDOWS
+MersenneRandomInitByArrayD: ; LABEL NEAR ; alias
+ mov r8d, edx ; length
+ mov rdx, rcx ; seeds
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersRandomInitByArray
+%ENDIF
+;MersenneRandomInitByArray ENDP
+
+; Thread-safe version:
+; extern "C" int MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length);
+MersRandomInitByArray: ; PROC
+%IFDEF UNIX
+ ; translate calling convention
+ mov r8d, edx ; length
+ mov rdx, rsi ; seeds
+ mov rcx, rdi ; Pthis
+%ENDIF
+
+?Windows_MersRandomInitByArray:
+; parameters: rcx = Pthis, rdx = seeds, r8d = length
+
+ and rcx, -16 ; align buffer
+ push rbx
+ push rsi
+ push rdi
+ push rbp
+ mov rbx, rdx ; seeds
+ mov ebp, r8d ; length
+
+ mov edx, 19650218
+ call Mers_init0 ; init0(19650218); (rcx unchanged)
+
+ mov r8d, ebp ; r8d = length, ebp = k
+ test ebp, ebp
+ jle M380 ; error: length <= 0
+ xor edi, edi ; j = 0
+ lea esi, [rdi+1] ; i = 1
+ cmp ebp, MERS_N
+ ja M310
+ mov ebp, MERS_N ; k = max (MERS_N,length)
+M310:
+
+ ; for (; k; k--) {
+M320: mov eax, [rcx+rsi*4-4+CRandomMersenneA.MT] ; mt[i-1]
+ mov edx, eax
+ shr eax, 30
+ xor eax, edx ; mt[i-1] ^ (mt[i-1] >> 30)
+ imul eax, 1664525 ; * 1664525
+ xor eax, [rcx+rsi*4+CRandomMersenneA.MT] ; ^ mt[i]
+ add eax, [rbx+rdi*4] ; + seeds[j]
+ add eax, edi ; + j
+ mov [rcx+rsi*4+CRandomMersenneA.MT], eax ; save in mt[i]
+ inc esi ; i++
+ inc edi ; j++
+ cmp esi, MERS_N
+ jb M330 ; if (i>=MERS_N)
+ mov eax, [rcx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
+ mov [rcx+CRandomMersenneA.MT], eax
+ mov esi, 1 ; i=1;
+M330:
+ cmp edi, r8d ; length
+ jb M340 ; if (j>=length)
+ xor edi, edi ; j = 0;
+M340:
+ dec ebp ; k--
+ jnz M320 ; first k loop
+M350:
+ mov ebp, MERS_N-1 ; k
+M360: mov eax, [rcx+rsi*4-4+CRandomMersenneA.MT] ; mt[i-1]
+ mov edx, eax
+ shr eax, 30
+ xor eax, edx ; mt[i-1] ^ (mt[i-1] >> 30)
+ imul eax, 1566083941 ; * 1566083941
+ xor eax, [rcx+rsi*4+CRandomMersenneA.MT] ; ^ mt[i]
+ sub eax, esi ; - i
+ mov [rcx+rsi*4+CRandomMersenneA.MT], eax ; save in mt[i]
+ inc esi ; i++
+ cmp esi, MERS_N
+ jb M370 ; if (i>=MERS_N)
+ mov eax, [rcx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
+ mov [rcx+CRandomMersenneA.MT], eax
+ mov esi, 1 ; i=1;
+M370:
+ dec ebp ; k--
+ jnz M360 ; second k loop
+ mov dword [rcx+CRandomMersenneA.MT], 80000000H ; mt[0] = 0x80000000
+M380:
+ mov dword [rcx+CRandomMersenneA.MTI], 0
+ mov dword [rcx+CRandomMersenneA.PreInx], 0
+
+; discard first MERS_N random numbers + PREMADELOST+1 to compensate for lag
+ mov edi, MERS_N + PREMADELOST+1
+M391: call ?Windows_MersBRandom
+ dec edi
+ jnz M391
+
+ pop rbp ; restore registers
+ pop rdi
+ pop rsi
+ pop rbx
+ ret
+;MersRandomInitByArray ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneBRandom(); // Output random bits
+
+MersenneBRandom: ; PROC ; entry for both Windows and Linux call
+%IFDEF WINDOWS
+MersenneBRandomD: ; LABEL NEAR ; alias
+%ENDIF
+ lea rcx, [MersenneInstance] ; Point to instance
+ jmp ?Windows_MersBRandom
+;MersenneBRandom ENDP
+
+; Thread-safe version:
+; extern "C" unsigned int MersBRandom(void * Pthis); // Output random bits
+
+MersBRandom: ; PROC
+%IFDEF UNIX
+ mov rcx, rdi ; translate calling convention
+%ENDIF
+
+?Windows_MersBRandom: ; LABEL NEAR ; Label used internally
+ and rcx, -16 ; align buffer
+ mov edx, [rcx+CRandomMersenneA.PreInx] ; index into premade numbers
+ mov eax, [rcx+rdx*1+CRandomMersenneA.PreInt] ; fetch premade random number
+ add edx, 4
+ mov [rcx+CRandomMersenneA.PreInx], edx
+ cmp edx, 4*4
+ jnb M410
+ ret ; return premade number
+
+M410:
+; PREMADE list is empty. Make 4 more numbers ready for next call:
+ mov edx, [rcx+CRandomMersenneA.MTI] ; fetch 4 numbers from MT buffer
+ movdqa xmm0, oword [rcx+rdx*1+CRandomMersenneA.MT]
+
+%IF TEMPERING ; optional tempering algorithm
+ movdqa xmm1, xmm0
+ psrld xmm0, MERS_U
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ pslld xmm0, MERS_S
+ pand xmm0, oword [rcx+CRandomMersenneA.TMB]
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ pslld xmm0, MERS_T
+ pand xmm0, oword [rcx+CRandomMersenneA.TMC]
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ psrld xmm0, MERS_L
+ pxor xmm0, xmm1
+%ENDIF ; tempering
+
+ ; save four premade integers
+ movdqa oword [rcx+CRandomMersenneA.PreInt], xmm0
+ ; premake four floating point numbers
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ punpckldq xmm1, xmm0 ; get first two numbers into bits 32-63 and 96-127
+ punpckhdq xmm2, xmm0 ; get next two numbers into bits 32-63 and 96-127
+ psrlq xmm1, 12 ; get bits into mantissa position
+ psrlq xmm2, 12 ; get bits into mantissa position
+ por xmm1,oword[rcx+CRandomMersenneA.one] ; set exponent for interval [1,2)
+ por xmm2,oword[rcx+CRandomMersenneA.one] ; set exponent for interval [1,2)
+ movdqa oword [rcx+CRandomMersenneA.PreFlt], xmm1 ; store two premade numbers
+ movdqa oword [rcx+CRandomMersenneA.PreFlt+16],xmm2; store two more premade numbers
+ mov dword [rcx+CRandomMersenneA.PreInx], 0 ; index to premade numbers
+ add edx, 4*4 ; increment MTI index into MT buffer by 4
+ mov [rcx+CRandomMersenneA.MTI], edx
+ cmp edx, MERS_N*4
+ jae M420
+ ret ; return random number in eax
+
+; MT buffer exhausted. Make MERS_N new numbers ready for next time
+M420: ; eax is the random number to return
+%IF MERS_N & 3 ; if MERS_N is not divisible by 4
+ NVALID equ MERS_N & 3 ; only NVALID of the 4 premade numbers are valid
+ ; Move premade numbers (4-NVALID) positions forward
+ movdqa xmm0, [rcx+CRandomMersenneA.PreInt]
+ movdqa xmm1, [rcx+CRandomMersenneA.PreFlt]
+ movdqa xmm2, [rcx+CRandomMersenneA.PreFlt+16]
+ movdqu [rcx+CRandomMersenneA.PreInt + (4-NVALID)*4], xmm0
+ movdqu [rcx+CRandomMersenneA.PreFlt + (4-NVALID)*8], xmm1
+%IF NVALID == 3
+ movq [rcx+CRandomMersenneA.PreFlt+16 + 8], xmm2
+%ENDIF
+ ; save index to first valid premade number
+ mov [rcx+CRandomMersenneA.PreInx], (4-NVALID)*4
+%ENDIF
+
+; MT buffer is empty. Fill it up
+ push rbx
+ movd xmm3, [rcx+CRandomMersenneA.UMASK] ; load constants
+ movd xmm4, [rcx+CRandomMersenneA.LMASK]
+ movd xmm5, [rcx+CRandomMersenneA.MATA]
+ pshufd xmm3, xmm3, 0 ; broadcast constants
+ pshufd xmm4, xmm4, 0
+ pshufd xmm5, xmm5, 0
+ xor rbx, rbx ; kk = 0
+ mov edx, MERS_M*4 ; km
+
+; change rcx from pointing to CRandomMersenneA to pointing to CRandomMersenneA.MT
+ add rcx, CRandomMersenneA.MT
+
+M430: ; kk loop
+ movdqa xmm2, [rcx+rbx] ; mt[kk]
+ movd xmm0, dword [rcx+rbx+16]
+ movdqa xmm1, [rcx+rbx] ; mt[kk]
+ movss xmm2, xmm0 ; faster than movdqu xmm2,[]
+ pshufd xmm2, xmm2, 00111001B ; mt[kk+1]
+ movdqu xmm0, oword [rcx+rdx] ; mt[km]
+ ;movq xmm0, qword [rcx+rdx] ; mt[km]
+ ;movhps xmm0, qword [rcx+rdx+8] ; faster than movdqu on older processors
+ pand xmm1, xmm3 ; mt[kk] & UPPER_MASK
+ pand xmm2, xmm4 ; mt[kk+1] & LOWER_MASK
+ por xmm1, xmm2 ; y
+ movdqa xmm2, xmm1 ; y
+ pslld xmm1, 31 ; copy bit 0 into all bits
+ psrad xmm1, 31 ; -(y & 1)
+ pand xmm1, xmm5 ; & MERS_A
+ psrld xmm2, 1 ; y >> 1
+ pxor xmm0, xmm1
+ pxor xmm0, xmm2
+ movdqa [rcx+rbx], xmm0 ; result into mt[kk]
+ cmp ebx, (MERS_N-4)*4
+ jae M440 ; exit loop when kk past end of buffer
+ add ebx, 16 ; kk += 4
+ add rdx, 16 ; km += 4 (signed)
+ cmp edx, (MERS_N-4)*4
+ jbe M430 ; skip unless km wraparound
+ sub rdx, MERS_N*4 ; km wraparound (signed)
+ movdqu xmm0, [rcx+(MERS_N-4)*4] ; copy end to before begin for km wraparound
+ movdqa [rcx-4*4], xmm0
+ movdqa xmm0, [rcx] ; copy begin to after end for kk wraparound
+ movdqu [rcx+MERS_N*4], xmm0
+ jmp M430
+
+M440: ; loop finished. discard excess part of last result
+
+; change ecx back to pointing to CRandomMersenneA
+ sub rcx, CRandomMersenneA.MT
+
+ mov dword [rcx+CRandomMersenneA.MTI], 0
+ pop rbx
+ ret ; random number is still in eax
+
+;MersBRandom ENDP
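+
+; For reference, a scalar C sketch of what the SSE2 code above computes per
+; 32-bit number (MERS_* constants and masks as defined in randomah.asi);
+; illustrative only, not part of the library:
+;
+;   // buffer refill (the M430 loop, four numbers per iteration in the asm):
+;   uint32_t y = (mt[kk] & UPPER_MASK) | (mt[kk+1] & LOWER_MASK);
+;   mt[kk] = mt[kk+MERS_M] ^ (y >> 1) ^ ((y & 1) ? MERS_A : 0);
+;   // tempering of each number fetched from the buffer (top of the routine,
+;   // only if TEMPERING is enabled):
+;   y ^= y >> MERS_U;
+;   y ^= (y << MERS_S) & MERS_B;
+;   y ^= (y << MERS_T) & MERS_C;
+;   y ^= y >> MERS_L;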
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneRandom(); // Get floating point random number
+
+MersenneRandom: ; PROC ; entry for both Windows and Linux call
+%IFDEF WINDOWS
+MersenneRandomD: ; alias
+ lea rcx, [MersenneInstance] ; Point to instance
+ ; continue in next function
+%ENDIF
+%IFDEF UNIX
+ lea rdi, [MersenneInstance] ; Point to instance
+ ; continue in next function
+%ENDIF
+
+; Thread-safe version:
+; extern "C" double MersRandom(void * Pthis); // Get floating point random number
+MersRandom:
+%IFDEF UNIX
+ mov rcx, rdi ; translate calling convention
+%ENDIF
+ mov edx, [rcx+CRandomMersenneA.PreInx] ; index into premade numbers
+ movsd xmm0, [rcx+rdx*2+CRandomMersenneA.PreFlt] ; fetch premade floating point random number
+ subsd xmm0, [rcx+CRandomMersenneA.one] ; subtract 1.0
+ movsd [rcx+CRandomMersenneA.TmpFlt], xmm0 ; store random number
+ call ?Windows_MersBRandom ; prepare next random number
+ movsd xmm0, [rcx+CRandomMersenneA.TmpFlt] ; recall random number
+ ret
+;MersenneRandom ENDP
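+
+; The integer-to-float conversion used here (PreFlt is prepared in MersBRandom,
+; the 1.0 is subtracted above) follows this pattern; a C++ sketch with a
+; hypothetical helper name, illustrative only:
+;
+;   #include <cstdint>
+;   #include <cstring>
+;   double BitsToDouble(uint32_t x) {
+;       uint64_t bits = ((uint64_t)x << 32) >> 12; // bits into mantissa field
+;       bits |= 0x3FF0000000000000ull;             // exponent of 1.0 -> [1,2)
+;       double d;
+;       std::memcpy(&d, &bits, sizeof d);
+;       return d - 1.0;                            // uniform in [0,1)
+;   }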
+
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneIRandom(int min, int max); // Get integer random number in desired interval
+
+MersenneIRandom: ; PROC
+%IFDEF UNIX
+ push rsi ; max
+ push rdi ; min
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp MersIRandom_max_min_on_stack
+%ENDIF
+%IFDEF WINDOWS
+MersenneIRandomD: ; Alias
+ push rdx ; max
+ push rcx ; min
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp MersIRandom_max_min_on_stack
+%ENDIF
+;MersenneIRandom ENDP
+
+; Thread-safe version:
+; extern "C" int MersIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+MersIRandom: ; PROC
+%IFDEF UNIX
+ ; translate calling convention
+ mov r8d, edx ; max
+ mov edx, esi ; min
+ mov rcx, rdi ; Pthis
+%ENDIF
+ push r8 ; max
+ push rdx ; min
+MersIRandom_max_min_on_stack:
+
+ call ?Windows_MersBRandom ; random bits
+ pop rcx ; min
+ pop rdx ; max
+ sub edx, ecx
+ js short M720 ; max < min
+ add edx, 1 ; interval = max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [rdx+rcx] ; add min
+ ret
+M720: mov eax, 80000000H ; error exit
+ ret
+;MersIRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneIRandomX(int min, int max); // Get integer random number in desired interval
+
+MersenneIRandomX: ; PROC
+%IFDEF UNIX
+ mov r8d, esi ; max
+ mov edx, edi ; min
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersIRandomX
+%ENDIF
+%IFDEF WINDOWS
+MersenneIRandomXD: ; alias
+ mov r8d, edx ; max
+ mov edx, ecx ; min
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersIRandomX
+%ENDIF
+;MersenneIRandomX ENDP
+
+; Thread-safe version:
+; extern "C" int MersIRandomX(void * Pthis, int min, int max); // Get integer random number in desired interval
+MersIRandomX: ; PROC
+%IFDEF UNIX
+ ; translate calling convention
+ mov r8d, edx ; max
+ mov edx, esi ; min
+ mov rcx, rdi ; Pthis
+%ENDIF
+
+?Windows_MersIRandomX:
+; parameters: rcx = Pthis, edx = min, r8d = max
+
+ and rcx, -16 ; align buffer
+ push rdi
+ mov edi, r8d ; max
+
+ sub edi, edx ; max - min
+ jle short M830 ; max <= min (signed)
+ inc edi ; interval = max - min + 1
+ push rdx ; save min
+
+ ; if (interval != LastInterval) {
+ cmp edi, [rcx+CRandomMersenneA.LastInterval]
+ je M810
+ ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
+ xor eax, eax ; 0
+ lea edx, [rax+1] ; 1
+ div edi ; (would give overflow if interval = 1)
+ mul edi
+ dec eax
+ mov [rcx+CRandomMersenneA.RLimit], eax
+ mov [rcx+CRandomMersenneA.LastInterval], edi
+M810:
+M820: ; do { // Rejection loop
+ call ?Windows_MersBRandom ; random bits (rcx is preserved)
+ ; longran = (uint64)BRandom() * interval;
+ mul edi
+ ; } while (remainder > RLimit);
+ cmp eax, [rcx+CRandomMersenneA.RLimit]
+ ja M820
+
+ ; return (int32)iran + min
+ pop rax ; min
+ add eax, edx
+ pop rdi
+ ret
+
+M830: jl M840
+ ; max = min. Return min
+ mov eax, edx
+ pop rdi
+ ret ; max = min exit
+
+M840: ; max < min: error
+ mov eax, 80000000H ; error exit
+ pop rdi
+ ret
+;MersIRandomX ENDP
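+
+; The rejection method used by ?Windows_MersIRandomX above, gathered into one
+; C sketch (BRandom() stands for a call to the underlying 32-bit generator);
+; illustrative only:
+;
+;   uint32_t interval = (uint32_t)(max - min + 1);
+;   uint32_t rlimit   = (uint32_t)((1ull << 32) / interval) * interval - 1;
+;   uint64_t prod;
+;   do {
+;       prod = (uint64_t)BRandom() * interval;     // 32x32 -> 64 bit product
+;   } while ((uint32_t)prod > rlimit);             // reject to remove modulo bias
+;   return (int)(prod >> 32) + min;                // high dword is the result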
diff --git a/contrib/libs/asmlib/mother64.asm b/contrib/libs/asmlib/mother64.asm
new file mode 100644
index 0000000000..c6fd34ec3b
--- /dev/null
+++ b/contrib/libs/asmlib/mother64.asm
@@ -0,0 +1,242 @@
+%include "defs.asm"
+
+; ----------------------------- MOTHER64.ASM -----------------------------
+; Author: Agner Fog
+; Date created: 1998
+; Last modified: 2013-09-11
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+; Description:
+; Mother-of-All random number generator by Agner Fog
+; 64-bit mode version for x86-64 compatible microprocessors.
+;
+; This is a multiply-with-carry type of random number generator
+; invented by George Marsaglia. The algorithm is:
+; S = 2111111111*X[n-4] + 1492*X[n-3] + 1776*X[n-2] + 5115*X[n-1] + C
+; X[n] = S modulo 2^32
+; C = floor(S / 2^32)
+;
+; C++ prototypes:
+; extern "C" void MotRandomInit(void * Pthis, int seed); // Initialization
+; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+; extern "C" double MotRandom(void * Pthis); // Get floating point random number
+; extern "C" unsigned int MotBRandom(void * Pthis); // Output random bits
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
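+
+; A scalar C sketch of one step of the recurrence above, with x[0] = X[n-4],
+; ..., x[3] = X[n-1] and c = carry; the SSE2 code in MotBRandom below computes
+; the same sum with two pmuludq/paddq pairs. Illustrative only:
+;
+;   uint64_t S = 2111111111ull*x[0] + 1492ull*x[1] + 1776ull*x[2]
+;              + 5115ull*x[3] + c;
+;   x[0] = x[1]; x[1] = x[2]; x[2] = x[3];
+;   x[3] = (uint32_t)S;           // X[n] = S modulo 2^32
+;   c    = (uint32_t)(S >> 32);   // C    = floor(S / 2^32)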
+
+default rel
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+; publics:
+global MotherBRandom, MotBRandom, ?Windows_MotBRandom
+global MotherRandom, MotRandom, MotherIRandom, MotIRandom
+global MotherRandomInit, MotRandomInit
+
+section .data
+align 16
+
+; Data for single instance of random number generator
+MotherInstance: ISTRUC CRandomMotherA
+IEND
+; Size of structure
+MotherSize equ $-MotherInstance
+
+
+SECTION .CODE ALIGN=16 ; code segment
+
+; Single threaded version:
+; extern "C" unsigned int MotherBRandom(); // Output random bits
+
+MotherBRandom: ; PROC ; entry for both Windows and Linux call
+ lea rcx, [MotherInstance] ; Point to instance
+ jmp ?Windows_MotBRandom
+;MotherBRandom ENDP
+
+; Thread-safe version:
+; extern "C" unsigned int MotBRandom(void * Pthis); // Output random bits
+
+MotBRandom: ; PROC
+%IFDEF UNIX
+ mov rcx, rdi ; translate calling convention
+%ENDIF
+?Windows_MotBRandom:
+ and rcx, -16 ; align
+ movdqa xmm1, oword [rcx+CRandomMotherA.M3] ; load M3,M2,M1,M0
+ mov eax, [rcx+CRandomMotherA.M0] ; Retrieve previous random number
+ movdqa xmm2, xmm1 ; copy
+ movdqa xmm3, oword [rcx+CRandomMotherA.MF3] ; factors
+ psrlq xmm2, 32 ; move M2,M0 down
+ movq qword [rcx+CRandomMotherA.M4], xmm1 ; M4=M3, M3=M2
+ movhps qword [rcx+CRandomMotherA.M2], xmm1 ; M2=M1, M1=M0
+ pmuludq xmm1, xmm3 ; M3*MF3, M1*MF1
+ psrlq xmm3, 32 ; move MF2,MF0 down
+ pmuludq xmm2, xmm3 ; M2*MF2, M0*MF0
+ paddq xmm1, xmm2 ; P2+P3, P0+P1
+ movhlps xmm2, xmm1 ; Get high qword
+ paddq xmm1, xmm2 ; P0+P1+P2+P3
+ paddq xmm1, oword [rcx+CRandomMotherA.MC] ; +carry
+ movq qword [rcx+CRandomMotherA.M0], xmm1 ; Store new M0 and carry
+ ; convert to double precision float
+ psllq xmm1, 32 ; Discard carry bits
+ psrlq xmm1, 12 ; Get bits into mantissa position
+ por xmm1, oword [rcx+CRandomMotherA.one] ; Add exponent bits to get number in interval [1,2)
+ movq [rcx+CRandomMotherA.RanP1], xmm1 ; Store floating point number
+ ret
+
+;MotBRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MotherRandom(); // Get floating point random number
+
+MotherRandom:
+%IFDEF UNIX
+ lea rdi, [MotherInstance] ; Point to instance
+%ENDIF
+%IFDEF WINDOWS
+ lea rcx, [MotherInstance] ; Point to instance
+%ENDIF
+
+; Thread-safe version:
+; extern "C" double MotRandom(void * Pthis); // Get floating point random number
+MotRandom:
+%IFDEF UNIX
+ mov rcx, rdi ; translate calling convention
+%ENDIF
+ and rcx, -16 ; align
+ ; get previously prepared random number
+ movsd xmm0, [rcx+CRandomMotherA.RanP1]
+ subsd xmm0, [rcx+CRandomMotherA.one]
+
+ ; make new random number ready for next time
+ call ?Windows_MotBRandom
+ ret
+;MotherRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MotherIRandom(int min, int max); // Get integer random number in desired interval
+
+MotherIRandom: ; PROC
+%IFDEF UNIX
+ mov r8d, esi ; max
+ mov edx, edi ; min
+ lea rcx, [MotherInstance] ; Pthis = point to instance
+ jmp ?Windows_MotIRandom
+%ENDIF
+%IFDEF WINDOWS
+ mov r8d, edx ; max
+ mov edx, ecx ; min
+ lea rcx, [MotherInstance] ; Pthis = point to instance
+ jmp ?Windows_MotIRandom
+%ENDIF
+; MotherIRandom ENDP
+
+; Thread-safe version:
+; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+MotIRandom:
+%IFDEF UNIX
+ ; translate calling convention
+ mov r8d, edx ; max
+ mov edx, esi ; min
+ mov rcx, rdi ; Pthis
+%ENDIF
+
+?Windows_MotIRandom: ; LABEL NEAR ; entry for Windows call
+ and rcx, -16 ; align
+ push r8
+ push rdx
+ call ?Windows_MotBRandom ; make random number
+ pop rcx ; min
+ pop r8 ; max
+ sub r8d, ecx
+ js short rerror ; max < min
+ inc r8d ; interval = max - min + 1
+ mul r8d ; multiply random number eax by interval and truncate
+ lea eax, [rdx+rcx] ; add min to interval*BRandom >> 32
+ ret ; ret 8 if not _cdecl calling
+
+rerror: mov eax, 80000000h ; error exit
+ ret ; ret 8 if not _cdecl calling
+;MotIRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MotherRandomInit(int seed); // Initialization
+
+MotherRandomInit: ; PROC
+%IFDEF UNIX
+ mov edx, edi ; seed
+ lea rcx, [MotherInstance] ; Pthis = point to instance
+ jmp ?Windows_MotRandomInit
+%ENDIF
+%IFDEF WINDOWS
+ mov edx, ecx ; seed
+ lea rcx, [MotherInstance] ; Pthis = point to instance
+ jmp ?Windows_MotRandomInit
+%ENDIF
+;MotherRandomInit ENDP
+
+; Thread-safe version:
+; extern "C" void MotRandomInit(void * Pthis, int seed); // Initialization
+MotRandomInit: ; PROC
+%IFDEF UNIX
+ ; translate calling convention
+ mov edx, esi ; seed
+ mov rcx, rdi ; Pthis
+%ENDIF
+
+?Windows_MotRandomInit: ; LABEL NEAR ; entry for Windows call
+ and rcx, -16 ; align
+ ; clear my buffer
+ push rdi
+ push rcx
+ mov rdi, rcx ; Pthis
+ add rdi, 16
+ mov ecx, (MotherSize - 16) / 4
+ xor eax, eax
+ cld
+ rep stosd
+ pop rcx
+
+ ; insert constants
+ mov dword [rcx+CRandomMotherA.one+4], 3FF00000H ; high dword of 1.0
+ mov dword [rcx+CRandomMotherA.MF0], 5115 ; factors
+ mov dword [rcx+CRandomMotherA.MF1], 1776
+ mov dword [rcx+CRandomMotherA.MF2], 1492
+ mov dword [rcx+CRandomMotherA.MF3], 2111111111
+
+ ; initialize from seed
+ mov eax, edx ; seed
+ ; make random numbers and put them into buffer
+ mov edx, 29943829
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.M0], eax
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.M1], eax
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.M2], eax
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.M3], eax
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.MC], eax
+
+ ; randomize some more
+ mov edi, 20 ; loop counter
+r90: call ?Windows_MotBRandom ; (rcx and rdi unchanged)
+ dec edi
+ jnz r90
+ pop rdi
+ ret
+;MotRandomInit ENDP
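+
+; The seeding above, as a C sketch (state[0..3] = M0..M3, state[4] = carry MC);
+; illustrative only:
+;
+;   uint32_t s = (uint32_t)seed;
+;   for (int i = 0; i < 5; i++) {
+;       s = s * 29943829u - 1;
+;       state[i] = s;
+;   }
+;   for (int i = 0; i < 20; i++) MotBRandom(Pthis);   // randomize some more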
+
+ ; END
diff --git a/contrib/libs/asmlib/physseed64.asm b/contrib/libs/asmlib/physseed64.asm
new file mode 100644
index 0000000000..b30fc26712
--- /dev/null
+++ b/contrib/libs/asmlib/physseed64.asm
@@ -0,0 +1,396 @@
+%include "defs.asm"
+
+;************************* physseed64.asm **********************************
+; Author: Agner Fog
+; Date created: 2010-08-03
+; Last modified: 2013-09-13
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; C++ prototype:
+; extern "C" int PhysicalSeed(int seeds[], int NumSeeds);
+;
+; Description:
+; Generates a non-deterministic random seed from a physical random number generator
+; which is available on some processors.
+; Uses the time stamp counter (which is less random) if no physical random number
+; generator is available.
+; The code is not optimized for speed because it is typically called only once.
+;
+; Parameters:
+; int seeds[] An array which will be filled with random numbers
+; int NumSeeds Indicates the desired number of 32-bit random numbers
+;
+; Return value: 0 Failure. No suitable instruction available (processor older than Pentium)
+; 1 No physical random number generator. Used time stamp counter instead
+; 2 Success. VIA physical random number generator used
+; 3 Success. Intel physical random number generator used
+; 4 Success. Intel physical seed generator used
+;
+; The return value will indicate the availability of a physical random number generator
+; even if NumSeeds = 0.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
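+
+; Typical usage, as a C sketch (return codes as listed above); illustrative
+; only:
+;
+;   int seeds[2];
+;   int source = PhysicalSeed(seeds, 2);
+;   // source: 0 = no usable instruction, 1 = time stamp counter fallback,
+;   //         2 = VIA, 3 = RDRAND, 4 = RDSEED
+;   if (source == 0) { /* handle failure */ }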
+
+default rel
+
+%define NUM_TRIES 20 ; max number of tries for rdseed and rdrand instructions
+
+%define TESTING 0 ; 1 for test only
+
+global PhysicalSeed
+
+; Direct entries to CPU-specific versions
+global PhysicalSeedNone: function
+global PhysicalSeedRDTSC: function
+global PhysicalSeedVIA: function
+global PhysicalSeedRDRand: function
+global PhysicalSeedRDSeed: function
+
+; ***************************************************************************
+; Define registers used for function parameters, used in 64-bit mode only
+; ***************************************************************************
+
+%IFDEF WINDOWS
+ %define par1 rcx
+ %define par2 rdx
+ %define par3 r8
+ %define par1d ecx
+ %define par2d edx
+ %define par3d r8d
+%ENDIF
+
+%IFDEF UNIX
+ %define par1 rdi
+ %define par2 rsi
+ %define par3 rdx
+ %define par1d edi
+ %define par2d esi
+ %define par3d edx
+%ENDIF
+
+
+SECTION .text align=16
+
+%IFDEF WINDOWS
+global PhysicalSeedD@8 ; DLL version
+PhysicalSeedD@8:
+%ENDIF
+
+PhysicalSeed:
+ jmp [PhysicalSeedDispatch] ; Go to appropriate version, depending on instructions available
+
+
+PhysicalSeedRDSeed:
+ push rbx
+ test par2d, par2d ; NumSeeds
+ jz S300
+ js S900
+ mov par3d, par2d ; NumSeeds
+ shr par3d, 1
+ jz S150
+ ; do 64 bits at a time
+S100: mov ebx, NUM_TRIES
+S110: ; rdseed rax
+%if TESTING
+ mov eax, par3d
+ stc
+%ELSE
+ db 48h, 0Fh, 0C7h, 0F8h ; rdseed rax
+%ENDIF
+ jc S120
+ ; failed. try again
+ dec ebx
+ jz S900
+ jmp S110
+S120: mov [par1], rax
+ add par1, 8
+ dec par3d
+ jnz S100 ; loop 64 bits
+S150:
+ and par2d, 1
+ jz S300
+ ; an odd 32 bit remains
+S200: mov ebx, NUM_TRIES
+S210: ; rdseed rax
+%if TESTING
+ mov eax, par3d
+ stc
+%ELSE
+ db 0Fh, 0C7h, 0F8h ; rdseed eax
+%ENDIF
+ jc S220
+ ; failed. try again
+ dec ebx
+ jz S900
+ jmp S210
+S220: mov [par1], eax
+S300: mov eax, 4 ; return value
+ pop rbx
+ ret
+S900: ; failure
+ xor eax, eax ; return 0
+ pop rbx
+ ret
+
+
+PhysicalSeedRDRand:
+ push rbx
+ test par2d, par2d ; NumSeeds
+ jz R300
+ js R900
+ mov par3d, par2d ; NumSeeds
+ shr par3d, 1 ; NumSeeds/2
+ jz R150
+ ; do 64 bits at a time
+R100: mov ebx, NUM_TRIES
+R110: ; rdrand rax
+%if TESTING
+ mov eax, par3d
+ stc
+%ELSE
+ db 48h, 0Fh, 0C7h, 0F0h ; rdrand rax
+%ENDIF
+ jc R120
+ ; failed. try again
+ dec ebx
+ jz R900
+ jmp R110
+R120: mov [par1], rax
+ add par1, 8
+ dec par3d
+ jnz R100 ; loop 64 bits
+R150:
+ and par2d, 1
+ jz R300
+ ; an odd 32 bit remains
+R200: mov ebx, NUM_TRIES
+R210: ; rdrand eax
+%if TESTING
+ mov eax, par3d
+ stc
+%ELSE
+ db 0Fh, 0C7h, 0F0h ; rdrand eax
+%ENDIF
+ jc R220
+ ; failed. try again
+ dec ebx
+ jz R900
+ jmp R210
+R220: mov [par1], eax
+R300: mov eax, 4 ; return value
+ pop rbx
+ ret
+R900: ; failure
+ xor eax, eax ; return 0
+ pop rbx
+ ret
+
+
+PhysicalSeedVIA:
+; VIA XSTORE supported
+ push rbx
+%IFDEF WINDOWS
+ push rsi
+ push rdi
+ mov rdi, rcx ; seeds
+ mov esi, edx ; NumSeeds
+%ENDIF
+ mov ecx, esi ; NumSeeds
+ and ecx, -2 ; round down to nearest even
+ jz T200 ; NumSeeds <= 1
+ ; make an even number of random dwords
+ shl ecx, 2 ; number of bytes (divisible by 8)
+ mov edx, 3 ; quality factor
+%if TESTING
+ mov eax, 1
+ rep stosb
+%ELSE
+ db 0F3H, 00FH, 0A7H, 0C0H ; rep xstore instruction
+%ENDIF
+T200:
+ test esi, 1
+ jz T300
+ ; NumSeeds is odd. Make 8 bytes in temporary buffer and store 4 of the bytes
+ mov rbx, rdi ; current output pointer
+ mov ecx, 4 ; Will generate 4 or 8 bytes, depending on CPU
+ mov edx, 3 ; quality factor
+ push rcx ; make temporary space on stack
+ mov rdi, rsp ; point to buffer on stack
+%if TESTING
+ mov eax, 1
+ rep stosb
+%ELSE
+ db 0F3H, 00FH, 0A7H, 0C0H ; rep xstore instruction
+%ENDIF
+ pop rax
+ mov [rbx], eax ; store the last 4 bytes
+T300:
+ mov eax, 2 ; return value
+%IFDEF WINDOWS
+ pop rdi
+ pop rsi
+%ENDIF
+ pop rbx
+ ret
+
+
+PhysicalSeedRDTSC:
+%IFDEF WINDOWS
+ push rbx
+ push rcx
+ push rdx
+ xor eax, eax
+ cpuid ; serialize
+ rdtsc ; get time stamp counter
+ pop rbx ; numseeds
+ pop rcx ; seeds
+ test ebx, ebx
+ jz U300 ; zero seeds
+ js U900 ; failure
+ mov [rcx], eax ; store time stamp counter as seeds[0]
+ add rcx, 4
+ dec ebx
+ jz U300
+ mov [rcx], edx ; store upper part of time stamp counter as seeds[1]
+ add rcx, 4
+ dec ebx
+ jz U300
+ xor eax, eax
+U100: mov [rcx], eax ; store 0 for the rest
+ add rcx, 4
+ dec ebx
+ jnz U100
+U300: mov eax, 1 ; return value
+ pop rbx
+ ret
+U900: ; failure
+ xor eax, eax ; return 0
+ pop rbx
+ ret
+
+%ELSE ; UNIX
+
+ push rbx
+ xor eax, eax
+ cpuid ; serialize
+ rdtsc ; get time stamp counter
+ test esi, esi ; numseeds
+ jz U300 ; zero seeds
+ js U900 ; failure
+ mov [rdi], eax ; store time stamp counter as seeds[0]
+ add rdi, 4
+ dec esi
+ jz U300
+ mov [rdi], edx ; store upper part of time stamp counter as seeds[1]
+ add rdi, 4
+ dec esi
+ jz U300
+ xor eax, eax
+U100: mov [rdi], eax ; store 0 for the rest
+ add rdi, 4
+ dec esi
+ jnz U100
+U300: mov eax, 1 ; return value
+ pop rbx
+ ret
+U900: ; failure
+ xor eax, eax ; return 0
+ pop rbx
+ ret
+
+%ENDIF
+
+
+PhysicalSeedNone: ; no possible generation
+ xor eax, eax
+ test par2d, par2d ; numseeds
+ jz N200
+N100: mov [par1], eax
+ add par1, 4
+ dec par2d
+ jnz N100
+N200: ret ; return 0
+
+
+PhysicalSeedDispatcher:
+ push rbx
+%IFDEF WINDOWS
+ push rcx
+ push rdx
+%ENDIF
+ ; test if RDSEED supported
+ xor eax, eax
+ cpuid
+ cmp eax, 7
+ jb P200 ; RDSEED not supported
+ mov eax, 7
+ xor ecx, ecx
+ cpuid
+ bt ebx, 18
+ ; jc USE_RDSEED ; not tested yet!!
+
+P200: ; test if RDRAND supported
+ mov eax, 1
+ cpuid
+ bt ecx, 30
+ jc USE_RDRAND
+
+ ; test if VIA xstore instruction supported
+ mov eax, 0C0000000H
+ push rax
+ cpuid
+ pop rbx
+ cmp eax, ebx
+ jna P300 ; not a VIA processor
+ lea eax, [rbx+1]
+ cpuid
+ bt edx, 3
+ jc VIA_METHOD
+
+P300: ; test if RDTSC supported
+ mov eax, 1
+ cpuid
+ bt edx, 4
+ jc USE_RDTSC ; RDTSC supported
+
+FAILURE: ; No useful instruction supported
+ lea rax, [PhysicalSeedNone]
+ jmp P800
+
+USE_RDRAND: ; Use RDRAND instruction
+ lea rax, [PhysicalSeedRDRand]
+ jmp P800
+
+USE_RDSEED: ; Use RDSEED instruction (not tested yet)
+ lea rax, [PhysicalSeedRDSeed]
+ jmp P800
+
+VIA_METHOD: ; Use VIA xstore instructions
+ lea rax, [PhysicalSeedVIA]
+ jmp P800
+
+USE_RDTSC:
+ lea rax, [PhysicalSeedRDTSC]
+ ;jmp P800
+
+P800: mov [PhysicalSeedDispatch], rax
+%IFDEF WINDOWS
+ pop rdx
+ pop rcx
+%ENDIF
+ pop rbx
+ jmp rax ; continue in dispatched version
+
+
+; -----------------------------------------------------------------
+; Data section for dispatcher
+; -----------------------------------------------------------------
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+PhysicalSeedDispatch DQ PhysicalSeedDispatcher
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
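+
+; The dispatch pattern above, as a C sketch: the public entry jumps through a
+; pointer that initially points to a dispatcher; the dispatcher probes CPUID
+; once and patches the pointer to the best available implementation (RDRAND,
+; VIA XSTORE, RDTSC or none; RDSEED is detected but currently not selected).
+; DetectBestVersion is a placeholder for those CPUID checks. Illustrative only:
+;
+;   typedef int (*SeedFunc)(int seeds[], int n);
+;   static int Dispatcher(int seeds[], int n);
+;   static SeedFunc SeedPtr = Dispatcher;       // = PhysicalSeedDispatch
+;   int PhysicalSeed(int seeds[], int n) { return SeedPtr(seeds, n); }
+;   static int Dispatcher(int seeds[], int n) {
+;       SeedPtr = DetectBestVersion();          // run the CPUID checks once
+;       return SeedPtr(seeds, n);               // continue in chosen version
+;   }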
diff --git a/contrib/libs/asmlib/popcount64.asm b/contrib/libs/asmlib/popcount64.asm
new file mode 100644
index 0000000000..c4ad64e03b
--- /dev/null
+++ b/contrib/libs/asmlib/popcount64.asm
@@ -0,0 +1,112 @@
+%include "defs.asm"
+
+;************************* popcount64.asm ************************************
+; Author: Agner Fog
+; Date created: 2011-07-20
+; Last modified: 2011-07-20
+
+; Description:
+; Population count function. Counts the number of 1-bits in a 32-bit integer
+; unsigned int A_popcount (unsigned int x);
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+global A_popcount: function
+
+; Direct entries to CPU-specific versions
+global popcountGeneric: function
+global popcountSSE42: function
+
+; Imported from instrset32.asm:
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+; popcount function
+;******************************************************************************
+
+
+A_popcount: ; function dispatching
+ jmp near [popcountDispatch] ; Go to appropriate version, depending on instruction set
+
+align 16
+popcountSSE42: ; SSE4.2 version
+%ifdef WINDOWS
+ popcnt eax, ecx
+%else
+ popcnt eax, edi
+%endif
+ ret
+
+
+;******************************************************************************
+; popcount function generic
+;******************************************************************************
+
+popcountGeneric: ; Generic version
+%ifdef WINDOWS
+ mov eax, ecx
+%else
+ mov eax, edi
+%endif
+ mov edx, eax
+ shr eax, 1
+ and eax, 55555555h ; odd bits in eax, even bits in edx
+ and edx, 55555555h
+ add eax, edx
+ mov edx, eax
+ shr eax, 2
+ and eax, 33333333h
+ and edx, 33333333h
+ add eax, edx
+ mov edx, eax
+ shr eax, 4
+ add eax, edx
+ and eax, 0F0F0F0Fh
+ mov edx, eax
+ shr eax, 8
+ add eax, edx
+ mov edx, eax
+ shr eax, 16
+ add eax, edx
+ and eax, 03FH
+ ret
+;popcountGeneric end
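+
+; C equivalent of the generic (SWAR) version above, for reference; illustrative
+; only, with a hypothetical function name:
+;
+;   unsigned int popcount32(unsigned int x) {
+;       x = (x & 0x55555555u) + ((x >> 1) & 0x55555555u); // 2-bit partial sums
+;       x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u); // 4-bit partial sums
+;       x = (x + (x >> 4)) & 0x0F0F0F0Fu;                 // 8-bit partial sums
+;       x += x >> 8;
+;       x += x >> 16;
+;       return x & 0x3Fu;                                 // final count, 0..32
+;   }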
+
+; ********************************************************************************
+; CPU dispatching for popcount. This is executed only once
+; ********************************************************************************
+
+%ifdef WINDOWS
+%define par1 rcx ; parameter 1: the 32-bit integer x
+%else
+%define par1 rdi ; parameter 1: the 32-bit integer x
+%endif
+
+popcountCPUDispatch:
+ ; get supported instruction set
+ push par1
+ call InstructionSet
+ pop par1
+ ; Point to generic version of popcount
+ lea rdx, [popcountGeneric]
+ cmp eax, 9 ; check popcnt supported
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of popcount
+ lea rdx, [popcountSSE42]
+Q100: mov [popcountDispatch], rdx
+ ; Continue in appropriate version
+ jmp rdx
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+popcountDispatch DQ popcountCPUDispatch
diff --git a/contrib/libs/asmlib/procname64.asm b/contrib/libs/asmlib/procname64.asm
new file mode 100644
index 0000000000..1b77b74320
--- /dev/null
+++ b/contrib/libs/asmlib/procname64.asm
@@ -0,0 +1,145 @@
+%include "defs.asm"
+
+; procname64.asm
+;
+; Author: Agner Fog
+; Date created: 2007
+; Last modified: 2011-07-02
+; Description:
+; ProcessorName
+; =============
+; This function produces a zero-terminated ASCII string containing a name
+; for the microprocessor in human-readable format.
+;
+; Copyright (c) 2007-2011 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global ProcessorName: function
+
+SECTION .data
+align 16
+
+NameBuffer times 50H db 0 ; Static buffer to contain name
+
+
+SECTION .text align=16
+
+; ********** ProcessorName function **********
+; C++ prototype:
+; extern "C" char * ProcessorName (void);
+
+; This function finds the name of the microprocessor. The name is returned as a
+; pointer to a zero-terminated string in a static buffer (NameBuffer, 50H bytes).
+
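+; Usage sketch, assuming the return-by-pointer behavior implemented below;
+; illustrative only:
+;
+;   const char * name = ProcessorName();
+;   // vendor/brand string followed by " Family xxH Model yyH"
+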
+ProcessorName:
+ push rbx
+ push rdi
+ lea rdi, [NameBuffer] ; text pointer
+
+ mov eax, 80000000H
+ cpuid
+ cmp eax, 80000004H ; test if extended vendor string available
+ jb no_ext_vendor_string
+
+ ; Has extended vendor string
+ mov eax, 80000002H
+ cpuid
+ mov [rdi], eax ; store 16 bytes of extended vendor string
+ mov [rdi+4], ebx
+ mov [rdi+8], ecx
+ mov [rdi+0CH], edx
+ mov eax, 80000003H
+ cpuid
+ mov [rdi+10H], eax ; next 16 bytes
+ mov [rdi+14H], ebx
+ mov [rdi+18H], ecx
+ mov [rdi+1CH], edx
+ mov eax, 80000004H
+ cpuid
+ mov [rdi+20H], eax ; next 16 bytes
+ mov [rdi+24H], ebx
+ mov [rdi+28H], ecx
+ mov [rdi+2CH], edx
+ jmp get_family_and_model
+
+no_ext_vendor_string:
+ ; No extended vendor string. Get short vendor string
+ xor eax, eax
+ cpuid
+ mov [rdi],ebx ; store short vendor string
+ mov [rdi+4],edx
+ mov [rdi+8],ecx
+ mov byte [rdi+12],0 ; terminate string
+
+get_family_and_model:
+ xor eax, eax
+ mov ecx, 30H
+ cld
+ repne scasb ; find end of text
+ dec rdi
+
+ mov dword [rdi], ' Fam' ; Append text " Family "
+ mov dword [rdi+4], 'ily '
+ add rdi, 8
+
+ mov eax, 1
+ cpuid ; Get family and model
+ mov ebx, eax
+ mov ecx, eax
+ shr eax, 8
+ and eax, 0FH ; Family
+ shr ecx, 20
+ and ecx, 0FFH ; Extended family
+ add eax, ecx ; Family + extended family
+ call WriteHex ; Write as hexadecimal
+
+ mov dword [rdi], 'H Mo' ; Write text "H Model "
+ mov dword [rdi+4], 'del '
+ add rdi, 8
+
+ mov eax, ebx
+ shr eax, 4
+ and eax, 0FH ; Model
+ mov ecx, ebx
+ shr ecx, 12
+ and ecx, 0F0H ; Extended model
+ or eax, ecx ; Model | extended model
+ call WriteHex ; Write as hexadecimal
+
+ mov dword [rdi], 'H' ; Write text "H"
+
+PNEND: ; finished
+ lea rax, [NameBuffer] ; Pointer to result
+ pop rdi
+ pop rbx
+ ret
+;ProcessorName ENDP
+
+WriteHex: ; Local function: Write 2 hexadecimal digits
+ ; Parameters: AL = number to write, RDI = text destination
+ mov ecx, eax
+ shr ecx, 4
+ and ecx, 0FH ; most significant digit first
+ cmp ecx, 10
+ jnb W1
+ ; 0 - 9
+ add ecx, '0'
+ jmp W2
+W1: ; A - F
+ add ecx, 'A' - 10
+W2: mov [rdi], cl ; write digit
+
+ mov ecx, eax
+ and ecx, 0FH ; next digit
+ cmp ecx, 10
+ jnb W3
+ ; 0 - 9
+ add ecx, '0'
+ jmp W4
+W3: ; A - F
+ add ecx, 'A' - 10
+W4: mov [rdi+1], cl ; write digit
+ add rdi, 2 ; advance string pointer
+ ret
diff --git a/contrib/libs/asmlib/randomah.asi b/contrib/libs/asmlib/randomah.asi
new file mode 100644
index 0000000000..ed7a0185a4
--- /dev/null
+++ b/contrib/libs/asmlib/randomah.asi
@@ -0,0 +1,290 @@
+; ----------------------------- RANDOMAH.ASI ---------------------------
+;
+; Author: Agner Fog
+; Date created: 1998
+; Last modified: 2013-09-09
+; Description:
+; Assembly include file containing
+; structure/class definitions for random number generators
+;
+; Copyright (c) 1998-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Definitions for Mersenne Twister:
+
+TEMPERING EQU 1 ; set to 0 if no tempering (improves speed by 25%)
+
+%if 0
+; define constants for MT11213A:
+MERS_N EQU 351
+MERS_M EQU 175
+MERS_R EQU 19
+MERS_A EQU 0E4BD75F5H
+MERS_U EQU 11
+MERS_S EQU 7
+MERS_T EQU 15
+MERS_L EQU 17
+MERS_B EQU 655E5280H
+MERS_C EQU 0FFD58000H
+
+%ELSE
+; or constants for MT19937:
+MERS_N EQU 624
+MERS_M EQU 397
+MERS_R EQU 31
+MERS_A EQU 09908B0DFH
+MERS_U EQU 11
+MERS_S EQU 7
+MERS_T EQU 15
+MERS_L EQU 18
+MERS_B EQU 9D2C5680H
+MERS_C EQU 0EFC60000H
+
+%ENDIF
+
+LOWER_MASK EQU (1 << MERS_R) - 1 ; lower MERS_R bits
+UPPER_MASK EQU -1 << MERS_R ; upper 32-MERS_R bits
+
+; Define class CRandomMersenneA member data
+; Must be aligned by 16.
+
+STRUC CRandomMersenneA
+.Fill1 RESD 4 ; Alignment filler
+.PreInt: RESD 4 ; premade tempered integer numbers, ready to use
+.PreFlt: RESQ 4 ; premade floating point numbers, ready to use (subtract 1.0)
+ RESQ 1 ; last PreFlt unaligned overrun if MERS_N mod 4 = 1
+.TmpFlt: RESQ 1 ; temporary storage of floating point random number
+.PreInx: RESD 1 ; index to next PreInt and PreFlt number
+.Instset: RESD 1 ; Instruction set
+.LastInterval: RESD 1 ; Last interval length for IRandomX
+.RLimit: RESD 1 ; Rejection limit used by IRandomX
+.TMB: RESD 4 ; 4 copies of MERS_B constant
+.TMC: RESD 4 ; 4 copies of MERS_C constant
+.one: RESQ 2 ; 2 copies of 1.0 constant
+.MTI: RESD 1 ; index into MT buffer
+.UMASK: RESD 1 ; UPPER_MASK
+.LMASK: RESD 1 ; LOWER_MASK ; constants
+.MATA: RESD 1 ; MERS_A
+.wrap1: RESD 4 ; MT buffer km wraparound
+.MT: RESD MERS_N ; MT history buffer (aligned by 16)
+.wrap2: RESD 4 ; MT buffer kk wraparound
+%if MERS_N & 3
+ ; MERS_N not divisible by 4. align by 4
+ RESD (4 - (MERS_N & 3))
+%ENDIF
+endstruc ; CRandomMersenneA
+
+
+; Definitions for Mother-of-all generator:
+
+; Define class CRandomMotherA member data
+; Must be aligned by 16. Preferably aligned by 64 to fit a cache line
+STRUC CRandomMotherA
+.Fill2 RESD 4 ; Alignment filler
+.one RESQ 1 ; 1.0
+.Instset RESD 1 ; Instruction set
+.M4 RESD 1 ; x[n-4]
+.M3 RESD 1 ; x[n-3] (aligned)
+.M2 RESD 1 ; x[n-2]
+.M1 RESD 1 ; x[n-1]
+.M0 RESD 1 ; x[n]
+.MC RESD 1 ; Carry (aligned)
+.zero RESD 1 ; Zero-extension of carry
+.RanP1 RESQ 1 ; Double random number in interval [1,2)
+.MF3 RESD 1 ; 2111111111 (aligned)
+.MF2 RESD 1 ; 1492
+.MF1 RESD 1 ; 1776
+.MF0 RESD 1 ; 5115
+endstruc ; CRandomMotherA
+
+MOTHERF0 EQU 5115 ; factor 0
+MOTHERF1 EQU 1776 ; factor 1
+MOTHERF2 EQU 1492 ; factor 2
+MOTHERF3 EQU 2111111111 ; factor 3
+
+
+; ***************************************************************************
+; Definitions for SFMT generator
+; ***************************************************************************
+
+; Choose Mersenne exponent.
+; Higher values give longer cycle length and use more memory:
+; MEXP equ 607
+; MEXP equ 1279
+; MEXP equ 2281
+; MEXP equ 4253
+ MEXP equ 11213
+; MEXP equ 19937
+; MEXP equ 44497
+
+%if MEXP == 44497
+SFMT_N equ 348 ; Size of state vector
+SFMT_M equ 330 ; Position of intermediate feedback
+SFMT_SL1 equ 5 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 9 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 3 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0effffffbH ;first DWORD of AND mask
+; AND mask:
+%define SFMT_MASK 0effffffbH,0dfbebfffH,0bfbf7befH,09ffd7bffH
+; Period certification vector
+%define SFMT_PARITY 1,0,0a3ac4000H,0ecc1327aH
+
+%elif MEXP == 19937
+SFMT_N equ 156 ; Size of state vector
+SFMT_M equ 122 ; Position of intermediate feedback
+SFMT_SL1 equ 18 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 1 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 11 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0dfffffefH ;first DWORD of AND mask
+%define SFMT_MASK 0dfffffefH,0ddfecb7fH,0bffaffffH,0bffffff6H
+%define SFMT_PARITY 1,0,0,013c9e684H
+
+%elif MEXP == 11213
+SFMT_N equ 88 ; Size of state vector
+SFMT_M equ 68 ; Position of intermediate feedback
+SFMT_SL1 equ 14 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 7 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 3 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0effff7fbH ;first DWORD of AND mask
+%define SFMT_MASK 0effff7fbH,0ffffffefH,0dfdfbfffH,07fffdbfdH
+%define SFMT_PARITY 1,0,0e8148000H,0d0c7afa3H
+
+%elif MEXP == 4253
+SFMT_N equ 34 ; Size of state vector
+SFMT_M equ 17 ; Position of intermediate feedback
+SFMT_SL1 equ 20 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 1 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 7 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 09f7bffffH ;first DWORD of AND mask
+%define SFMT_MASK 09f7bffffH,09fffff5fH,03efffffbH,0fffff7bbH
+%define SFMT_PARITY 0a8000001H,0af5390a3H,0b740b3f8H,06c11486dH
+
+%elif MEXP == 2281
+SFMT_N equ 18 ; Size of state vector
+SFMT_M equ 12 ; Position of intermediate feedback
+SFMT_SL1 equ 19 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 1 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 5 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0bff7ffbfH ;first DWORD of AND mask
+%define SFMT_MASK 0bff7ffbfH,0fdfffffeH,0f7ffef7fH,0f2f7cbbfH
+%define SFMT_PARITY 1,0,0,041dfa600H
+
+%elif MEXP == 1279
+SFMT_N equ 10 ; Size of state vector
+SFMT_M equ 7 ; Position of intermediate feedback
+SFMT_SL1 equ 14 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 5 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0f7fefffdH ;first DWORD of AND mask
+%define SFMT_MASK 0f7fefffdH,07fefcfffH,0aff3ef3fH,0b5ffff7fH
+%define SFMT_PARITY 1,0,0,020000000H
+
+%elif MEXP == 607
+SFMT_N equ 5 ; Size of state vector
+SFMT_M equ 2 ; Position of intermediate feedback
+SFMT_SL1 equ 15 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 13 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 3 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0fdff37ffH ;first DWORD of AND mask
+%define SFMT_MASK 0fdff37ffH,0ef7f3f7dH,0ff777b7dH,07ff7fb2fH
+%define SFMT_PARITY 1,0,0,05986f054H
+
+%ELSE
+%error MEXP must have one of the predefined values
+%ENDIF
+
+STRUC CRandomSFMTA
+.Fill3 RESD 4 ; Alignment filler
+
+; Parameters for Mother-Of-All generator:
+.M3: RESD 1 ; x[n-3] (aligned)
+ RESD 1 ; unused filler to fit the pmuludq instruction
+.M2: RESD 1 ; x[n-2]
+ RESD 1 ; unused filler to fit the pmuludq instruction
+.M1: RESD 1 ; x[n-1]
+ RESD 1 ; unused filler to fit the pmuludq instruction
+.M0: RESD 1 ; x[n]
+.MC: RESD 1 ; Carry (zero-extends into one)
+.one: RESQ 1 ; 1.0 (low dword = zero-extension of carry) (aligned)
+.TempRan: RESQ 1 ; Temporary random number
+.MF3: RESD 1 ; 2111111111 (aligned)
+.Instset: RESD 1 ; Instruction set
+.MF2: RESD 1 ; 1492 (MF3,MF2,MF1,MF0 interleaved with other variables to fit the pmuludq instruction)
+ RESD 1 ; Filler (may be used for read-only parameter, but not for read/write parameter)
+.MF1: RESD 1 ; 1776
+ RESD 1 ; Filler (may be used for read-only parameter, but not for read/write parameter)
+.MF0: RESD 1 ; 5115
+ RESD 1 ; Filler (may be used for read-only parameter, but not for read/write parameter)
+
+; Parameters for IRandomX:
+.LASTINTERVAL: RESD 1 ; Last interval length for IRandomX
+.RLIMIT: RESD 1 ; Rejection limit used by IRandomX
+
+; Parameters for SFMT generator:
+.USEMOTHER: RESD 1 ; 1 if combine with Mother-Of-All generator
+.IX: RESD 1 ; Index into state buffer for SFMT
+
+.AMASK: RESD 4 ; AND mask (aligned)
+.STATE: RESD SFMT_N*4 ; State vector (aligned)
+endstruc ; CRandomSFMTA
+
+
+; Load offset of TARGET into ecx. Use position-independent method if necessary
+%macro LOADOFFSET2ECX 1
+%IFNDEF POSITIONINDEPENDENT
+ mov ecx, %1
+%ELSE
+ ; get position-independent address of TARGET
+ call get_thunk_ecx
+ add ecx, %1 - $
+%ENDIF
+%endmacro
+
+; Load offset of TARGET into edi. Use position-independent method if necessary
+%macro LOADOFFSET2EDI 1
+%IFNDEF POSITIONINDEPENDENT
+ mov edi, %1
+%ELSE
+ ; get position-independent address of TARGET
+ call get_thunk_edi
+ add edi, %1 - $
+%ENDIF
+%endmacro
+
+
+; ***************************************************************************
+; Define registers used for function parameters, used in 64-bit mode only
+; ***************************************************************************
+
+%IFDEF WINDOWS
+ %define par1 rcx
+ %define par2 rdx
+ %define par3 r8
+ %define par4 r9
+ %define par5 qword [rsp+32+8] ; stack offset including shadow space
+ %define par1d ecx
+ %define par2d edx
+ %define par3d r8d
+ %define par4d r9d
+ %define par5d dword [rsp+32+8]
+%ENDIF
+
+%IFDEF UNIX
+ %define par1 rdi
+ %define par2 rsi
+ %define par3 rdx
+ %define par4 rcx
+ %define par5 r8
+ %define par1d edi
+ %define par2d esi
+ %define par3d edx
+ %define par4d ecx
+ %define par5d r8d
+%ENDIF
diff --git a/contrib/libs/asmlib/rdtsc64.asm b/contrib/libs/asmlib/rdtsc64.asm
new file mode 100644
index 0000000000..42a0e23203
--- /dev/null
+++ b/contrib/libs/asmlib/rdtsc64.asm
@@ -0,0 +1,53 @@
+%include "defs.asm"
+
+; RDTSC64.ASM
+;
+; Author: Agner Fog
+; Date created: 2003
+; Last modified: 2008-10-16
+; Description:
+;
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global ReadTSC: function
+
+SECTION .text align=16
+
+; ********** ReadTSC function **********
+; C++ prototype:
+; extern "C" __int64 ReadTSC (void);
+
+; This function returns the value of the time stamp counter, which counts
+; clock cycles. To count how many clock cycles a piece of code takes, call
+; Rdtsc before and after the code to measure and calculate the difference.
+
+; The number of clock cycles taken by the ReadTSC function itself is approximately:
+; Core 2: 730
+; Pentium 4: 700
+; Pentium II and Pentium III: 225
+; AMD Athlon 64, Opteron: 126
+; Does not work on 80386 and 80486.
+
+; Note that clock counts may not be fully reproducible on Intel Core and
+; Core 2 processors because the clock frequency can change. More reliable
+; instruction timings are obtained with the performance monitor counter
+; for "core clock cycles". This requires a kernel mode driver as the one
+; included with www.agner.org/optimize/testp.zip.
+
+ReadTSC:
+ push rbx ; ebx is modified by cpuid
+ sub eax, eax ; 0
+ cpuid ; serialize
+ rdtsc ; read time stamp counter into edx:eax
+ shl rdx, 32
+ or rax, rdx ; combine into 64 bit register
+ push rax
+ sub eax, eax
+ cpuid ; serialize
+ pop rax ; return value
+ pop rbx
+ ret
+;ReadTSC ENDP
diff --git a/contrib/libs/asmlib/round64.asm b/contrib/libs/asmlib/round64.asm
new file mode 100644
index 0000000000..5ed55c53c6
--- /dev/null
+++ b/contrib/libs/asmlib/round64.asm
@@ -0,0 +1,40 @@
+%include "defs.asm"
+
+; ROUND64.ASM
+
+; Author: Agner Fog
+; Date created: 2007-06-15
+; Last modified: 2008-10-16
+; Description:
+; Round function
+
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global RoundD: function
+global RoundF: function
+
+
+SECTION .text align=16
+
+; ********** round function **********
+; C++ prototype:
+; extern "C" int RoundD (double x);
+; extern "C" int RoundF (float x);
+
+; This function converts a single or double precision floating point number
+; to an integer, rounding to nearest or even. Does not check for overflow.
+; This function is much faster than the default conversion method in C++
+; which uses truncation.
+
+RoundD:
+ cvtsd2si eax, xmm0 ; Round xmm0 to eax
+ ret
+;RoundD ENDP
+
+RoundF:
+ cvtss2si eax, xmm0 ; Round xmm0 to eax
+ ret
+;RoundF ENDP
diff --git a/contrib/libs/asmlib/sfmt64.asm b/contrib/libs/asmlib/sfmt64.asm
new file mode 100644
index 0000000000..3ca3cedca0
--- /dev/null
+++ b/contrib/libs/asmlib/sfmt64.asm
@@ -0,0 +1,889 @@
+%include "defs.asm"
+
+; ----------------------------- SFMT64.ASM ---------------------------
+; Author: Agner Fog
+; Date created: 2008-11-01
+; Last modified: 2013-09-13
+; Project: randoma library of random number generators
+; Source URL: www.agner.org/random
+; Description:
+; Random number generator of type SIMD-oriented Fast Mersenne Twister (SFMT)
+; (Mutsuo Saito and Makoto Matsumoto: "SIMD-oriented Fast Mersenne Twister:
+; a 128-bit Pseudorandom Number Generator", Monte Carlo and Quasi-Monte
+; Carlo Methods 2006, Springer, 2008, pp. 607-622).
+;
+; 64-bit mode version for x86-64 compatible microprocessors.
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+; ----------------------------------------------------------------------
+
+default rel
+
+global SFMTRandomInit, SFMTRandomInitByArray, SFMTBRandom, SFMTRandom
+global SFMTRandomL, SFMTIRandom, SFMTIRandomX, SFMTgenRandomInit
+global SFMTgenRandomInitByArray, SFMTgenRandom, SFMTgenRandomL, SFMTgenIRandom
+global SFMTgenIRandomX, SFMTgenBRandom
+
+extern InstructionSet
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+
+section .data
+align 16
+; Data for single instance of random number generator
+SFMTInstance: ISTRUC CRandomSFMTA
+; Size of structure
+IEND
+SFMTSize equ $-SFMTInstance
+
+
+align 16
+; Initialization constants for Mother-Of-All:
+InitMother DD 2111111111, 0, 1492, 0, 1776, 0, 5115, 0
+; Initialization Mask for SFMT:
+InitMask DD SFMT_MASK
+; Period certification vector for SFMT:
+InitParity DD SFMT_PARITY
+
+
+SECTION .CODE align=16 ; code segment
+
+
+; ---------------------------------------------------------------
+; Thread-safe static link versions for SFMT
+; ---------------------------------------------------------------
+
+; extern "C" void SFMTRandomInit(void * Pthis, int ThisSize, int seed, int IncludeMother = 0);
+; Parameters:
+; par1 = Pthis
+; par2d = ThisSize
+; par3d = seed
+; par4d = IncludeMother
+
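+; Usage sketch for the thread-safe interface; the buffer must be at least
+; SFMTSize bytes (the exact size is normally taken from the library's header,
+; 4096 is just a generous guess here). Illustrative only:
+;
+;   alignas(16) char state[4096];
+;   SFMTRandomInit(state, sizeof(state), 1234, 0);   // seed, no Mother-Of-All
+;   unsigned int bits = SFMTBRandom(state);          // raw 32 random bits
+;   double u = SFMTRandom(state);                    // uniform in [0,1)
+;   int d = SFMTIRandomX(state, 1, 6);               // uniform integer in [1,6]
+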
+SFMTRandomInit:
+ cmp par2d, SFMTSize
+ jb Error ; Error exit if buffer too small
+ push rbx
+
+ ; Align by 16. Will overlap part of Fill if Pthis unaligned
+ and par1, -16
+
+ xor eax, eax
+ test par4d, par4d ; IncludeMother
+ setnz al ; convert any nonzero value to 1
+ ; Store USEMOTHER
+ mov [par1+CRandomSFMTA.USEMOTHER], eax
+
+ mov eax, par3d ; seed
+ xor ebx, ebx ; loop counter i
+ jmp L002 ; go into seeding loop
+
+L001: ; seeding loop for SFMT
+ ; y = factor * (y ^ (y >> 30)) + (++i);
+ call InitSubf0 ; randomization subfunction
+L002: mov [par1+rbx*4+CRandomSFMTA.STATE],eax ; initialize state
+ cmp ebx, SFMT_N*4 - 1
+ jb L001
+
+ ; Put 5 more values into Mother-Of-All generator
+ call InitSubf0
+ mov [par1+CRandomSFMTA.M0], eax
+ call InitSubf0
+ mov [par1+CRandomSFMTA.M1], eax
+ call InitSubf0
+ mov [par1+CRandomSFMTA.M2], eax
+ call InitSubf0
+ mov [par1+CRandomSFMTA.M3], eax
+ call InitSubf0
+ mov [par1+CRandomSFMTA.MC], eax
+
+ ; more initialization and period certification
+ call InitAndPeriod
+
+ pop rbx
+ ret
+;SFMTRandomInit ENDP
+
+Error: ; Error exit
+ xor eax, eax
+ div eax ; Divide by 0
+ ret
+
+; Subfunction used by SFMTRandomInit
+InitSubf0: ; private
+; y = 1812433253 * (y ^ (y >> 30)) + (++i);
+; input parameters:
+; eax = y
+; ebx = i
+; output:
+; eax = new y
+; ebx = i+1
+; edx modified
+ mov edx, eax
+ shr eax, 30
+ xor eax, edx
+ imul eax, 1812433253
+ inc ebx
+ add eax, ebx
+ ret
+;InitSubf0 endp
+
+; Subfunction used by SFMTRandomInitByArray
+InitSubf1: ; private
+; r = 1664525U * (r ^ (r >> 27));
+; input parameters:
+; eax = r
+; output:
+; eax = new r
+; r10 modified
+ mov r10d, eax
+ shr eax, 27
+ xor eax, r10d
+ imul eax, 1664525
+ ret
+;InitSubf1 endp
+
+; Subfunction used by SFMTRandomInitByArray
+InitSubf2: ; private
+; r = 1566083941U * (r ^ (r >> 27));
+; input parameters:
+; eax = r
+; output:
+; eax = new r
+; r10 modified
+ mov r10d, eax
+ shr eax, 27
+ xor eax, r10d
+ imul eax, 1566083941
+ ret
+;InitSubf2 endp
+
+
+; Subfunciton for initialization and period certification, except seeding
+; par1 = aligned pointer to CRandomSFMTA
+InitAndPeriod: ; private
+ push rbx
+
+ ; initialize constants for Mother-Of-All
+ movaps xmm0, oword [InitMother]
+ movaps oword [par1+CRandomSFMTA.MF3], xmm0
+ movaps xmm0, oword [InitMother+16]
+ movaps oword [par1+CRandomSFMTA.MF1], xmm0
+
+ ; initialize constants for SFMT
+ movaps xmm0, oword [InitMask]
+ movaps oword [par1+CRandomSFMTA.AMASK], xmm0
+
+ ; initialize various variables
+ xor eax, eax
+ mov dword [par1+CRandomSFMTA.one], eax
+ mov dword [par1+4+CRandomSFMTA.one], 3FF00000H
+ mov dword [par1+CRandomSFMTA.LASTINTERVAL], eax
+
+ ; get instruction set
+ push par1
+ call InstructionSet
+ pop par1
+ mov [par1+CRandomSFMTA.Instset], eax
+
+ ; Period certification
+ ; Compute parity of STATE[0-4] & InitParity
+ movaps xmm1, oword [par1+CRandomSFMTA.STATE]
+ andps xmm1, oword [InitParity]
+ movhlps xmm2, xmm1 ; high qword
+ xorps xmm1, xmm2 ; xor two qwords
+ pshufd xmm2, xmm1, 1 ; high dword
+ xorps xmm1, xmm2 ; xor two dwords
+ movd eax, xmm1 ; do rest of xor in eax
+ mov edx, eax
+ shr eax, 16
+ xor eax, edx ; xor two words
+ xor al, ah ; xor two bytes
+ jpo L008 ; parity odd: period OK
+
+ ; parity even: period not OK
+ ; Find a nonzero dword in period certification vector
+ xor ebx, ebx ; loop counter
+ lea rdx, [InitParity]
+L005: mov eax, [rdx+rbx*4] ; InitParity[i]
+ test eax, eax
+ jnz L006
+ inc ebx
+ ; assume that there is a nonzero dword in InitParity
+ jmp L005 ; loop until nonzero found
+
+L006: ; find first nonzero bit in eax
+ bsf edx, eax
+ ; flip the corresponding bit in STATE
+ btc [par1+rbx*4+CRandomSFMTA.STATE], edx
+
+L008: cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ je L009
+ call Mother_Next ; Make first random number ready
+
+L009: ; Generate first random numbers and set IX = 0
+ call SFMT_Generate
+ pop rbx
+ ret
+;InitAndPeriod endp
+
+
+; extern "C" void SFMTRandomInitByArray
+; (void * Pthis, int ThisSize, int const seeds[], int NumSeeds, int IncludeMother = 0);
+; // Seed by more than 32 bits
+SFMTRandomInitByArray:
+; Parameters
+; par1 = Pthis
+; par2d = ThisSize
+; par3 = seeds
+; par4d = NumSeeds
+; par5d = IncludeMother
+
+; define constants:
+SFMT_SIZE equ SFMT_N*4 ; number of 32-bit integers in state
+
+%IF SFMT_SIZE >= 623
+ SFMT_LAG equ 11
+%ELIF SFMT_SIZE >= 68
+ SFMT_LAG equ 7
+%ELIF SFMT_SIZE >= 39
+ SFMT_LAG equ 5
+%ELSE
+ SFMT_LAG equ 3
+%ENDIF
+
+SFMT_MID equ ((SFMT_SIZE - SFMT_LAG) / 2)
+
+ xor eax, eax
+ cmp par5d, eax ; IncludeMother (parameter is on stack if windows)
+ setnz al ; convert any nonzero value to 1
+
+ push rbx
+ push rbp
+
+ cmp par2d, SFMTSize ; ThisSize
+ jb Error ; Error exit if buffer too small
+
+ ; Align by 16. Will overlap part of Fill if Pthis unaligned
+ and par1, -16
+
+ ; Store USEMOTHER
+ mov [par1+CRandomSFMTA.USEMOTHER], eax
+
+; 1. loop: Fill state vector with random numbers from NumSeeds
+; r = NumSeeds;
+; for (i = 0; i < SFMT_N*4; i++) {
+; r = factor * (r ^ (r >> 30)) + i;
+; sta[i] = r;}
+
+ mov eax, par4d ; r = NumSeeds
+ xor ebx, ebx ; i
+L100: mov par2d, eax
+ shr eax, 30
+ xor eax, par2d
+ imul eax, 1812433253
+ add eax, ebx
+ mov [par1+rbx*4+CRandomSFMTA.STATE], eax
+ inc ebx
+ cmp ebx, SFMT_SIZE
+ jb L100
+
+ ; count = max(NumSeeds,size-1)
+ mov eax, SFMT_SIZE - 1
+ mov r11d, par4d ; NumSeeds
+ cmp r11d, eax
+ cmovb r11d, eax
+
+; 2. loop: Fill state vector with random numbers from seeds[]
+; for (i = 1, j = 0; j < count; j++) {
+; r = func1(sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size]);
+; sta[(i + mid) % size] += r;
+; if (j < NumSeeds) r += seeds[j]
+; r += i;
+; sta[(i + mid + lag) % size] += r;
+; sta[i] = r;
+; i = (i + 1) % size;
+; }
+ ; register use:
+ ; par1 = Pthis
+ ; par2 = j
+ ; par3 = seeds
+ ; par4 = NumSeeds
+ ; eax = r
+ ; ebx = i
+ ; ebp = (i + mid) % size, (i + mid + lag) % size
+ ; r10 = (i + size - 1) % size
+ ; r11 = count
+
+ xor par2d, par2d ; j = 0
+ lea ebx, [par2+1] ; i = 1
+
+L101: ; r = sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size];
+ mov eax, [par1+rbx*4+CRandomSFMTA.STATE] ; sta[i]
+ lea ebp, [rbx+SFMT_MID]
+ cmp ebp, SFMT_SIZE
+ jb L102
+ sub ebp, SFMT_SIZE
+L102: xor eax, [par1+rbp*4+CRandomSFMTA.STATE] ; sta[(i + mid) % size]
+ lea r10d, [rbx+SFMT_SIZE-1]
+ cmp r10d, SFMT_SIZE
+ jb L103
+ sub r10d, SFMT_SIZE
+L103: xor eax, [par1+r10*4+CRandomSFMTA.STATE] ; sta[(i + size - 1) % size]
+
+ ; r = func1(r) = (r ^ (r >> 27)) * 1664525U;
+ call InitSubf1
+
+ ; sta[(i + mid) % size] += r;
+ add [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+ ; if (j < NumSeeds) r += seeds[j]
+ cmp par2d, par4d
+ jnb L104
+ add eax, [par3+par2*4]
+L104:
+ ; r += i;
+ add eax, ebx
+
+ ; sta[(i + mid + lag) % size] += r;
+ lea ebp, [rbx+SFMT_MID+SFMT_LAG]
+ cmp ebp, SFMT_SIZE
+ jb L105
+ sub ebp, SFMT_SIZE
+L105: add [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+ ;sta[i] = r;
+ mov [par1+rbx*4+CRandomSFMTA.STATE], eax
+
+ ; i = (i + 1) % size;
+ inc ebx
+ cmp ebx, SFMT_SIZE
+ jb L106
+ sub ebx, SFMT_SIZE
+L106:
+ ; j++, loop while j < count
+ inc par2d
+ cmp par2d, r11d
+ jb L101
+
+; 3. loop: Randomize some more
+; for (j = 0; j < size; j++) {
+; r = func2(sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]);
+; sta[(i + mid) % size] ^= r;
+; r -= i;
+; sta[(i + mid + lag) % size] ^= r;
+; sta[i] = r;
+; i = (i + 1) % size;
+; }
+ ; j = 0
+ xor par2d, par2d
+
+L110: ; r = sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]
+ mov eax, [par1+rbx*4+CRandomSFMTA.STATE] ; sta[i]
+ lea ebp, [rbx+SFMT_MID]
+ cmp ebp, SFMT_SIZE
+ jb L111
+ sub ebp, SFMT_SIZE
+L111: add eax, [par1+rbp*4+CRandomSFMTA.STATE] ; sta[(i + mid) % size]
+ lea r10d, [rbx+SFMT_SIZE-1]
+ cmp r10d, SFMT_SIZE
+ jb L112
+ sub r10d, SFMT_SIZE
+L112: add eax, [par1+r10*4+CRandomSFMTA.STATE] ; sta[(i + size - 1) % size]
+
+ ; r = func2(r) = (x ^ (x >> 27)) * 1566083941U;
+ call InitSubf2
+
+ ; sta[(i + mid) % size] ^= r;
+ xor [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+ ; r -= i;
+ sub eax, ebx
+
+ ; sta[(i + mid + lag) % size] ^= r;
+ lea ebp, [rbx+SFMT_MID+SFMT_LAG]
+ cmp ebp, SFMT_SIZE
+ jb L113
+ sub ebp, SFMT_SIZE
+L113: xor [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+ ; sta[i] = r;
+ mov [par1+rbx*4+CRandomSFMTA.STATE], eax
+
+ ; i = (i + 1) % size;
+ inc ebx
+ cmp ebx, SFMT_SIZE
+ jb L114
+ sub ebx, SFMT_SIZE
+L114:
+ ; j++, loop while j < size
+ inc par2d
+ cmp par2d, SFMT_SIZE
+ jb L110
+
+ ; if (UseMother) {
+ cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ jz L120
+
+; 4. loop: Initialize MotherState
+; for (j = 0; j < 5; j++) {
+; r = func2(r) + j;
+; MotherState[j] = r + sta[2*j];
+; }
+ call InitSubf2
+ mov par2d, [par1+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.M0], par2d
+ call InitSubf2
+ inc eax
+ mov par2d, [par1+8+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.M1], par2d
+ call InitSubf2
+ add eax, 2
+ mov par2d, [par1+16+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.M2], par2d
+ call InitSubf2
+ add eax, 3
+ mov par2d, [par1+24+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.M3], par2d
+ call InitSubf2
+ add eax, 4
+ mov par2d, [par1+32+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.MC], par2d
+
+L120: ; More initialization and period certification
+ call InitAndPeriod
+
+ pop rbp
+ pop rbx
+ ret
+;SFMTRandomInitByArray ENDP
+
+
+Mother_Next: ; private
+; Internal procedure: advance Mother-Of-All generator
+; The random value is in M0
+; par1 = aligned pointer to structure CRandomSFMTA
+; eax, par1, xmm0 unchanged
+
+ movdqa xmm1, oword [par1+CRandomSFMTA.M3] ; load M3,M2
+ movdqa xmm2, oword [par1+CRandomSFMTA.M1] ; load M1,M0
+ movhps qword [par1+CRandomSFMTA.M3], xmm1 ; M3=M2
+ movq qword [par1+CRandomSFMTA.M2], xmm2 ; M2=M1
+ movhps qword [par1+CRandomSFMTA.M1], xmm2 ; M1=M0
+ pmuludq xmm1, oword [par1+CRandomSFMTA.MF3] ; M3*MF3, M2*MF2
+ pmuludq xmm2, oword [par1+CRandomSFMTA.MF1] ; M1*MF1, M0*MF0
+ paddq xmm1, xmm2 ; P3+P1, P2+P0
+ movhlps xmm2, xmm1 ; Get high qword
+ movq xmm3, qword [par1+CRandomSFMTA.MC] ; +carry
+ paddq xmm1, xmm3
+ paddq xmm1, xmm2 ; P0+P1+P2+P3
+ movq qword [par1+CRandomSFMTA.M0], xmm1 ; Store new M0 and carry
+ ret
+;Mother_Next endp
+
+
+align 16
+SFMT_Generate: ; private
+; void CRandomSFMT::Generate() {
+; Fill state array with new random numbers
+
+ push rbx
+
+ ; register use
+ ; par1 = Pthis (rcx or rdi)
+ ; edx = i*16 + offset state
+ ; eax, ebx = loop end
+ ; xmm1 = r1
+ ; xmm2 = r2 = r
+ ; xmm0, xmm3 = scratch
+
+ ; r1 = state[SFMT_N*16 - 2];
+ ; r2 = state[SFMT_N*16 - 1];
+ movdqa xmm1, oword [par1+(SFMT_N-2)*16+CRandomSFMTA.STATE]
+ movdqa xmm2, oword [par1+(SFMT_N-1)*16+CRandomSFMTA.STATE]
+ mov edx, CRandomSFMTA.STATE
+
+;static inline __m128i sfmt_recursion(__m128i const &a, __m128i const &b,
+;__m128i const &c, __m128i const &d, __m128i const &mask) {
+; __m128i a1, b1, c1, d1, z1, z2;
+; b1 = _mm_srli_epi32(b, SFMT_SR1);
+; a1 = _mm_slli_si128(a, SFMT_SL2);
+; c1 = _mm_srli_si128(c, SFMT_SR2);
+; d1 = _mm_slli_epi32(d, SFMT_SL1);
+; b1 = _mm_and_si128(b1, mask);
+; z1 = _mm_xor_si128(a, a1);
+; z2 = _mm_xor_si128(b1, d1);
+; z1 = _mm_xor_si128(z1, c1);
+; z2 = _mm_xor_si128(z1, z2);
+; return z2;}
+
+; for (i = 0; i < SFMT_N - SFMT_M; i++) {
+; r = sfmt_recursion(state[i], state[i + SFMT_M], r1, r2, mask);
+; state[i] = r;
+; r1 = r2;
+; r2 = r;
+; }
+
+ mov eax, (SFMT_N-SFMT_M)*16 + CRandomSFMTA.STATE ; first loop end
+ mov ebx, SFMT_N*16 + CRandomSFMTA.STATE ; second loop end
+
+; first i loop from 0 to SFMT_N - SFMT_M
+align 8
+L201: movdqa xmm0, oword [par1+rdx+SFMT_M*16] ; b
+ psrld xmm0, SFMT_SR1 ; b1
+ pand xmm0, oword [par1+CRandomSFMTA.AMASK] ; b1
+ movdqa xmm3, oword [par1+rdx] ; a
+ pxor xmm0, xmm3
+ pslldq xmm3, SFMT_SL2 ; a1
+ psrldq xmm1, SFMT_SR2 ; c1, c = r1
+ pxor xmm0, xmm3
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm2 ; r1 = r2
+ pslld xmm2, SFMT_SL1 ; d1, d = r2
+ pxor xmm2, xmm0 ; r2 = r
+ ; state[i] = r;
+ movdqa oword [par1+rdx], xmm2
+
+ ; i++ while i < SFMT_N - SFMT_M
+ add edx, 16
+ cmp edx, eax
+ jb L201
+
+;align 16
+L202: ; second i loop from SFMT_N - SFMT_M + 1 to SFMT_N
+ movdqa xmm0, oword [par1+rdx+(SFMT_M-SFMT_N)*16]; b
+ psrld xmm0, SFMT_SR1 ; b1
+ pand xmm0, oword [par1+CRandomSFMTA.AMASK] ; b1
+ movdqa xmm3, oword [par1+rdx] ; a
+ pxor xmm0, xmm3
+ pslldq xmm3, SFMT_SL2 ; a1
+ psrldq xmm1, SFMT_SR2 ; c1, c = r1
+ pxor xmm0, xmm3
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm2 ; r1 = r2
+ pslld xmm2, SFMT_SL1 ; d1, d = r2
+ pxor xmm2, xmm0 ; r2 = r
+ ; state[i] = r;
+ movdqa oword [par1+rdx], xmm2
+
+ ; i++ while i < SFMT_N
+ add edx, 16
+ cmp edx, ebx
+ jb L202
+
+ ; Check if initialized
+L208: cmp dword [par1+CRandomSFMTA.AMASK], SFMT_MASK1
+ jne Error ; Make error if not initialized
+
+ ; ix = 0;
+ mov dword [par1+CRandomSFMTA.IX], 0 ; point to start of STATE buffer
+ pop rbx
+ ret
+;SFMT_Generate endp
+
+
+; extern "C" unsigned int SFMTBRandom(void * Pthis); // Output random bits
+
+SFMTBRandom: ; generate random bits
+ ; Align Pthis by 16. Will overlap part of Fill1 if Pthis unaligned
+ and par1, -16
+
+SFMTBRandom_reg: ; Entry for register parameters, used internally
+
+; if (ix >= SFMT_N*4) Generate();
+ mov edx, [par1+CRandomSFMTA.IX]
+ cmp edx, SFMT_N*16
+ jnb NeedGenerate
+
+; y = ((uint32_t*)state)[ix++];
+ mov eax, dword [par1+rdx+CRandomSFMTA.STATE]
+ add edx, 4
+ mov [par1+CRandomSFMTA.IX], edx
+
+AfterGenerate:
+; if (UseMother) y += MotherBits();
+ cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ jz NoMother
+
+ ; add mother bits
+ add eax, [par1+CRandomSFMTA.M0] ; Add Mother random number
+ call Mother_Next ; Make next Mother random number ready
+
+NoMother: ; return y;
+ ret
+
+NeedGenerate:
+ call SFMT_Generate ; generate SFMT_N*4 random dwords
+ mov eax, [par1+CRandomSFMTA.STATE]
+ mov dword [par1+CRandomSFMTA.IX], 4
+ jmp AfterGenerate
+
+;SFMTBRandom ENDP
+
+
+; extern "C" double SFMTRandom (void * Pthis); // Output random float
+SFMTRandom: ; generate random float with 52 bits resolution
+ ; Align Pthis by 16. Will overlap part of Fill1 if Pthis unaligned
+ and par1, -16
+
+SFMTRandom_reg: ; internal entry point
+
+; check if there are at least 64 random bits in state buffer
+; if (ix >= SFMT_N*4-1) Generate();
+ mov edx, [par1+CRandomSFMTA.IX]
+ cmp edx, SFMT_N*16-4
+ jnb L303
+
+L301: ; read 64 random bits
+ movq xmm0, qword [par1+rdx+CRandomSFMTA.STATE]
+ add edx, 8
+ mov [par1+CRandomSFMTA.IX], edx
+
+ ; combine with Mother-Of-All generator?
+ cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ jz L302 ; ConvertToFloat
+
+ ; add mother bits
+ movq xmm1, qword [par1+CRandomSFMTA.M0] ; Mother random number MC and M0
+ pshuflw xmm1, xmm1, 01001011B ; Put M0 before MC, and swap the words in MC
+ paddq xmm0, xmm1 ; Add SFMT and Mother outputs
+ call Mother_Next ; Make next Mother random number ready
+
+L302: ; ConvertToFloat
+ psrlq xmm0, 12 ; align with mantissa field of double precision float
+ movsd xmm1, [par1+CRandomSFMTA.one] ; 1.0 double precision
+ por xmm0, xmm1 ; insert exponent to get 1.0 <= x < 2.0
+ subsd xmm0, xmm1 ; subtract 1.0 to get 0.0 <= x < 1.0
+ ret ; return value
+
+L303: ; NeedGenerateR
+ call SFMT_Generate ; generate SFMT_N*4 random dwords
+ xor edx, edx
+ jmp L301
+
+;SFMTRandom ENDP
+
+
+; extern "C" long double SFMTRandomL (void * Pthis);
+SFMTRandomL: ; generate random float with 63 bits resolution
+ ; Align Pthis by 16.
+ and par1, -16
+
+SFMTRandomL_reg: ; internal entry point
+
+; check if there are at least 64 random bits in state buffer
+; if (ix >= SFMT_N*4-1) Generate();
+ mov edx, [par1+CRandomSFMTA.IX]
+ cmp edx, SFMT_N*16-4
+ jnb L403
+
+L401: ; read 64 random bits
+ movq xmm0, qword [par1+rdx+CRandomSFMTA.STATE]
+ add edx, 8
+ mov [par1+CRandomSFMTA.IX], edx
+
+ ; combine with Mother-Of-All generator?
+ cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ jz L402
+
+ ; add mother bits
+ movq xmm1, qword [par1+CRandomSFMTA.M0] ; Mother random number MC and M0
+ pshuflw xmm1, xmm1, 01001011B ; Put M0 before MC, and swap the words in MC
+ paddq xmm0, xmm1 ; Add SFMT and Mother outputs
+ call Mother_Next ; Make next Mother random number ready
+
+L402: ;ConvertToFloat
+ sub rsp, 16 ; make space for long double
+ psrlq xmm0, 1 ; align with mantissa field of long double
+ pcmpeqw xmm1, xmm1 ; all 1's
+ psllq xmm1, 63 ; create a 1 in bit 63
+ por xmm0, xmm1 ; bit 63 is always 1 in long double
+ movq qword [rsp], xmm0 ; store mantissa
+ mov dword [rsp+8], 3FFFH ; exponent
+ fld tword [rsp] ; load long double
+ fsub qword [par1+CRandomSFMTA.one] ; subtract 1.0 to get 0.0 <= x < 1.0
+ pcmpeqw xmm0, xmm0 ; make a NAN for compilers that don't support long double
+ add rsp, 16
+ ret ; return value in st(0)
+
+L403: ;NeedGenerateR
+ call SFMT_Generate ; generate SFMT_N*4 random dwords
+ xor edx, edx
+ jmp L401
+;SFMTRandomL ENDP
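+
+; Same idea for the 80-bit x87 format: force the explicit integer bit (bit 63),
+; attach the exponent of 1.0 (3FFFH), load, and subtract 1.0. A rough C sketch,
+; assuming the x86 extended-precision long double layout and the same headers as
+; the sketch above:
+;
+;   long double SFMT_BitsToLongDouble(uint64_t r) {   // r = 64 random bits as above
+;       struct { uint64_t mantissa; uint16_t exponent; } x =
+;           { (r >> 1) | (1ULL << 63), 0x3FFF };      // 1.0 <= value < 2.0
+;       long double v = 0;
+;       memcpy(&v, &x, 10);                           // copy the 10 significant bytes
+;       return v - 1.0L;                              // 0.0 <= v < 1.0, 63-bit resolution
+;   }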
+
+
+; extern "C" int SFMTIRandom (void * Pthis, int min, int max); // Output random integer
+
+SFMTIRandom:
+; par1 = Pthis
+; par2d = min
+; par3d = max
+
+ ; Align Pthis by 16.
+ and par1, -16
+ push par2 ; save min, max
+ push par3
+ call SFMTBRandom_reg ; random bits
+ pop rdx ; max
+ pop rcx ; min
+ sub edx, ecx
+ jl short WrongInterval ; max < min
+ inc edx ; max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [rdx+rcx] ; add min to high dword of product
+ ret
+WrongInterval:
+ mov eax, 80000000H ; error exit
+ ret
+;SFMTIRandom ENDP
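+
+; The interval mapping above is the multiply-high trick: multiply a 32-bit random
+; number by the interval length and keep the upper 32 bits of the 64-bit product.
+; It is fast but slightly biased unless the interval length divides 2^32; the
+; SFMTIRandomX routine below removes that bias with a rejection loop. A hedged
+; C sketch (SFMTBRandom is the routine defined earlier in this file):
+;
+;   #include <stdint.h>
+;   int SFMTIRandom_ref(void * Pthis, int min, int max) {
+;       if (max < min) return (int)0x80000000;        // error exit, as above
+;       uint32_t interval = (uint32_t)(max - min) + 1;
+;       uint32_t r = SFMTBRandom(Pthis);
+;       return (int)(((uint64_t)r * interval) >> 32) + min;   // high dword of product + min
+;   }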
+
+
+; extern "C" int SFMTIRandomX (void * Pthis, int min, int max); // Output random integer
+
+SFMTIRandomX:
+; par1 = Pthis
+; par2d = min
+; par3d = max
+
+ push rbx
+ ; Align Pthis by 16.
+ and par1, -16
+
+ mov ebx, par3d
+ sub ebx, par2d ; max - min
+ jle short M30 ; max <= min (signed)
+ inc ebx ; interval = max - min + 1
+
+ ; if (interval != LastInterval) {
+ cmp ebx, [par1+CRandomSFMTA.LASTINTERVAL]
+ je M10
+ ; need to calculate new rejection limit
+ ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
+ xor eax, eax ; 0
+ lea edx, [eax+1] ; 1
+ div ebx ; (would give overflow if interval = 1)
+ mul ebx
+ dec eax
+ mov [par1+CRandomSFMTA.RLIMIT], eax
+ mov [par1+CRandomSFMTA.LASTINTERVAL], ebx
+M10: mov ebx, par2d ; save min
+
+M20: ; do { // Rejection loop
+ call SFMTBRandom_reg ; random bits (par1 is preserved)
+ ; longran = (uint64)BRandom() * interval;
+ mul dword [par1+CRandomSFMTA.LASTINTERVAL]
+ ; } while (remainder > RLimit);
+ cmp eax, [par1+CRandomSFMTA.RLIMIT]
+ ja M20
+
+ ; return (int32)iran + min
+ lea eax, [rbx+rdx]
+ pop rbx
+ ret
+
+M30: jl M40
+ ; max = min. Return min
+ mov eax, par2d
+ pop rbx
+ ret ; max = min exit
+
+M40: ; max < min: error
+ mov eax, 80000000H ; error exit
+ pop rbx
+ ret
+;SFMTIRandomX ENDP
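+
+; A hedged C sketch of the rejection method above, following the C fragments in
+; the comments (the LastInterval/RLimit fields cache the rejection limit between
+; calls; the field names are assumptions taken from the offsets used here):
+;
+;   #include <stdint.h>
+;   int SFMTIRandomX_ref(CRandomSFMTA * p, int min, int max) {
+;       if (max < min) return (int)0x80000000;        // error
+;       if (max == min) return min;
+;       uint32_t interval = (uint32_t)(max - min) + 1;
+;       if (interval != p->LastInterval) {            // recompute rejection limit
+;           p->RLimit = (uint32_t)(((uint64_t)1 << 32) / interval) * interval - 1;
+;           p->LastInterval = interval;
+;       }
+;       uint64_t longran;
+;       do {                                          // rejection loop
+;           longran = (uint64_t)SFMTBRandom(p) * interval;
+;       } while ((uint32_t)longran > p->RLimit);      // low dword is the remainder
+;       return (int)(longran >> 32) + min;            // high dword + min
+;   }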
+
+
+
+; -------------------------------------------------------------------------
+; Single-threaded static link versions for SFMT generator
+; -------------------------------------------------------------------------
+
+; extern "C" void SFMTgenRandomInit(int seed, int IncludeMother = 0);
+SFMTgenRandomInit:
+; par1d = seed
+; par2d = IncludeMother
+
+ ; set up parameters for call SFMTRandomInit
+ mov par4d, par2d ; IncludeMother
+ mov par3d, par1d ; seed
+ mov par2d, SFMTSize ; ThisSize
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTRandomInit
+;SFMTgenRandomInit ENDP
+
+
+; extern "C" void SFMTgenRandomInitByArray(int const seeds[], int NumSeeds, int IncludeMother = 0);
+SFMTgenRandomInitByArray:
+; par1 = seeds
+; par2d = NumSeeds
+; par3d = IncludeMother
+
+ ; set up parameters for call SFMTRandomInitByArray
+%IFDEF WINDOWS
+ push par3 ; IncludeMother on stack
+ sub rsp, 32 ; empty shadow space
+ mov par4d, par2d ; NumSeeds
+ mov par3, par1 ; seeds
+ mov par2d, SFMTSize ; ThisSize
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ call SFMTRandomInitByArray
+ add rsp, 40
+ ret
+%ELSE ; UNIX
+ mov par5d, par3d ; IncludeMother in register
+ mov par4d, par2d ; NumSeeds
+ mov par3, par1 ; seeds
+ mov par2d, SFMTSize ; ThisSize
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTRandomInitByArray
+%ENDIF
+;SFMTgenRandomInitByArray ENDP
+
+
+; extern "C" double SFMTgenRandom();
+SFMTgenRandom: ; generate random float with 52 bits resolution
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTRandom_reg ; random double
+;SFMTgenRandom ENDP
+
+
+; extern "C" double SFMTgenRandom();
+SFMTgenRandomL: ; generate random float with 63 bits resolution
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTRandomL_reg ; random long double
+;SFMTgenRandomL ENDP
+
+
+; extern "C" int SFMTgenIRandom (int min, int max);
+SFMTgenIRandom:
+ mov par3d, par2d
+ mov par2d, par1d
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTIRandom ; continue in _SFMTIRandom
+;SFMTgenIRandom ENDP
+
+
+; extern "C" int SFMTgenIRandomX (int min, int max);
+SFMTgenIRandomX:
+ mov par3d, par2d
+ mov par2d, par1d
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTIRandomX ; continue in _SFMTIRandomX
+;SFMTgenIRandomX ENDP
+
+
+; extern "C" uint32_t SFMTgenBRandom();
+SFMTgenBRandom: ; generate 32 random bits
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTBRandom_reg ; random bits
+;SFMTgenBRandom ENDP
+
+;END
diff --git a/contrib/libs/asmlib/strcat64.asm b/contrib/libs/asmlib/strcat64.asm
new file mode 100644
index 0000000000..3c8a247e3e
--- /dev/null
+++ b/contrib/libs/asmlib/strcat64.asm
@@ -0,0 +1,70 @@
+%include "defs.asm"
+
+;************************* strcat64.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2008-10-16
+; Description:
+; Faster version of the standard strcat function:
+; char * strcat(char *dest, const char * src);
+; Copies zero-terminated string from src to end of dest.
+;
+; Overriding standard function strcat:
+; The alias ?OVR_strcat is changed to _strcat in the object file if
+; it is desired to override the standard library function strcat.
+;
+; Optimization:
+; Uses optimized functions A_strlen and A_memcpy.
+;
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_strcat: function ; Function A_strcat
+global EXP(strcat): function ; ?OVR removed if standard function strcat overridden
+
+; Imported from strlen64.asm
+extern A_strlen
+
+; Imported from memcpy64.asm
+extern A_memcpy
+
+
+SECTION .text align=16
+
+; extern "C" char * A_strcat(char * dest, const char * src) {
+; memcpy(dest+strlen(dest), src, strlen(src)+1);
+; return dest
+; }
+
+; Function entry:
+A_strcat:
+EXP(strcat):
+
+%IFDEF WINDOWS
+%define Rpar1 rcx ; function parameter 1
+%define Rpar2 rdx ; function parameter 2
+%define Rpar3 r8 ; function parameter 3
+%ENDIF
+%IFDEF UNIX
+%define Rpar1 rdi ; function parameter 1
+%define Rpar2 rsi ; function parameter 2
+%define Rpar3 rdx ; function parameter 3
+%ENDIF
+
+ push Rpar1 ; dest
+ push Rpar2 ; src
+ call A_strlen ; length of dest
+ push rax ; strlen(dest)
+ mov Rpar1, [rsp+8] ; src
+ call A_strlen ; length of src
+ pop Rpar1 ; strlen(dest)
+ pop Rpar2 ; src
+ add Rpar1, [rsp] ; dest + strlen(dest)
+ lea Rpar3, [rax+1] ; strlen(src)+1
+ call A_memcpy ; copy
+ pop rax ; return dest
+ ret
+
+;A_strcat ENDP
diff --git a/contrib/libs/asmlib/strcpy64.asm b/contrib/libs/asmlib/strcpy64.asm
new file mode 100644
index 0000000000..c505c48be7
--- /dev/null
+++ b/contrib/libs/asmlib/strcpy64.asm
@@ -0,0 +1,66 @@
+%include "defs.asm"
+
+;************************* strcpy64.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2011-07-01
+; Description:
+; Faster version of the standard strcpy function:
+; char * A_strcpy(char * dest, const char * src);
+; Copies zero-terminated string from src to dest, including terminating zero.
+;
+; Overriding standard function strcpy:
+; The alias ?OVR_strcpy is changed to _strcpy in the object file if
+; it is desired to override the standard library function strcpy.
+;
+; Optimization:
+; Uses optimized functions A_strlen and A_memcpy. These functions allow
+; calling without proper stack alignment.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_strcpy: function ; Function A_strcpy
+global EXP(strcpy): function ; ?OVR removed if standard function strcpy overridden
+
+; Imported from strlen64.asm
+extern A_strlen
+
+; Imported from memcpy64.asm
+extern A_memcpy
+
+
+SECTION .text align=16
+
+; extern "C" char * A_strcpy(char * dest, const char * src) {
+; return memcpy(dest, src, strlen(src)+1);
+; }
+
+; Function entry:
+A_strcpy:
+EXP(strcpy):
+
+%IFDEF WINDOWS
+%define Rpar1 rcx ; function parameter 1
+%define Rpar2 rdx ; function parameter 2
+%define Rpar3 r8 ; function parameter 3
+%ENDIF
+%IFDEF UNIX
+%define Rpar1 rdi ; function parameter 1
+%define Rpar2 rsi ; function parameter 2
+%define Rpar3 rdx ; function parameter 3
+%ENDIF
+
+ push Rpar1 ; dest
+ push Rpar2 ; src
+ mov Rpar1, Rpar2
+ ; (A_strlen does not require stack alignment)
+ call A_strlen ; length of src
+ lea Rpar3,[rax+1] ; include terminating zero in length
+ pop Rpar2 ; src
+ pop Rpar1 ; dest
+ jmp A_memcpy ; copy and return
+
+;A_strcpy ENDP
diff --git a/contrib/libs/asmlib/stricmp64.asm b/contrib/libs/asmlib/stricmp64.asm
new file mode 100644
index 0000000000..c568832b27
--- /dev/null
+++ b/contrib/libs/asmlib/stricmp64.asm
@@ -0,0 +1,86 @@
+%include "defs.asm"
+
+;************************* stricmpaz64.asm **********************************
+; Author: Agner Fog
+; Date created: 2008-12-05
+; Last modified: 2011-07-01
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Description:
+; Faster version of the standard stricmp or strcasecmp function:
+; int A_stricmp(const char *string1, const char *string2);
+; Compares two zero-terminated strings without case sensitivity.
+; Does not recognize locale-specific characters. Only the ASCII letters A-Z
+; are folded to a-z before comparing; other upper-case letters are not
+; converted and compare as distinct characters.
+;
+; Optimization:
+; SSE4.2 version not implemented because the gain is small.
+;
+; Copyright (c) 2008-2011 GNU General Public License www.gnu.org/licenses/gpl.html
+;******************************************************************************
+
+default rel
+
+global A_stricmp: function ; Function A_stricmp
+
+; ***************************************************************************
+; Define registers used for function parameters, used in 64-bit mode only
+; ***************************************************************************
+
+%IFDEF WINDOWS
+ %define par1 rcx ; first parameter
+ %define par2 rdx ; second parameter
+%ENDIF
+
+%IFDEF UNIX
+ %define par1 rdi ; first parameter
+ %define par2 rsi ; second parameter
+%ENDIF
+
+SECTION .text align=16
+
+; extern "C" int A_stricmp(const char *string1, const char *string2);
+
+A_stricmp:
+ sub par2, par1
+
+L10: mov al, [par1] ; string1
+ cmp al, [par1+par2] ; string2
+ jne L20
+ inc par1
+ test al, al
+ jnz L10 ; continue with next byte
+
+ ; terminating zero found. Strings are equal
+ xor eax, eax
+ ret
+
+L20: ; bytes are different. check case
+ xor al, 20H ; toggle case
+ cmp al, [par1+par2]
+ jne L30
+ ; possibly differing only by case. Check if a-z
+ or al, 20H ; force lower case
+ sub al, 'a'
+ cmp al, 'z'-'a'
+ ja L30 ; not a-z
+ ; a-z and differing only by case
+ inc par1
+ jmp L10 ; continue with next byte
+
+L30: ; bytes are different, even after changing case
+ movzx eax, byte [par1] ; get original value again
+ sub eax, 'A'
+ cmp eax, 'Z' - 'A'
+ ja L40
+ add eax, 20H ; A-Z, make lower case
+L40: movzx edx, byte [par1+par2]
+ sub edx, 'A'
+ cmp edx, 'Z' - 'A'
+ ja L50
+ add edx, 20H ; A-Z, make lower case
+L50: sub eax, edx ; subtract to get result
+ ret
+
+;A_stricmp END
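+
+; A hedged C sketch of the comparison rule implemented above: only the ASCII
+; letters A-Z are folded to lower case before two differing bytes decide the
+; result; everything else is compared as-is:
+;
+;   int A_stricmp_ref(const char * s1, const char * s2) {
+;       for (;; s1++, s2++) {
+;           unsigned a = (unsigned char)*s1, b = (unsigned char)*s2;
+;           unsigned al = (a - 'A' <= (unsigned)('Z' - 'A')) ? a + 0x20 : a;  // fold A-Z only
+;           unsigned bl = (b - 'A' <= (unsigned)('Z' - 'A')) ? b + 0x20 : b;
+;           if (al != bl) return (int)al - (int)bl;   // first difference decides
+;           if (a == 0) return 0;                     // both strings ended: equal
+;       }
+;   }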
diff --git a/contrib/libs/asmlib/strlen64.asm b/contrib/libs/asmlib/strlen64.asm
new file mode 100644
index 0000000000..ff65c10127
--- /dev/null
+++ b/contrib/libs/asmlib/strlen64.asm
@@ -0,0 +1,86 @@
+%include "defs.asm"
+
+;************************** strlen64.asm **********************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2008-10-16
+; Description:
+; Faster version of the standard strlen function:
+; size_t strlen(const char * str);
+; Finds the length of a zero-terminated string of bytes, optimized for speed.
+;
+; Overriding standard function strlen:
+; The alias ?OVR_strlen is changed to _strlen in the object file if
+; it is desired to override the standard library function strlen.
+;
+; Calling conventions:
+; Stack alignment is not required. No shadow space or red zone used.
+; Called internally from strcpy and strcat without the stack being aligned.
+;
+; Optimization:
+; Uses XMM registers to read 16 bytes at a time, aligned.
+; Misaligned parts of the string are read from the nearest preceding 16-byte
+; boundary and the irrelevant bytes are masked out. The code may read both
+; before the beginning of the string and after the end, but it will never load
+; an unnecessary cache line and never trigger a page fault for reading from
+; non-existing memory pages, because it never reads past the nearest following
+; 16-byte boundary. It may, though, trigger a debug watch within the same
+; 16-byte block.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_strlen: function ; Function A_strlen
+global EXP(strlen): function ; ?OVR removed if standard function strlen overridden
+
+
+SECTION .text align=16
+
+; extern "C" int strlen (const char * s);
+
+; 64-bit version (Windows and Unix):
+A_strlen:
+EXP(strlen):
+
+%IFDEF WINDOWS
+ mov rax, rcx ; get pointer to string from rcx
+ mov r8, rcx ; copy pointer
+%define Rscopy r8 ; Copy of s
+
+%ELSE ; Unix
+ mov rax, rdi ; get pointer to string from rdi
+ mov ecx, edi ; copy pointer (lower 32 bits)
+%define Rscopy rdi ; Copy of s
+%ENDIF
+
+ ; rax = s, ecx = 32 bits of s
+ pxor xmm0, xmm0 ; set to zero
+ and ecx, 0FH ; lower 4 bits indicate misalignment
+ and rax, -10H ; align pointer by 16
+ movdqa xmm1, [rax] ; read from nearest preceding boundary
+ pcmpeqb xmm1, xmm0 ; compare 16 bytes with zero
+ pmovmskb edx, xmm1 ; get one bit for each byte result
+ shr edx, cl ; shift out false bits
+ shl edx, cl ; shift back again
+ bsf edx, edx ; find first 1-bit
+ jnz L2 ; found
+
+ ; Main loop, search 16 bytes at a time
+L1: add rax, 10H ; increment pointer by 16
+ movdqa xmm1, [rax] ; read 16 bytes aligned
+ pcmpeqb xmm1, xmm0 ; compare 16 bytes with zero
+ pmovmskb edx, xmm1 ; get one bit for each byte result
+ bsf edx, edx ; find first 1-bit
+ ; (moving the bsf out of the loop and using test here would be faster for long
+ ; strings on old processors, but we assume that most strings are short and give
+ ; newer processors higher priority)
+ jz L1 ; loop if not found
+
+L2: ; Zero-byte found. Compute string length
+ sub rax, Rscopy ; subtract start address
+ add rax, rdx ; add byte index
+ ret
+
+;A_strlen ENDP
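+
+; A hedged C sketch of the scan above, written with SSE2 intrinsics: read aligned
+; 16-byte blocks, compare against zero, and mask off the bytes that lie before
+; the start of the string in the first block (__builtin_ctz is a GCC/Clang
+; builtin; this illustrates the technique rather than replacing the code above):
+;
+;   #include <emmintrin.h>
+;   #include <stddef.h>
+;   #include <stdint.h>
+;   size_t A_strlen_ref(const char * s) {
+;       const __m128i zero = _mm_setzero_si128();
+;       unsigned misalign = (unsigned)((uintptr_t)s & 15);
+;       const char * p = (const char *)((uintptr_t)s & ~(uintptr_t)15);  // align down by 16
+;       unsigned mask = (unsigned)_mm_movemask_epi8(
+;                           _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)p), zero));
+;       mask = (mask >> misalign) << misalign;        // drop bytes before the string
+;       while (mask == 0) {                           // no zero byte in this block
+;           p += 16;
+;           mask = (unsigned)_mm_movemask_epi8(
+;                      _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)p), zero));
+;       }
+;       return (size_t)(p - s) + __builtin_ctz(mask); // block offset + byte index
+;   }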
diff --git a/contrib/libs/asmlib/substring64.asm b/contrib/libs/asmlib/substring64.asm
new file mode 100644
index 0000000000..235b19a5f5
--- /dev/null
+++ b/contrib/libs/asmlib/substring64.asm
@@ -0,0 +1,75 @@
+%include "defs.asm"
+
+;************************* substring64.asm **********************************
+; Author: Agner Fog
+; Date created: 2011-07-18
+; Last modified: 2011-07-18
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Description:
+; Makes a substring of a zero-terminated ASCII string
+;
+; C++ prototype:
+; extern "C"
+; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
+; Makes a substring from source, starting at position pos (zero-based) and length
+; len and stores it in the array dest. It is the responsibility of the programmer
+; that the size of the dest array is at least len + 1.
+; The return value is the actual length of the substring. This may be less than
+; len if the length of source is less than pos + len.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses/gpl.html
+;******************************************************************************
+
+global A_substring: function ; Function _A_substring
+
+extern A_strlen
+extern A_memcpy
+
+SECTION .text
+
+; extern "C"
+; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
+
+%ifdef WINDOWS
+%define par1 rcx ; dest
+%define par2 rdx ; source
+%define par3 r8 ; pos
+%define par4 r9 ; len
+%else ; UNIX
+%define par1 rdi
+%define par2 rsi
+%define par3 rdx
+%define par4 rcx
+%endif
+
+A_substring:
+ push par1
+ push par2
+ push par3
+ push par4
+ mov par1, par2
+ call A_strlen ; rax = strlen(source)
+ pop par4
+ pop par3
+ pop par2
+ pop par1
+ sub rax, par3 ; max length = strlen(source) - pos
+ jbe empty ; strlen(source) <= pos. Return empty string
+ cmp rax, par4
+ cmova rax, par4 ; min(len, maxlen)
+ add par2, par3 ; source + pos = source for memcpy
+ mov par3, rax ; length for memcpy
+ push rax ; new length
+ call A_memcpy
+ pop rcx ; new length = return value, rax = dest
+ mov byte [rcx+rax], 0 ; terminating zero
+ mov rax, rcx ; return new length
+ ret
+
+empty: ; return empty string
+ xor eax, eax ; return 0
+ mov byte [par1], al
+ ret
+
+;A_substring END
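+
+; A hedged C sketch matching the prototype comment above (plain libc strlen and
+; memcpy are used here in place of A_strlen and A_memcpy; the behaviour is the
+; same):
+;
+;   #include <stddef.h>
+;   #include <string.h>
+;   size_t A_substring_ref(char * dest, const char * source, size_t pos, size_t len) {
+;       size_t slen = strlen(source);
+;       if (slen <= pos) { dest[0] = 0; return 0; }   // past the end: empty string
+;       size_t n = slen - pos;
+;       if (n > len) n = len;                         // min(len, strlen(source) - pos)
+;       memcpy(dest, source + pos, n);
+;       dest[n] = 0;                                  // terminating zero
+;       return n;                                     // actual substring length
+;   }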
diff --git a/contrib/libs/asmlib/unalignedisfaster64.asm b/contrib/libs/asmlib/unalignedisfaster64.asm
new file mode 100644
index 0000000000..eed68a1398
--- /dev/null
+++ b/contrib/libs/asmlib/unalignedisfaster64.asm
@@ -0,0 +1,188 @@
+%include "defs.asm"
+
+;************************* unalignedisfaster64.asm ******************************
+; Author: Agner Fog
+; Date created: 2011-07-09
+; Last modified: 2013-08-30
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" int UnalignedIsFaster(void);
+;
+; Description:
+; This function finds out if unaligned 16-bytes memory read is
+; faster than aligned read followed by an alignment shift (PALIGNR) on the
+; current CPU.
+;
+; Return value:
+; 0: Unaligned read is probably slower than alignment shift
+; 1: Unknown
+; 2: Unaligned read is probably faster than alignment shift
+;
+;
+; C++ prototype:
+; extern "C" int Store256BitIsFaster(void);
+;
+; Description:
+; This function finds out if a 32-bytes memory write is
+; faster than two 16-bytes writes on the current CPU.
+;
+; Return value:
+; 0: 32-bytes memory write is slower or AVX not supported
+; 1: Unknown
+; 2: 32-bytes memory write is faster
+;
+; Copyright (c) 2011 - 2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" int UnalignedIsFaster(void);
+
+global UnalignedIsFaster: function
+global Store256BitIsFaster: function
+extern CpuType
+extern InstructionSet
+
+
+SECTION .text
+
+UnalignedIsFaster:
+
+%ifdef UNIX
+ push 0 ; vendor
+ mov rdi, rsp
+ push 0 ; family
+ mov rsi, rsp
+ push 0 ; model
+ mov rdx, rsp
+%else ; WINDOWS
+ push 0 ; vendor
+ mov rcx, rsp
+ push 0 ; family
+ mov rdx, rsp
+ push 0 ; model
+ mov r8, rsp
+%endif
+ call CpuType ; get vendor, family, model
+ pop rdx ; model
+ pop rcx ; family
+ pop r8 ; vendor
+ xor eax, eax ; return value
+ dec r8d
+ jz Intel
+ dec r8d
+ jz AMD
+ dec r8d
+ jz VIA
+ ; unknown vendor
+ inc eax
+ jmp Uend
+
+Intel: ; Unaligned read is faster on Intel Nehalem and later, but not Atom
+ ; Nehalem = family 6, model 1AH
+ ; Atom = family 6, model 1CH
+ ; Netburst = family 0FH
+ ; Future models are likely to be family 6, maybe > 6, model > 1C
+ cmp ecx, 6
+ jb Uend ; old Pentium 1, etc
+ cmp ecx, 0FH
+ je Uend ; old Netburst architecture
+ cmp edx, 1AH
+ jb Uend ; earlier than Nehalem
+ cmp edx, 1CH
+ je Uend ; Intel Atom
+ or eax, 2 ; Intel Nehalem and later, except Atom
+ jmp Uend
+
+AMD: ; AMD processors:
+ ; The PALIGNR instruction is slow on AMD Bobcat but fast on Jaguar
+ ; K10/Opteron = family 10H ; Use unaligned
+ ; Bobcat = family 14H ; PALIGNR is very slow. Use unaligned
+ ; Piledriver = family 15H ; Use unaligned
+ ; Jaguar = family 16H ; PALIGNR is fast. Use aligned (aligned is faster in most cases, but not all)
+ cmp ecx, 10H ; AMD K8 or earlier: use aligned
+ jb Uend
+ cmp ecx, 16H ; Jaguar: use aligned
+ je Uend
+ or eax, 2 ; AMD K10 or later: use unaligned
+ jmp Uend
+
+VIA: ; Unaligned read is not faster than PALIGNR on VIA Nano 2000 and 3000
+ cmp ecx, 0FH
+ jna Uend ; VIA Nano
+ inc eax ; Future versions: unknown
+ ;jmp Uend
+
+Uend: ret
+
+;UnalignedIsFaster ENDP
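+
+; A hedged C sketch of the decision table above. The CpuType prototype
+; (void CpuType(int * vendor, int * family, int * model), with vendor 1=Intel,
+; 2=AMD, 3=VIA) is assumed from the three pointer arguments and the dec/jz chain
+; used in this function:
+;
+;   int UnalignedIsFaster_ref(void) {
+;       int vendor = 0, family = 0, model = 0;
+;       CpuType(&vendor, &family, &model);
+;       if (vendor == 1) {                                       // Intel
+;           if (family < 6 || family == 0x0F) return 0;          // pre-P6 or Netburst
+;           if (model < 0x1A || model == 0x1C) return 0;         // pre-Nehalem or Atom
+;           return 2;                                            // Nehalem and later
+;       }
+;       if (vendor == 2)                                         // AMD
+;           return (family < 0x10 || family == 0x16) ? 0 : 2;    // K8 and Jaguar: aligned
+;       if (vendor == 3)                                         // VIA
+;           return (family <= 0x0F) ? 0 : 1;                     // Nano: aligned; later: unknown
+;       return 1;                                                // unknown vendor
+;   }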
+
+
+Store256BitIsFaster:
+ call InstructionSet
+ cmp eax, 11 ; AVX supported
+ jb S90
+%ifdef UNIX
+ push 0 ; vendor
+ mov rdi, rsp
+ push 0 ; family
+ mov rsi, rsp
+ push 0 ; model
+ mov rdx, rsp
+%else ; WINDOWS
+ push 0 ; vendor
+ mov rcx, rsp
+ push 0 ; family
+ mov rdx, rsp
+ push 0 ; model
+ mov r8, rsp
+%endif
+ call CpuType ; get vendor, family, model
+ pop rdx ; model
+ pop rcx ; family
+ pop rax ; vendor
+
+ cmp eax, 1 ; Intel
+ je S_Intel
+ cmp eax, 2 ; AMD
+ je S_AMD
+ cmp eax, 3
+ je S_VIA
+ jmp S91 ; other vendor, not known
+
+S_Intel:cmp ecx, 6
+ jne S92 ; unknown family, possibly a future model
+ ; model 2AH Sandy Bridge
+ ; model 3AH Ivy Bridge
+ ; model 3CH Haswell
+ ; Sandy Bridge and Ivy Bridge are slightly faster with 128 than with 256 bit moves on large data blocks
+ ; Haswell is much faster with 256 bit moves
+ cmp edx, 3AH
+ jbe S90
+ jmp S92
+
+S_AMD: ; AMD
+ cmp ecx, 15H ; family 15h = Bulldozer, Piledriver
+ ja S92 ; assume future AMD families are faster
+ ; family 16H = Jaguar. 256 bit write is slightly faster
+ ; model 1 = Bulldozer is a little slower on 256 bit write
+ ; model 2 = Piledriver is terribly slow on 256 bit write
+ ; assume future models 3-4 are like Bulldozer
+ cmp edx, 4
+ jbe S90
+ jmp S91 ; later models: don't know
+
+S_VIA: jmp S91 ; don't know
+
+S90: xor eax, eax ; return 0
+ ret
+
+S91: mov eax, 1 ; return 1
+ ret
+
+S92: mov eax, 2 ; return 2
+ ret
+
+; Store256BitIsFaster ENDP
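+
+; A hedged C sketch of the logic above (InstructionSet() is the asmlib function
+; used here; its return value 11 is taken to mean "AVX supported", following the
+; comment at the top of this routine):
+;
+;   int Store256BitIsFaster_ref(void) {
+;       if (InstructionSet() < 11) return 0;                     // no AVX: no 256-bit store
+;       int vendor = 0, family = 0, model = 0;
+;       CpuType(&vendor, &family, &model);
+;       if (vendor == 1)                                         // Intel
+;           return (family == 6 && model <= 0x3A) ? 0 : 2;       // Sandy/Ivy Bridge: prefer 128-bit
+;       if (vendor == 2) {                                       // AMD
+;           if (family > 0x15) return 2;                         // assume later families are faster
+;           return (model <= 4) ? 0 : 1;                         // Bulldozer/Piledriver: slower
+;       }
+;       return 1;                                                // VIA or unknown vendor
+;   }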