Extract asmlib manipulations into separate block

author: thegeorg <thegeorg@yandex-team.com> 2023-08-22 18:56:30 +0300
committer: thegeorg <thegeorg@yandex-team.com> 2023-08-22 19:13:38 +0300
commit: 769d14120ef8e30363c7dd6870ce1b82552587c3 (patch)
tree: c407d1d3f152b9f6eb13f50abc3f5b06db82f9b3
parent: 494eee7cbbaf3e7d71a133c80c96aec26e518c2a (diff)
download: ydb-769d14120ef8e30363c7dd6870ce1b82552587c3.tar.gz
49 files changed, 9 insertions, 9332 deletions
diff --git a/contrib/libs/CMakeLists.darwin-x86_64.txt b/contrib/libs/CMakeLists.darwin-x86_64.txt
index fa9b8be410..d96017ec15 100644
--- a/contrib/libs/CMakeLists.darwin-x86_64.txt
+++ b/contrib/libs/CMakeLists.darwin-x86_64.txt
@@ -8,8 +8,6 @@
 
 add_subdirectory(antlr3_cpp_runtime)
 add_subdirectory(apache)
-add_subdirectory(asmglibc)
-add_subdirectory(asmlib)
 add_subdirectory(aws-sdk-cpp)
 add_subdirectory(base64)
 add_subdirectory(brotli)
diff --git a/contrib/libs/CMakeLists.linux-aarch64.txt b/contrib/libs/CMakeLists.linux-aarch64.txt
index 358f97add0..ced3728c58 100644
--- a/contrib/libs/CMakeLists.linux-aarch64.txt
+++ b/contrib/libs/CMakeLists.linux-aarch64.txt
@@ -8,7 +8,6 @@
 
 add_subdirectory(antlr3_cpp_runtime)
 add_subdirectory(apache)
-add_subdirectory(asmlib)
 add_subdirectory(aws-sdk-cpp)
 add_subdirectory(base64)
 add_subdirectory(brotli)
diff --git a/contrib/libs/CMakeLists.linux-x86_64.txt b/contrib/libs/CMakeLists.linux-x86_64.txt
index ac47fb1732..a1dec91afd 100644
--- a/contrib/libs/CMakeLists.linux-x86_64.txt
+++ b/contrib/libs/CMakeLists.linux-x86_64.txt
@@ -8,7 +8,6 @@
 
 add_subdirectory(antlr3_cpp_runtime)
 add_subdirectory(apache)
-add_subdirectory(asmlib)
 add_subdirectory(aws-sdk-cpp)
 add_subdirectory(base64)
 add_subdirectory(brotli)
diff --git a/contrib/libs/CMakeLists.windows-x86_64.txt b/contrib/libs/CMakeLists.windows-x86_64.txt
index 99a7d95650..0c3e7223c9 100644
--- a/contrib/libs/CMakeLists.windows-x86_64.txt
+++ b/contrib/libs/CMakeLists.windows-x86_64.txt
@@ -8,7 +8,6 @@
 
 add_subdirectory(antlr3_cpp_runtime)
 add_subdirectory(apache)
-add_subdirectory(asmlib)
 add_subdirectory(aws-sdk-cpp)
 add_subdirectory(base64)
 add_subdirectory(brotli)
diff --git a/contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt b/contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt
deleted file mode 100644
index e2b4e37fbb..0000000000
--- a/contrib/libs/asmglibc/CMakeLists.darwin-x86_64.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-
-add_library(contrib-libs-asmglibc)
-target_sources(contrib-libs-asmglibc PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmglibc/memchr.S
-)
diff --git a/contrib/libs/asmglibc/CMakeLists.txt b/contrib/libs/asmglibc/CMakeLists.txt
deleted file mode 100644
index 661b6431cc..0000000000
--- a/contrib/libs/asmglibc/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
-  include(CMakeLists.darwin-x86_64.txt)
-endif()
diff --git a/contrib/libs/asmglibc/memchr.S b/contrib/libs/asmglibc/memchr.S
deleted file mode 100644
index b0a51115c4..0000000000
--- a/contrib/libs/asmglibc/memchr.S
+++ /dev/null
@@ -1,330 +0,0 @@
-/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include "sysdep.h"
-
-#ifdef USE_AS_WMEMCHR
-# define MEMCHR		wmemchr
-# define PCMPEQ		pcmpeqd
-#else
-# define MEMCHR		memchr
-# define PCMPEQ		pcmpeqb
-#endif
-
-/* fast SSE2 version with using pmaxub and 64 byte loop */
-
-	.text
-ENTRY(MEMCHR)
-	movd	%esi, %xmm1
-	mov	%edi, %ecx
-
-#ifdef USE_AS_WMEMCHR
-	test	%rdx, %rdx
-	jz	L(return_null)
-	shl	$2, %rdx
-#else
-	punpcklbw %xmm1, %xmm1
-	test	%rdx, %rdx
-	jz	L(return_null)
-	punpcklbw %xmm1, %xmm1
-#endif
-
-	and	$63, %ecx
-	pshufd	$0, %xmm1, %xmm1
-
-	cmp	$48, %ecx
-	ja	L(crosscache)
-
-	movdqu	(%rdi), %xmm0
-	PCMPEQ	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-
-	jnz	L(matches_1)
-	sub	$16, %rdx
-	jbe	L(return_null)
-	add	$16, %rdi
-	and	$15, %ecx
-	and	$-16, %rdi
-	add	%rcx, %rdx
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-	jmp	L(loop_prolog)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %ecx
-	and	$-16, %rdi
-	movdqa	(%rdi), %xmm0
-
-	PCMPEQ	%xmm1, %xmm0
-/* Check if there is a match.  */
-	pmovmskb %xmm0, %eax
-/* Remove the leading bytes.  */
-	sar	%cl, %eax
-	test	%eax, %eax
-	je	L(unaligned_no_match)
-/* Check which byte is a match.  */
-	bsf	%eax, %eax
-
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	add	%rdi, %rax
-	add	%rcx, %rax
-	ret
-
-	.p2align 4
-L(unaligned_no_match):
-        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
-	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
-	   possible addition overflow.  */
-	neg	%rcx
-	add	$16, %rcx
-	sub	%rcx, %rdx
-	jbe	L(return_null)
-	add	$16, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	.p2align 4
-L(loop_prolog):
-	movdqa	(%rdi), %xmm0
-	PCMPEQ	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
-	PCMPEQ	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	PCMPEQ	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	48(%rdi), %xmm4
-	PCMPEQ	%xmm1, %xmm4
-	add	$64, %rdi
-	pmovmskb %xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	test	$0x3f, %rdi
-	jz	L(align64_loop)
-
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	(%rdi), %xmm0
-	PCMPEQ	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
-	PCMPEQ	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	PCMPEQ	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	48(%rdi), %xmm3
-	PCMPEQ	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-
-	add	$64, %rdi
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	mov	%rdi, %rcx
-	and	$-64, %rdi
-	and	$63, %ecx
-	add	%rcx, %rdx
-
-	.p2align 4
-L(align64_loop):
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	movdqa	48(%rdi), %xmm4
-
-	PCMPEQ	%xmm1, %xmm0
-	PCMPEQ	%xmm1, %xmm2
-	PCMPEQ	%xmm1, %xmm3
-	PCMPEQ	%xmm1, %xmm4
-
-	pmaxub	%xmm0, %xmm3
-	pmaxub	%xmm2, %xmm4
-	pmaxub	%xmm3, %xmm4
-	pmovmskb %xmm4, %eax
-
-	add	$64, %rdi
-
-	test	%eax, %eax
-	jz	L(align64_loop)
-
-	sub	$64, %rdi
-
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	PCMPEQ	%xmm1, %xmm3
-
-	PCMPEQ	48(%rdi), %xmm1
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	pmovmskb %xmm1, %eax
-	bsf	%eax, %eax
-	lea	48(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(exit_loop):
-	add	$32, %edx
-	jle	L(exit_loop_32)
-
-	movdqa	(%rdi), %xmm0
-	PCMPEQ	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
-	PCMPEQ	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	PCMPEQ	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32_1)
-	sub	$16, %edx
-	jle	L(return_null)
-
-	PCMPEQ	48(%rdi), %xmm1
-	pmovmskb %xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches48_1)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(exit_loop_32):
-	add	$32, %edx
-	movdqa	(%rdi), %xmm0
-	PCMPEQ	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches_1)
-	sub	$16, %edx
-	jbe	L(return_null)
-
-	PCMPEQ	16(%rdi), %xmm1
-	pmovmskb %xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches16_1)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(matches0):
-	bsf	%eax, %eax
-	lea	-16(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches):
-	bsf	%eax, %eax
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16):
-	bsf	%eax, %eax
-	lea	16(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches32):
-	bsf	%eax, %eax
-	lea	32(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches_1):
-	bsf	%eax, %eax
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16_1):
-	bsf	%eax, %eax
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	lea	16(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(matches32_1):
-	bsf	%eax, %eax
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	lea	32(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(matches48_1):
-	bsf	%eax, %eax
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	lea	48(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%eax, %eax
-	ret
-END(MEMCHR)
-
-#ifndef USE_AS_WMEMCHR
-strong_alias (memchr, __memchr)
-libc_hidden_builtin_def(memchr)
-#endif
-\ No newline at end of file
diff --git a/contrib/libs/asmglibc/sysdep.h b/contrib/libs/asmglibc/sysdep.h
deleted file mode 100644
index 1cfb71673e..0000000000
--- a/contrib/libs/asmglibc/sysdep.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#if defined(__APPLE__)
-    #define ENTRY(X) .globl _## X; .align 1<<3; _ ## X:
-    #define END(X)
-    #define L(X) L ## X
-#else
-    #define ENTRY(X) .globl X; .type X,@function; .align 1<<4; X: .cfi_startproc;
-    #define END(X) .cfi_endproc; .size X,.-X;
-    #define L(X) .L ## X
-#endif
-
-#define libc_hidden_builtin_def(X)
-#define strong_alias(X, Y)
diff --git a/contrib/libs/asmglibc/ya.make b/contrib/libs/asmglibc/ya.make
deleted file mode 100644
index c64ea8388a..0000000000
--- a/contrib/libs/asmglibc/ya.make
+++ /dev/null
@@ -1,17 +0,0 @@
-LIBRARY()
-
-LICENSE(LGPL-2.1-or-later)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
-VERSION(2.27)
-
-ORIGINAL_SOURCE(http://ftp.gnu.org/gnu/glibc/)
-
-NO_PLATFORM()
-
-SRCS(
-    memchr.S
-)
-
-END()
diff --git a/contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt b/contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt
deleted file mode 100644
index 56e892f3a2..0000000000
--- a/contrib/libs/asmlib/CMakeLists.darwin-x86_64.txt
+++ /dev/null
@@ -1,192 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-
-add_library(contrib-libs-asmlib)
-target_link_libraries(contrib-libs-asmlib PUBLIC
-  contrib-libs-asmglibc
-)
-target_sources(contrib-libs-asmlib PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/debugbreak64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cachesize64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedi64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/rdtsc64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcat64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/unalignedisfaster64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcpy64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/substring64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strlen64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cputype64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcmp64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memmove64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/stricmp64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedv64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/physseed64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cpuid64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/round64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcpy64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/popcount64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dispatchpatch64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/procname64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memset64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
diff --git a/contrib/libs/asmlib/CMakeLists.linux-aarch64.txt b/contrib/libs/asmlib/CMakeLists.linux-aarch64.txt
deleted file mode 100644
index d29b43c90a..0000000000
--- a/contrib/libs/asmlib/CMakeLists.linux-aarch64.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-
-add_library(contrib-libs-asmlib)
-target_link_libraries(contrib-libs-asmlib PUBLIC
-  contrib-libs-linux-headers
-)
-target_sources(contrib-libs-asmlib PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c
-)
diff --git a/contrib/libs/asmlib/CMakeLists.linux-x86_64.txt b/contrib/libs/asmlib/CMakeLists.linux-x86_64.txt
deleted file mode 100644
index e4b9975e9f..0000000000
--- a/contrib/libs/asmlib/CMakeLists.linux-x86_64.txt
+++ /dev/null
@@ -1,216 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-
-add_library(contrib-libs-asmlib)
-target_link_libraries(contrib-libs-asmlib PUBLIC
-  contrib-libs-linux-headers
-)
-target_sources(contrib-libs-asmlib PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/sfmt64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mother64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mersenne64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/debugbreak64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cachesize64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedi64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/rdtsc64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcat64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/unalignedisfaster64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcpy64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/substring64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strlen64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cputype64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcmp64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memmove64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/stricmp64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedv64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/physseed64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cpuid64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/round64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcpy64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/popcount64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dispatchpatch64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/procname64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memset64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
diff --git a/contrib/libs/asmlib/CMakeLists.txt b/contrib/libs/asmlib/CMakeLists.txt
deleted file mode 100644
index f8b31df0c1..0000000000
--- a/contrib/libs/asmlib/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
-  include(CMakeLists.linux-aarch64.txt)
-elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
-  include(CMakeLists.darwin-x86_64.txt)
-elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
-  include(CMakeLists.windows-x86_64.txt)
-elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
-  include(CMakeLists.linux-x86_64.txt)
-endif()
diff --git a/contrib/libs/asmlib/CMakeLists.windows-x86_64.txt b/contrib/libs/asmlib/CMakeLists.windows-x86_64.txt
deleted file mode 100644
index 6e1a2adde6..0000000000
--- a/contrib/libs/asmlib/CMakeLists.windows-x86_64.txt
+++ /dev/null
@@ -1,213 +0,0 @@
-
-# This file was generated by the build system used internally in the Yandex monorepo.
-# Only simple modifications are allowed (adding source-files to targets, adding simple properties
-# like target_include_directories). These modifications will be ported to original
-# ya.make files by maintainers. Any complex modifications which can't be ported back to the
-# original buildsystem will not be accepted.
-
-
-
-add_library(contrib-libs-asmlib)
-target_sources(contrib-libs-asmlib PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dummy.c
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/sfmt64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mother64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/mersenne64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/debugbreak64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cachesize64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedi64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/rdtsc64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcat64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/unalignedisfaster64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strcpy64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/substring64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/strlen64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cputype64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcmp64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memmove64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/stricmp64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/divfixedv64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/physseed64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/cpuid64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/round64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memcpy64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/popcount64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/dispatchpatch64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/procname64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
-target_yasm_source(contrib-libs-asmlib
-  PRIVATE
-  ${CMAKE_SOURCE_DIR}/contrib/libs/asmlib/memset64.asm
-  -I
-  ${CMAKE_BINARY_DIR}
-  -I
-  ${CMAKE_SOURCE_DIR}
-)
diff --git a/contrib/libs/asmlib/cachesize64.asm b/contrib/libs/asmlib/cachesize64.asm
deleted file mode 100644
index c0bce8cf74..0000000000
--- a/contrib/libs/asmlib/cachesize64.asm
+++ /dev/null
@@ -1,335 +0,0 @@
-%include "defs.asm"
-
-;*************************  cachesize64.asm  *************************************
-; Author:           Agner Fog
-; Date created:     2011-07-11
-; Last modified:    2013-08-14
-; Description:
-; Determines the size of the data caches 
-;
-; extern "C" site_t DataCacheSize(int level);
-; Input: 
-; level: n = 1 - 4: level n data cache
-;        0 = largest level data cache
-; Return value: size in bytes of data cache
-;
-; The latest version of this file is available at:
-; www.agner.org/optimize/asmexamples.zip
-; Copyright (c) 2011-2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global DataCacheSize: function
-
-; Imported from cputype64.asm
-extern CpuType                         ; near. Determine CPU vendor
-
-struc   data_layout
-ok:     resd    2
-level1: resq    1
-level2: resq    1
-level3: resq    1
-level4: resq    1
-descriptortable: resd 60
-endstruc
-
-struc   descriptor_record              ; record for table of cache descriptors
-d_key:          resb 1                 ; key from cpuid instruction
-d_level:        resb 1                 ; cache level
-d_sizem:        resb 1                 ; size multiplier
-d_2pow:         resb 1                 ; power of 2. size = d_sizem << d_2pow
-endstruc
-
-SECTION .data
-
-dataref:                               ; reference point
-ok_:       DD      0, 0                ; 1 when values are determined
-level1_:   DQ      0                   ; level 1 data cache size
-level2_:   DQ      0                   ; level 2 data cache size
-level3_:   DQ      0                   ; level 3 data cache size
-level4_:   DQ      0                   ; level 4 data cache size
-numlevels  equ     4                   ; max level
-
-; From "Intel Processor Identification and the CPUID Instruction, Application note 485
-descriptortable_:                      ; table of Intel cache descriptors
-db 0Ah, 1, 1, 13                       ; 8 kb L1 data cache
-db 0Ch, 1, 1, 14                       ; 16 kb L1 data cache
-db 0Dh, 1, 1, 14                       ; 16 kb L1 data cache
-db 21h, 2, 1, 18                       ; 256 kb L2 data cache
-db 22h, 3, 1, 19                       ; 512 kb L3 data cache
-db 23h, 3, 1, 20                       ; 1 Mb L3 data cache
-db 25h, 3, 1, 21                       ; 2 Mb L3 data cache
-db 29h, 3, 1, 22                       ; 4 Mb L3 data cache
-db 2Ch, 1, 1, 15                       ; 32 kb L1 data cache
-db 39h, 2, 1, 17                       ; 128 kb L2 data cache
-db 3Ah, 2, 3, 16                       ; 192 kb L2 data cache
-db 3Bh, 2, 1, 17                       ; 128 kb L1 data cache
-db 3Ch, 2, 1, 18                       ; 256 kb L1 data cache
-db 3Dh, 2, 3, 17                       ; 384 kb L2 data cache
-db 3Eh, 2, 1, 19                       ; 512 kb L2 data cache
-db 41h, 2, 1, 17                       ; 128 kb L2 data cache
-db 42h, 2, 1, 18                       ; 256 kb L2 data cache
-db 43h, 2, 1, 19                       ; 512 kb L2 data cache
-db 44h, 2, 1, 20                       ; 1 Mb L2 data cache
-db 45h, 2, 1, 21                       ; 2 Mb L2 data cache
-db 46h, 3, 1, 22                       ; 4 Mb L3 data cache
-db 47h, 3, 1, 23                       ; 8 Mb L3 data cache
-db 48h, 2, 3, 20                       ; 3 Mb L2 data cache
-db 49h, 2, 1, 22                       ; 4 Mb L2 or 3 data cache
-db 4Ah, 3, 3, 21                       ; 6 Mb L3 data cache
-db 4Bh, 3, 1, 23                       ; 8 Mb L3 data cache
-db 4Ch, 3, 3, 22                       ; 12 Mb L3 data cache
-db 4Dh, 3, 1, 24                       ; 16 Mb L3 data cache
-db 4Eh, 2, 3, 21                       ; 6 Mb L2 data cache
-db 60h, 1, 1, 14                       ; 16 kb L1 data cache
-db 66h, 1, 1, 13                       ; 8 kb L1 data cache
-db 67h, 1, 1, 14                       ; 16 kb L1 data cache
-db 68h, 1, 1, 15                       ; 32 kb L1 data cache
-db 78h, 2, 1, 20                       ; 1 Mb L2 data cache
-db 79h, 2, 1, 17                       ; 128 kb L2 data cache
-db 7Ah, 2, 1, 18                       ; 256 kb L2 data cache
-db 7Bh, 2, 1, 19                       ; 512 kb L2 data cache
-db 7Ch, 2, 1, 20                       ; 1 Mb L2 data cache
-db 7Dh, 2, 1, 21                       ; 2 Mb L2 data cache
-db 7Fh, 2, 1, 19                       ; 512 kb L2 data cache
-db 82h, 2, 1, 18                       ; 256 kb L2 data cache
-db 83h, 2, 1, 19                       ; 512 kb L2 data cache
-db 84h, 2, 1, 20                       ; 1 Mb L2 data cache
-db 85h, 2, 1, 21                       ; 2 Mb L2 data cache
-db 86h, 2, 1, 19                       ; 512 kb L2 data cache
-db 87h, 2, 1, 20                       ; 1 Mb L2 data cache
-db 0D0h, 3, 1, 19                      ; 512 kb L3 data cache
-db 0D1h, 3, 1, 20                      ; 1 Mb L3 data cache
-db 0D2h, 3, 1, 21                      ; 2 Mb L3 data cache
-db 0D6h, 3, 1, 20                      ; 1 Mb L3 data cache
-db 0D7h, 3, 1, 21                      ; 2 Mb L3 data cache
-db 0D8h, 3, 1, 22                      ; 4 Mb L3 data cache
-db 0DCh, 3, 3, 19                      ; 1.5 Mb L3 data cache
-db 0DDh, 3, 3, 20                      ; 3 Mb L3 data cache
-db 0DEh, 3, 3, 21                      ; 6 Mb L3 data cache
-db 0E2h, 3, 1, 21                      ; 2 Mb L3 data cache
-db 0E3h, 3, 1, 22                      ; 4 Mb L3 data cache
-db 0E4h, 3, 1, 23                      ; 8 Mb L3 data cache
-db 0EAh, 3, 3, 22                      ; 12 Mb L3 data cache
-db 0EBh, 3, 9, 21                      ; 18 Mb L3 data cache
-db 0ECh, 3, 3, 23                      ; 24 Mb L3 data cache
-descriptortablelength equ ($ - descriptortable_) / descriptor_record_size
-
-
-SECTION .text
-
-; extern "C" site_t DataCacheSize(int level);
-
-; Function entry:
-DataCacheSize:
-        push    rbx
-        push    r14
-%ifdef  WINDOWS
-        push    rsi
-        push    rdi
-        mov     r14d, ecx              ; level
-%else   ; UNIX
-        mov     r14d, edi              ; level
-%endif
-        ; check if called before
-        lea     r9, [dataref]
-        cmp     dword [r9+ok], 1       ; ok
-        je      D800
-        
-        ; find cpu vendor
-        push    0
-%ifdef  WINDOWS
-        mov     rcx, rsp
-        xor     edx, edx
-        xor     r8d, r8d
-%else   ; UNIX
-        mov     rdi, rsp
-        xor     esi, esi
-        xor     edx, edx
-%endif        
-        call    CpuType
-        lea     r9, [dataref]
-        pop     rax                    ; eax = vendor
-        dec     eax
-        jz      Intel
-        dec     eax
-        jz      AMD
-        dec     eax
-        jz      VIA
-        ; unknown vendor, try all methods
-        call    IntelNewMethod
-        jnc     D800                   ; not carry = success
-        call    AMDMethod
-        jnc     D800                   ; not carry = success
-        call    IntelOldMethod
-        jmp     D800                   ; return whether success or not
-        
-Intel:  call    IntelNewMethod
-        jnc     D800                   ; not carry = success
-        call    IntelOldMethod
-        jmp     D800                   ; return whether success or not
-
-AMD:    ; AMD and VIA use same method
-VIA:    call    AMDMethod
-        
-D800:   ; cache data known, get desired return value
-        xor     eax, eax
-        cmp     r14d, numlevels
-        ja      D900
-        cmp     r14d, 0
-        je      D820
-        ; level = 1 .. numlevels
-        mov     rax, [r9 + r14*8]      ; size of selected cache
-        jmp     D850
-D820:   ; level = 0. Get size of largest level cache
-        mov     rax, [r9 + level3]     ; level3
-        test    rax, rax
-        jnz     D850
-        mov     rax, [r9 + level2]     ; level2
-        test    rax, rax
-        jnz     D850
-        mov     eax, [r9 + level1]     ; level1
-D850:   mov     dword [r9 + ok], 1     ; remember called, whether success or not
-D900:   
-%ifdef  WINDOWS
-        pop     rdi
-        pop     rsi
-%endif
-        pop     r14
-        pop     rbx
-        ret
-
-
-; Determine cache sizes by CPUID function 4
-; input: esi = pointer to dataref
-; output: values returned in dataref + level1, level2, level3
-; carry flag = 0 on succes
-IntelNewMethod:
-        xor     eax, eax
-        cpuid                          ; get number of CPUID functions
-        cmp     eax, 4
-        jb      I900                   ; fail
-        xor     esi, esi               ; loop counter
-I100:   mov     eax, 4
-        mov     ecx, esi
-        cpuid                          ; get cache parameters
-        mov     edx, eax
-        and     edx, 11111b            ; cache type
-        jz      I500                   ; no more caches
-        cmp     edx, 2
-        je      I200                   ; code cache, ignore
-        inc     ecx                    ; sets
-        mov     edx, ebx
-        shr     edx, 22
-        inc     edx                    ; ways
-        imul    ecx, edx
-        mov     edx, ebx
-        shr     edx, 12
-        and     edx, 1111111111b
-        inc     edx                    ; partitions
-        imul    ecx, edx
-        and     ebx, 111111111111b        
-        inc     ebx                    ; line size
-        imul    rcx, rbx               ; calculated cache size (64 bit)
-        shr     eax, 5
-        and     eax, 111b              ; cache level
-        cmp     eax, numlevels
-        jna     I180
-        mov     eax, numlevels         ; limit higher levels
-I180:   mov     [r9+rax*8], rcx        ; store size of data cache level eax
-I200:   inc     esi
-        cmp     esi, 100h              ; avoid infinite loop
-        jb      I100                   ; next cache
-I500:   ; loop finished
-        ; check if OK
-        mov     eax, [r9+level1]       ; level1
-        cmp     eax, 1024
-I900:   ret                            ; carry flag set if fail
-
-; Determine cache sizes by CPUID function 2
-; input: esi = pointer to dataref
-; output: values returned in dataref + level1, level2, level3
-; carry flag = 0 on succes
-IntelOldMethod:
-        xor     eax, eax
-        cpuid                          ; get number of CPUID functions
-        cmp     eax, 2
-        jb      J900                   ; fail
-        mov     eax, 2
-        xor     ecx, ecx
-        cpuid                          ; get 16 descriptor bytes in eax, ebx, ecx, edx
-        mov     al, 0                  ; al does not contain a descriptor
-        sub     rsp, 16
-        mov     [rsp],    eax          ; save all descriptors
-        mov     [rsp+4],  ebx
-        mov     [rsp+8],  ecx
-        mov     [rsp+12], edx
-        mov     edx, 15                ; loop counter
-        ; loop to read 16 descriptor bytes
-J100:   mov     al, byte [rsp+rdx]
-        ; find in table
-        mov     ebx, descriptortablelength-1  ; loop counter
-        ; loop to search in descriptortable
-J200:   cmp     al, [r9 + descriptortable + rbx*4 + d_key]
-        jne     J300
-        ; descriptor found
-        movzx   eax, byte [r9 + descriptortable + rbx*4 + d_sizem]
-        mov     cl,  [r9 + descriptortable + rbx*4 + d_2pow]
-        shl     eax, cl                ; compute size
-        movzx   ecx, byte [r9 + descriptortable + rbx*4 + d_level]
-        ; check that level = 1-3
-        cmp     ecx, 3
-        ja      J300
-        mov     [r9+rcx*8], rax        ; store size eax of data cache level ecx
-J300:   dec     ebx
-        jns     J200                   ; inner loop
-        dec     edx
-        jns     J100                   ; outer loop
-        add     rsp, 16                ; remove from stack
-        ; check if OK
-        mov     eax, [r9 + level1]
-        cmp     eax, 1024
-J900:   ret                            ; carry flag set if fail
-
-
-; Determine cache sizes by CPUID function 80000005H - 80000006H
-; input: esi = pointer to dataref
-; output: values returned in dataref
-; carry flag = 0 on succes
-AMDMethod:
-        mov     eax, 80000000H
-        cpuid                          ; get number of CPUID functions
-        cmp     eax, 6
-        jb      K900                   ; fail
-        mov     eax, 80000005H
-        cpuid                          ; get L1 cache size
-        shr     ecx, 24                ; L1 data cache size in kbytes
-        shl     ecx, 10                ; L1 data cache size in bytes
-        mov     [r9 + level1], ecx     ; store L1 data cache size
-        mov     eax, 80000006H
-        cpuid                          ; get L2 and L3 cache sizes
-        shr     ecx, 16                ; L2 data cache size in kbytes
-        shl     ecx, 10                ; L2 data cache size in bytes
-        mov     [r9 + level2], ecx     ; store L2 data cache size
-        mov     ecx, edx
-        shr     ecx, 18                ; L3 data cache size / 512 kbytes
-        shl     rcx, 19                ; L3 data cache size in bytes
-%if 0   ; AMD manual is unclear: 
-        ; do we have to increase the value if the number of ways is not a power or 2?
-        shr     edx, 12
-        and     edx, 1111b             ; L3 associativity
-        cmp     edx, 3
-        jb      K100
-        test    edx, 1
-        jz      K100
-        ; number of ways is not a power of 2, multiply by 1.5 ?
-        mov     rax, rcx
-        shr     rax, 1
-        add     rcx, rax
-%endif
-K100:   mov     [r9 + level3], rcx     ; store L3 data cache size
-        ; check if OK
-        mov     eax, [r9 + level1]
-        cmp     eax, 1024
-K900:   ret                            ; carry flag set if fail
diff --git a/contrib/libs/asmlib/cpuid64.asm b/contrib/libs/asmlib/cpuid64.asm
deleted file mode 100644
index 95f1b5a22d..0000000000
--- a/contrib/libs/asmlib/cpuid64.asm
+++ /dev/null
@@ -1,55 +0,0 @@
-%include "defs.asm"
-
-;*************************  cpuid64.asm  *********************************
-; Author:           Agner Fog
-; Date created:     2008-12-14
-; Last modified:    2011-07-01
-; Source URL:       www.agner.org/optimize
-; Project:          asmlib.zip
-; Description:
-; This function calls the CPUID instruction.
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global cpuid_ex: function
-
-SECTION .text  align=16
-
-; ********** cpuid_ex function **********
-; C++ prototype:
-; extern "C" void cpuid_ex (int abcd[4], int a, int c);
-; Input: a = eax, c = ecx
-; Output: abcd[0] = eax, abcd[1] = ebx, abcd[2] = ecx, abcd[3] = edx
-
-
-cpuid_ex:
-
-%IFDEF   WINDOWS
-; parameters: rcx = abcd, edx = a, r8d = c
-        push    rbx
-        xchg    rcx, r8
-        mov     eax, edx
-        cpuid                          ; input eax, ecx. output eax, ebx, ecx, edx
-        mov     [r8],    eax
-        mov     [r8+4],  ebx
-        mov     [r8+8],  ecx
-        mov     [r8+12], edx
-        pop     rbx
-%ENDIF        
-%IFDEF   UNIX
-; parameters: rdi = abcd, esi = a, edx = c
-        push    rbx
-        mov     eax, esi
-        mov     ecx, edx
-        cpuid                          ; input eax, ecx. output eax, ebx, ecx, edx
-        mov     [rdi],    eax
-        mov     [rdi+4],  ebx
-        mov     [rdi+8],  ecx
-        mov     [rdi+12], edx
-        pop     rbx
-%ENDIF        
-        ret
-;cpuid_ex END
diff --git a/contrib/libs/asmlib/cputype64.asm b/contrib/libs/asmlib/cputype64.asm
deleted file mode 100644
index 633ebee86a..0000000000
--- a/contrib/libs/asmlib/cputype64.asm
+++ /dev/null
@@ -1,127 +0,0 @@
-%include "defs.asm"
-
-;*************************  cputype64.asm  **********************************
-; Author:           Agner Fog
-; Date created:     2011-07-09
-; Last modified:    2011-07-09
-; Source URL:       www.agner.org/optimize
-; Project:          asmlib.zip
-; Language:         assembly, NASM/YASM syntax, 64 bit
-;
-; C++ prototype:
-; extern "C" void CpuType(int * vendor, int * family, int * model);
-;
-; Description:
-; This function finds the vendor, family and model number of the CPU
-; and returns the values through the pointers. If a pointer is zero
-; then the value is not returned.
-;
-; Vendor: 
-; 0 = unknown
-; 1 = Intel
-; 2 = AMD
-; 3 = VIA/Centaur
-; 4 = Cyrix
-; 5 = NexGen
-;
-; Family: This is the sum of the family and extended family fields of the cpuid
-; Model:  This is the model + (extended model << 8)
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-;
-; C++ prototype:
-; extern "C" void CpuType(int * vendor, int * family, int * model);
-
-global CpuType: function
-
-
-SECTION .text
-
-CpuType:
-        push    rbx
-%ifdef  UNIX
-        mov     r8, rdx
-%endif
-%ifdef  WINDOWS        
-        push    rsi
-        push    rdi
-        mov     rdi, rcx
-        mov     rsi, rdx
-%endif
-        
-; parameters
-; vendor  rdi
-; family  rsi
-; model   r8
-
-        xor     r9d,  r9d              ; vendor
-        xor     r10d, r10d             ; family
-        xor     r11d, r11d             ; model
-
-        xor     eax, eax
-        cpuid                          ; get vendor
-        ; ecx = last  4 characters of vendor string
-        ; ebx = first 4 characters of vendor string
-        cmp     ecx, 'ntel'            ; 'GenuineIntel'
-        je      C110
-        cmp     ecx, 'cAMD'            ; 'AuthenticAMD'
-        je      C120
-        cmp     ebx, 'Cent'            ; 'CentaurHauls'
-        je      C130
-        cmp     ebx, 'VIA '            ; 'VIA VIA VIA '
-        je      C130
-        cmp     ebx, 'Cyri'            ; 'CyrixInstead'
-        je      C140
-        cmp     ebx, 'NexG'            ; 'NexGenDriven'
-        je      C150
-        jmp     C200                   ; other
-C110:   or      r9d, 1
-        jmp     C200
-C120:   or      r9d, 2
-        jmp     C200
-C130:   or      r9d, 3
-        jmp     C200
-C140:   or      r9d, 4
-        jmp     C200
-C150:   or      r9d, 5
-        ;jmp     C200
-C200:   
-
-        ; Get family and model
-        mov     eax, 1
-        cpuid                          
-        mov     ebx, eax
-        mov     r10d, eax
-        shr     ebx, 8
-        and     ebx, 0FH               ; Family
-        shr     r10d, 20
-        and     r10d, 0FFH             ; Extended family
-        add     r10d, ebx              ; Family + extended family
-        
-        mov     r11d, eax
-        shr     r11d, 4
-        and     r11d, 0FH              ; Model
-        shr     eax, 12
-        and     eax, 0F0H              ; Extended model
-        or      r11d, eax              ; extended model | Model
-        
-C300:   ; return r9d = vendor, r10d = family, r11d = model
-        test    rdi, rdi
-        jz      C310
-        mov     [rdi], r9d
-C310:   test    rsi, rsi
-        jz      C320
-        mov     [rsi], r10d
-C320:   test    r8, r8
-        jz      C330
-        mov     [r8], r11d
-C330:   xor     eax, eax
-        ; return
-%ifdef  WINDOWS 
-        pop     rdi
-        pop     rsi
-%endif
-        pop     rbx
-        ret
-;CpuType ENDP
diff --git a/contrib/libs/asmlib/debugbreak64.asm b/contrib/libs/asmlib/debugbreak64.asm
deleted file mode 100644
index ed2971cd24..0000000000
--- a/contrib/libs/asmlib/debugbreak64.asm
+++ /dev/null
@@ -1,33 +0,0 @@
-%include "defs.asm"
-
-;*************************  debugbreak64.asm  **********************************
-; Author:           Agner Fog
-; Date created:     2011-07-09
-; Last modified:    2011-07-09
-; Source URL:       www.agner.org/optimize
-; Project:          asmlib.zip
-; Language:         assembly, NASM/YASM syntax, 32 bit
-;
-; C++ prototype:
-; extern "C" void A_DebugBreak(void);
-;
-; Description:
-; Makes a debug breakpoint. Works only when running under a debugger
-;
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-;
-; C++ prototype:
-; extern "C" void A_DebugBreak(void);
-
-global A_DebugBreak: function
-
-
-SECTION .text
-
-A_DebugBreak:
-        int3
-        nop
-        ret
-;A_DebugBreak ENDP
diff --git a/contrib/libs/asmlib/defs.asm b/contrib/libs/asmlib/defs.asm
deleted file mode 100644
index db313e6cf1..0000000000
--- a/contrib/libs/asmlib/defs.asm
+++ /dev/null
@@ -1,22 +0,0 @@
-%ifdef UNIX
-    %ifdef DARWIN
-        %define EXP(x) _ %+ x
-    %else
-        %define EXP(x) x
-    %endif
-%else
-    %define EXP(x) _ %+ x
-    %define WINDOWS
-%endif
-
-%define ALLOW_OVERRIDE 1
-
-%ifdef WINDOWS
-    %define WEAK_SYM(x) global x
-%else
-    %ifdef DARWIN
-        %define WEAK_SYM(x) global x
-    %else
-        %define WEAK_SYM(x) weak x
-    %endif
-%endif
diff --git a/contrib/libs/asmlib/dispatchpatch64.asm b/contrib/libs/asmlib/dispatchpatch64.asm
deleted file mode 100644
index 205fac543d..0000000000
--- a/contrib/libs/asmlib/dispatchpatch64.asm
+++ /dev/null
@@ -1,303 +0,0 @@
-%include "defs.asm"
-
-;***********************  dispatchpatch64.asm  ********************************
-; Author:           Agner Fog
-; Date created:     2007-07-20
-; Last modified:    2013-08-21
-; Source URL:       www.agner.org/optimize
-; Project:          asmlib.zip
-; Language:         assembly, NASM/YASM syntax, 64 bit
-;
-; C++ prototype:
-; extern "C" int  __intel_cpu_indicator = 0;
-; extern "C" void __intel_cpu_indicator_init()
-;
-; Description:
-; Example of how to replace Intel CPU dispatcher in order to improve 
-; compatibility of Intel function libraries with non-Intel processors.
-; Only works with static link libraries (*.lib, *.a), not dynamic libraries
-; (*.dll, *.so). Linking in this as an object file will override the functions
-; with the same name in the library.; 
-; 
-; Copyright (c) 2007-2013 GNU LGPL License v. 3.0 www.gnu.org/licenses/lgpl.html
-;******************************************************************************
-
-; extern InstructionSet: function
-%include "instrset64.asm"              ; include code for InstructionSet function
-
-; InstructionSet function return value:
-;  4 or above = SSE2 supported
-;  5 or above = SSE3 supported
-;  6 or above = Supplementary SSE3
-;  8 or above = SSE4.1 supported
-;  9 or above = POPCNT supported
-; 10 or above = SSE4.2 supported
-; 11 or above = AVX supported by processor and operating system
-; 12 or above = PCLMUL and AES supported
-; 13 or above = AVX2 supported
-; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
-; 15 or above = HLE + RTM supported
-
-
-global __intel_cpu_indicator
-global __intel_cpu_indicator_init
-
-
-SECTION .data
-intel_cpu_indicator@:                  ; local name
-__intel_cpu_indicator: dd 0
-
-; table of indicator values
-itable  DD      1                      ; 0: generic version, 80386 instruction set
-        DD      8, 8                   ; 1,   2: MMX
-        DD      0x80                   ; 3:      SSE
-        DD      0x200                  ; 4:      SSE2
-        DD      0x800                  ; 5:      SSE3
-        DD      0x1000,  0x1000        ; 6,   7: SSSE3
-        DD      0x2000,  0x2000        ; 8,   9: SSE4.1
-        DD      0x8000,  0x8000        ; 10, 11: SSE4.2 and popcnt
-        DD      0x20000, 0x20000       ; 12, 13: AVX, pclmul, aes
-        DD      0x400000               ; 14:     AVX2, F16C, BMI1, BMI2, LZCNT, FMA3
-        DD      0x800000               ; 15:     HLE, RTM
-itablelen equ ($ - itable) / 4         ; length of table
-
-SECTION .text
-
-__intel_cpu_indicator_init:
-        push    rax                    ; registers must be pushed
-        push    rcx
-        push    rdx
-        push    r8
-        push    r9
-        push    r10
-        push    r11
-        push    rsi
-        push    rdi
-        call    InstructionSet
-        cmp     eax, itablelen
-        jb      L100
-        mov     eax, itablelen - 1     ; limit to table length
-L100:   lea     rdx, [rel itable]
-        mov     eax, [rdx + 4*rax]
-        mov     [rel intel_cpu_indicator@], eax             ; store in __intel_cpu_indicator
-        pop     rdi
-        pop     rsi
-        pop     r11
-        pop     r10
-        pop     r9
-        pop     r8
-        pop     rdx
-        pop     rcx
-        pop     rax
-        ret
-
-;__intel_cpu_indicator_init ENDP
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;     Dispatcher for Math Kernel Library (MKL),
-;     version 10.2 and higher
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-WEAK_SYM(mkl_serv_cpu_detect)
-
-SECTION .data
-; table of indicator values
-; Note: the table is different in 32 bit and 64 bit mode
-
-mkltab  DD      0, 0, 0, 0             ; 0-3: generic version, 80386 instruction set
-        DD      0                      ; 4:      SSE2
-        DD      1                      ; 5:      SSE3
-        DD      2, 2, 2, 2             ; 6-9:    SSSE3
-        DD      3                      ; 10:     SSE4.2
-        DD      4, 4, 4                ; 11-13:  AVX
-        DD      5                      ; 14:     AVX2, FMA3, BMI1, BMI2, LZCNT, PCLMUL
-mkltablen equ ($ - mkltab) / 4         ; length of table
-
-SECTION .text
-
-mkl_serv_cpu_detect:
-        push    rcx                    ; Perhaps not needed
-        push    rdx
-        push    r8
-        push    r9
-%ifdef WINDOWS
-        push    rsi
-        push    rdi
-%endif
-        call    InstructionSet
-        cmp     eax, mkltablen
-        jb      M100
-        mov     eax, mkltablen - 1     ; limit to table length
-M100:   
-        lea     rdx, [rel mkltab]
-        mov     eax, [rdx + 4*rax]
-%ifdef WINDOWS
-        pop     rdi
-        pop     rsi
-%endif
-        pop     r9
-        pop     r8
-        pop     rdx
-        pop     rcx
-        ret
-; end mkl_serv_cpu_detect        
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;     Dispatcher for Vector Math Library (VML)
-;     version 10.0 and higher
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-WEAK_SYM(mkl_vml_serv_cpu_detect)
-
-SECTION .data
-; table of indicator values
-; Note: the table is different in 32 bit and 64 bit mode
-
-vmltab  DD      0, 0, 0, 0             ; 0-3: generic version, 80386 instruction set
-        DD      1, 1                   ; 4-5:    SSE2
-        DD      2, 2                   ; 6-7:    SSSE3
-        DD      3, 3                   ; 8-9:    SSE4.1
-        DD      4                      ; 10:     SSE4.2
-        DD      5, 5, 5                ; 11:     AVX
-;       DD      6  ??        
-vmltablen equ ($ - vmltab) / 4         ; length of table
-
-SECTION .text
-
-mkl_vml_serv_cpu_detect:
-        push    rcx                    ; Perhaps not needed
-        push    rdx
-        push    r8
-        push    r9
-%ifdef WINDOWS
-        push    rsi
-        push    rdi
-%endif
-        call    InstructionSet
-        cmp     eax, vmltablen
-        jb      V100
-        mov     eax, vmltablen - 1     ; limit to table length
-V100:   
-        lea     rdx, [rel vmltab]
-        mov     eax, [rdx + 4*rax]
-%ifdef WINDOWS
-        pop     rdi
-        pop     rsi
-%endif
-        pop     r9
-        pop     r8
-        pop     rdx
-        pop     rcx
-        ret
-; end mkl_vml_serv_cpu_detect        
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;     Dispatcher for __intel_cpu_feature_indicator 
-;     version 13 and higher
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-global __intel_cpu_features_init
-global __intel_cpu_feature_indicator
-global __intel_cpu_fms_indicator
-global __intel_cpu_features_init_x
-global __intel_cpu_feature_indicator_x
-global __intel_cpu_fms_indicator_x
-
-SECTION .data
-; table of indicator values
-
-intel_cpu_feature_indicator@:
-__intel_cpu_feature_indicator:
-__intel_cpu_feature_indicator_x  DD 0, 0
-intel_cpu_fms_indicator@:
-__intel_cpu_fms_indicator:
-__intel_cpu_fms_indicator_x:     DD 0, 0
-
-
-feattab DD  1                ; 0 default
-        DD  0BH              ; 1 MMX
-        DD  0FH              ; 2 conditional move and FCOMI supported
-        DD  3FH              ; 3 SSE
-        DD  7FH              ; 4 SSE2
-        DD  0FFH             ; 5 SSE3
-        DD  1FFH, 1FFH       ; 6 Supplementary SSE3
-        DD  3FFH             ; 8 SSE4.1
-        DD  0BFFH            ; 9 POPCNT 
-        DD  0FFFH            ; 10 SSE4.2 
-        DD  10FFFH           ; 11 AVX 
-        DD  16FFFH           ; 12 PCLMUL and AES 
-        DD  816FFFH          ; 13 AVX2 
-        DD  9DEFFFH          ; 14 FMA3, F16C, BMI1, BMI2, LZCNT
-        DD  0FDEFFFH         ; 15 HLE, RTM 
-
-feattablen equ ($ - feattab) / 4  ; length of table
-
-SECTION .text
-
-__intel_cpu_features_init:
-__intel_cpu_features_init_x:
-        push    rbx
-        push    rcx                    ; Perhaps not needed
-        push    rdx
-        push    r8
-        push    r9
-%ifdef WINDOWS
-        push    rsi
-        push    rdi
-%endif
-        call    InstructionSet
-        cmp     eax, feattablen
-        jb      F100
-        mov     eax, vmltablen - 1     ; limit to table length
-F100:   
-        lea     rdx, [rel feattab]
-        mov     ebx, [rdx + 4*rax]     ; look up in table        
-        push    rbx
-        mov     eax, 1
-        cpuid
-        pop     rbx
-        bt      ecx, 22                ; MOVBE
-        jnc     F200
-        or      ebx, 1000H
-F200:   mov     [intel_cpu_feature_indicator@], rbx
-
-        ; get family and model
-        mov     edx, eax
-        and     eax, 0FH               ; stepping bit 0-3
-        mov     ecx, edx
-        shr     ecx, 4
-        and     ecx, 0FH               ; model
-        mov     ebx, edx
-        shr     ebx, 12
-        and     ebx, 0F0H              ; x model
-        or      ecx, ebx               ; full model
-        mov     ah,  cl                ; model bit 8 - 15
-        mov     ecx, edx
-        shr     ecx, 8
-        and     ecx, 0FH               ; family
-        mov     ebx, edx
-        shr     ebx, 20
-        and     ebx, 0FFH              ; x family
-        add     ecx, ebx               ; full family
-        shl     ecx, 16
-        or      eax, ecx               ; full family bit 16 - 23
-        mov     [intel_cpu_fms_indicator@], eax
-        
-%ifdef WINDOWS
-        pop     rdi
-        pop     rsi
-%endif
-        pop     r9
-        pop     r8
-        pop     rdx
-        pop     rcx
-        pop     rbx
-        ret
-; end __intel_cpu_features_init        
-
-
-
-
diff --git a/contrib/libs/asmlib/divfixedi64.asm b/contrib/libs/asmlib/divfixedi64.asm
deleted file mode 100644
index bf8ab137a9..0000000000
--- a/contrib/libs/asmlib/divfixedi64.asm
+++ /dev/null
@@ -1,173 +0,0 @@
-%include "defs.asm"
-
-;*************************  divfixedi64.asm  *********************************
-; Author:           Agner Fog
-; Date created:     2011-07-22
-; Last modified:    2011-07-22
-;
-; Function prototypes:
-; void setdivisori32(int buffer[2], int d);
-; int dividefixedi32(const int buffer[2], int x);
-; void setdivisoru32(uint32_t buffer[2], uint32_t d);
-; uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
-;
-; Description:
-; Functions for fast repeated integer division by the same divisor, signed 
-; and unsigned 32-bit integer versions. The divisor must be positive.
-;
-; The setdivisor functions calculate the reciprocal divisor and shift counts,
-; the dividefixed functions do the division by multiplication and shift.
-;
-; The methods used are described by:
-; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
-; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
-; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
-;
-; Mathematical formula, unsigned division:
-; x = dividend
-; d = divisor
-; n = integer size, bits
-; L = ceil(log2(d))
-; m = 1 + 2^n * (2^L-d) / d       [2^L should overflow to 0 if L = n]
-; sh1 = min(L,1)
-; sh2 = max(L-1,0)
-; t = m*x >> n                    [high part of unsigned multiplication]
-; x/d = (((x-t) >> sh1) + t) >> sh2
-;
-; Mathematical formula, signed division:
-; x = dividend
-; d = abs(divisor)
-; n = integer size, bits
-; L = ceil(log2(d))
-; L = max(L,1)
-; m = 1 + 2^(n+L-1)/d - 2^n       [division should overflow to 0 if d = 1]
-; sh1 = L-1
-; q = x + (m*x >> n)              [high part of signed multiplication]
-; q = (q >> sh1) - (x<0 ? -1 : 0)
-; if (divisor < 0) q = -q         [negative divisor not supported in present implementation]
-; x/d = q
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-; Register aliases for the two calling conventions.
-; Under WINDOWS the arguments arrive in rcx/edx; the functions below use ecx and edx as
-; scratch registers, so they first copy the arguments to r9 (buf) and r8d (rxd).
-; Otherwise the arguments are used in place in rdi/esi.
-; par1 and par2 are defined here but not referenced by the functions in this file.
-%IFDEF  WINDOWS
-%define par1   rcx                     ; function parameter 1
-%define par2   edx                     ; function parameter 2
-%define buf    r9                      ; copy of function parameter 1: buffer
-%define rx     r8
-%define rxd    r8d                     ; d or x
-%ELSE   ; UNIX
-%define par1   rdi                     ; function parameter 1
-%define par2   esi                     ; function parameter 2
-%define buf    rdi                     ; function parameter 1: buffer
-%define rx     rsi
-%define rxd    esi                     ; d or x
-%ENDIF
-
-
-section .text
-
-; extern "C" void setdivisori32(int buffer[2], int d);
-; 32 bit signed 
-
-global setdivisori32: function
-; Precomputes the constants for repeated signed 32-bit division by d.
-; In:  buffer pointer and d (rcx/edx under WINDOWS, otherwise rdi/esi)
-; Out: [buf]   = multiplier
-;      [buf+4] = shift count L-1, never negative
-; d < 0 jumps to H120, which deliberately raises a divide error.
-; d = 0 is not caught by the sign test and faults in the div instruction instead.
-setdivisori32:
-%IFDEF  WINDOWS
-        mov     rxd, edx               ; d (edx is used as scratch below)
-        mov     buf, rcx               ; buffer (ecx is used as scratch below)
-%ENDIF        
-        dec     rxd                    ; rxd = r8d or esi
-        mov     ecx, -1                ; value for bsr if rxd = 0 (assuming bsr leaves dest unchanged if src = 0, this works on both Intel, AMD and VIA processors)
-        bsr     ecx, rxd               ; floor(log2(d-1))
-        inc     rxd                    ; restore d; the sign flag now reflects d
-        js      H120                   ; d < 0. Generate error
-        inc     ecx                    ; L = ceil(log2(d))        
-        sub     ecx, 1                 ; shift count = L - 1
-        adc     ecx, 0                 ; avoid negative shift count
-        xor     eax, eax
-        mov     edx, 1
-        cmp     rxd, edx
-        je      H110                   ; avoid overflow when d = 1
-        shl     edx, cl                ; edx:eax = 2^(32+L-1)
-        div     rxd                    ; eax = 2^(32+L-1) / d
-H110:   inc     eax
-        mov     [buf], eax             ; multiplier
-        mov     [buf+4], ecx           ; shift count
-        ret
-        
-H120:   ; d < 0 not supported. Generate error
-        mov     edx, 1
-        div     edx                    ; will overflow
-        ud2
-
-        
-; extern "C" int dividefixedi32(int buffer[2], int x);
-global dividefixedi32: function
-; Signed 32-bit division of x by the divisor stored in the buffer.
-; Reads the multiplier from [buf] and the shift count from [buf+4], computes
-;   q = ((x + high32(m*x)) >> shift) - (x < 0 ? -1 : 0)
-; and returns q in eax.
-dividefixedi32:
-%IFDEF  WINDOWS
-        mov     eax, edx
-        mov     rxd, edx               ; x
-        mov     buf, rcx               ; buffer
-%ELSE
-        mov     eax, esi
-%ENDIF        
-        imul    dword [buf]            ; edx:eax = m * x (signed); high half ends up in edx
-        lea     eax, [rdx+rx]          ; x + high part. rx = r8 or rsi
-        mov     ecx, [buf+4]           ; shift count
-        sar     eax, cl
-        sar     rxd, 31                ; sign(x): -1 if x < 0, otherwise 0
-        sub     eax, rxd
-        ret
-
-
-;extern "C" void setdivisoru32(int buffer[2], int d);
-; 32 bit unsigned 
-
-global setdivisoru32: function
-; Precomputes the constants for repeated unsigned 32-bit division by d.
-; In:  buffer pointer and d (rcx/edx under WINDOWS, otherwise rdi/esi)
-; Out: [buf]   = multiplier
-;      [buf+4] = shift1 in bits 0-7 and shift2 in bits 8-15,
-;                where shift1 = min(L,1) and shift2 = max(L-1,0)
-setdivisoru32:
-%IFDEF  WINDOWS
-        mov     rxd, edx               ; d (edx is used as scratch below)
-        mov     buf, rcx               ; buffer (ecx is used as scratch below)
-%ENDIF        
-        dec     rxd                    ; rxd = r8d or esi
-        mov     ecx, -1                ; value for bsr if rxd = 0
-        bsr     ecx, rxd               ; floor(log2(d-1))
-        inc     rxd                    ; restore d
-        inc     ecx                    ; L = ceil(log2(d))
-        mov     edx, 1
-        shl     rdx, cl                ; 2^L (64 bit shift because cl may be 32)
-        sub     edx, rxd               ; 2^L - d, truncated to 32 bits
-        xor     eax, eax
-        div     rxd                    ; eax = 2^32 * (2^L - d) / d
-        inc     eax
-        mov     [buf], eax             ; multiplier
-        sub     ecx, 1                 ; L - 1; the flags set here drive setae and seta below
-        setae   dl                     ; 1 unless the subtraction borrowed (L = 0)
-        movzx   edx, dl                ; shift1
-        seta    al                     ; 1 only if L - 1 > 0
-        neg     al                     ; 0FFH or 0: mask for the next instruction
-        and     al,cl                  ; L - 1 if positive, otherwise 0
-        movzx   eax, al                ; shift 2
-        shl     eax, 8
-        or      eax, edx
-        mov     [buf+4], eax           ; shift 1 and shift 2
-        ret
-        
-;extern "C" int dividefixedu32(int buffer[2], int x);
-global dividefixedu32: function       ; unsigned
-; Unsigned 32-bit division of x by the divisor stored in the buffer.
-; With t = high32(m*x), returns (((x - t) >> shift1) + t) >> shift2 in eax.
-; shift1 is the low byte of [buf+4] and shift2 the next byte.
-dividefixedu32:
-%IFDEF  WINDOWS
-        mov     eax, edx
-        mov     rxd, edx               ; x
-        mov     buf, rcx               ; buffer
-%ELSE
-        mov     eax, esi
-%ENDIF        
-        mul     dword [buf]            ; edx:eax = m * x (unsigned); t = edx
-        sub     rxd, edx               ; x-t
-        mov     ecx, [buf+4]           ; shift 1 and shift 2
-        shr     rxd, cl                ; cl = shift1
-        lea     eax, [rx+rdx]          ; ((x-t) >> shift1) + t
-        shr     ecx, 8                 ; move shift2 into cl
-        shr     eax, cl
-        ret
diff --git a/contrib/libs/asmlib/divfixedv64.asm b/contrib/libs/asmlib/divfixedv64.asm
deleted file mode 100644
index a4f0e177ec..0000000000
--- a/contrib/libs/asmlib/divfixedv64.asm
+++ /dev/null
@@ -1,498 +0,0 @@
-%include "defs.asm"
-
-;*************************  divfixedv64.asm  *********************************
-; Author:           Agner Fog
-; Date created:     2011-07-25
-; Last modified:    2012-03-10
-;
-; Function prototypes:
-; void setdivisorV8i16(__m128i buf[2], int16_t d);
-; void setdivisorV8u16(__m128i buf[2], uint16_t d);
-; void setdivisorV4i32(__m128i buf[2], int32_t d);
-; void setdivisorV4u32(__m128i buf[2], uint32_t d);
-;
-; __m128i dividefixedV8i16(const __m128i buf[2], __m128i x);
-; __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
-; __m128i dividefixedV4i32(const __m128i buf[2], __m128i x);
-; __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
-;
-; Alternative versions for VectorClass.h:
-; (These versions pack all parameters into a single register)
-; __m128i setdivisor8s(int16_t d);
-; __m128i setdivisor8us(uint16_t d);
-; __m128i setdivisor4i(int32_t d);
-; __m128i setdivisor4ui(uint32_t d);
-;
-; Description:
-; Functions for integer vector division by the same divisor, signed 
-; and unsigned 16-bit and 32-bit integer versions.
-;
-; The setdivisor functions calculate the reciprocal divisor and shift counts,
-; the dividefixed functions do the division by multiplication and shift of the 
-; vector elements of packed 16-bit or 32-bit signed or unsigned integers. 
-;
-; The divisor must be positive. A zero divisor generates a divide by zero error.
-; A negative divisor generates a division overflow error. To divide by a negative
-; divisor, change the sign of the divisor and the result.
-;
-; The methods used are described in this article:
-; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
-; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
-; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
-;
-; Mathematical formula, unsigned division:
-; x = dividend
-; d = divisor
-; n = integer size, bits
-; L = ceil(log2(d))
-; m = 1 + 2^n * (2^L-d) / d       [2^L should overflow to 0 if L = n]
-; sh1 = min(L,1)
-; sh2 = max(L-1,0)
-; t = m*x >> n                    [high part of unsigned multiplication]
-; x/d = (((x-t) >> sh1) + t) >> sh2
-;
-; Mathematical formula, signed division:
-; x = dividend
-; d = abs(divisor)
-; n = integer size, bits
-; L = ceil(log2(d))
-; L = max(L,1)
-; m = 1 + 2^(n+L-1)/d - 2^n       [division should overflow to 0 if d = 1]
-; sh1 = L-1
-; q = x + (m*x >> n)              [high part of signed multiplication]
-; q = (q >> sh1) - (x<0 ? -1 : 0)
-; if (divisor < 0) q = -q         [negative divisor not supported in present implementation]
-; x/d = q
-;
-; Copyright (c) 2011 - 2012 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-default rel
-
-; Register aliases for the two calling conventions.
-; The aliases are only defined when WINDOWS or UNIX is defined; there is no fallback branch.
-; Under WINDOWS buf is a separate register (r8) that the code copies par1 into when needed;
-; under UNIX buf is the same register as par1 (rdi).
-%IFDEF  WINDOWS
-%define par1   rcx                     ; function parameter 1
-%define par1d  ecx
-%define par1w   cx
-%define par2   rdx                     ; function parameter 2
-%define par2d  edx
-%define par2w  dx 
-%define buf    r8                      ; pointer to buffer
-%ENDIF
-%IFDEF  UNIX
-%define par1   rdi                     ; function parameter 1
-%define par1d  edi
-%define par1w  di 
-%define par2   rsi                     ; function parameter 2
-%define par2d  esi
-%define par2w  si
-%define buf    rdi                     ; pointer to buffer
-%ENDIF
-
-
-; Imported from instrset64.asm:
-extern InstructionSet                  ; Instruction set for CPU dispatcher
-
-section .text  align = 16
-
-;******************************************************************************
-;                    16 bit signed integers
-;******************************************************************************
-
-; extern "C" __m128i setdivisor8s(int16_t d);
-; vector of 8 x 16 bit signed integers
-
-global setdivisor8s: function
-; Precomputes the constants for signed division of 16-bit vector elements by d.
-; In:  d in the low 16 bits of parameter register 1 (sign-extended here)
-; Out: xmm0 = multiplier in the four low words, shift count in the upper 64 bits
-;      xmm1 = shift count (setdivisorV8i16 relies on this)
-; rbx is saved and restored. d < 0 jumps to H120, which deliberately raises a divide error.
-setdivisor8s:
-        push    rbx
-        movsx   ebx, par1w             ; d
-        dec     ebx
-        mov     ecx, -1                ; value for bsr if ebx = 0
-        bsr     ecx, ebx               ; floor(log2(d-1))
-        inc     ebx                    ; restore d; the sign flag now reflects d
-        js      H120                   ; Generate error if d < 0. (error for d=0 will come in the div instruction)
-        inc     ecx                    ; L = ceil(log2(d))        
-        sub     ecx, 1                 ; shift count = L - 1
-        adc     ecx, 0                 ; avoid negative shift count
-        xor     eax, eax
-        mov     edx, 1
-        cmp     ebx, edx
-        je      H110                   ; avoid division overflow when d = 1
-        shl     edx, cl                ; dx:ax = 2^(16+L-1)
-        div     bx                     ; 2^(16+L-1)/d
-H110:   inc     eax
-        movd    xmm0, eax              ; multiplier
-        pshuflw xmm0, xmm0, 0          ; broadcast into lower 4 words
-        movd    xmm1, ecx              ; shift count
-        punpcklqdq xmm0, xmm1          ; insert shift count into upper half
-        pop     rbx
-        ret
-H120:   ; d < 0 not supported. Generate error
-        mov     edx, 1
-        div     edx                    ; quotient does not fit in eax: raises a divide error
-        ud2
-; setdivisor8s end        
-
-        
-; extern "C" void setdivisorV8i16(__m128i buf[2], int16_t d);
-; vector of 8 x 16 bit signed integers
-
-global setdivisorV8i16: function
-; Buffer-based wrapper around setdivisor8s.
-; In:  par1 = buf (written with movdqa, so it must be 16-byte aligned), par2 = d
-; Out: [buf]    = multiplier replicated into all 8 words
-;      [buf+16] = shift count
-setdivisorV8i16:
-        push    par1                   ; buf
-        mov     par1d, par2d           ; d becomes the first argument of setdivisor8s
-        call    setdivisor8s
-        pop     rax                    ; buf
-        punpcklqdq xmm0, xmm0          ; copy multiplier into upper 4 words        
-        movdqa  [rax], xmm0            ; multiplier
-        movdqa  [rax+16], xmm1         ; shift count is still in xmm1
-        ret
-; setdivisorV8i16 end
-
-
-; extern "C" int dividefixedV8i16(const __m128i buf[2], __m128i x);
-global dividefixedV8i16: function
-
-; Signed division of 8 x 16-bit elements by the divisor stored in buf.
-; Per element: q = ((x + high16(m*x)) >> shift) - (x < 0 ? -1 : 0). Result in xmm0.
-dividefixedV8i16:
-; buf = par1
-; x = xmm0 (UNIX) or [par2] (Windows)
-%IFDEF  WINDOWS
-        movdqa  xmm0, [par2]           ; x
-%ENDIF
-        movdqa  xmm1, xmm0             ; x
-        pmulhw  xmm0, [par1]           ; multiply high signed words
-        paddw   xmm0, xmm1             ; x + high part
-        movd    xmm2, [par1+16]        ; shift count
-        psraw   xmm0, xmm2             ; shift right arithmetic
-        psraw   xmm1, 15               ; sign of x
-        psubw   xmm0, xmm1
-        ret
-;dividefixedV8i16 end
-
-
-
-;******************************************************************************
-;                    16 bit unsigned integers
-;******************************************************************************
-
-; extern "C" __m128i setdivisor8us(uint16_t d);
-; vector of 8 x 16 bit unsigned integers
-
-align 16
-global setdivisor8us: function
-; Precomputes the constants for unsigned division of 16-bit vector elements by d.
-; In:  d in the low 16 bits of parameter register 1 (zero-extended here)
-; Out: xmm0 = multiplier in the four low words, shift1 in dword 2, shift2 in dword 3
-;      xmm1 = shift1 in dword 0, shift2 in dword 1 (setdivisorV8u16 relies on this)
-; rbx is saved and restored.
-setdivisor8us:
-        push    rbx
-        movzx   ebx, par1w             ; d
-        dec     ebx
-        mov     ecx, -1                ; value for bsr if ebx = 0
-        bsr     ecx, ebx               ; floor(log2(d-1))
-        inc     ebx                    ; restore d
-        inc     ecx                    ; L = ceil(log2(d))
-        mov     edx, 1
-        shl     edx, cl                ; 2^L  [32-bit shift to allow overflow]
-        sub     edx, ebx               ; 2^L - d
-        xor     eax, eax
-        div     bx                     ; dx:ax / d, using only the low word of edx
-        inc     eax
-        movd    xmm0, eax
-        pshuflw xmm0, xmm0, 0          ; broadcast into lower 4 words
-        sub     ecx, 1                 ; L - 1; the flags set here drive setae and seta below
-        setae   dl                     ; 1 unless the subtraction borrowed (L = 0)
-        movzx   edx, dl                ; shift 1
-        seta    al                     ; 1 only if L - 1 > 0
-        neg     al                     ; 0FFH or 0: mask for the next instruction
-        and     al,cl                  ; L - 1 if positive, otherwise 0
-        movzx   eax, al                ; shift 2
-        movd    xmm1, edx              ; shift 1
-        movd    xmm2, eax              ; shift 2
-        punpckldq  xmm1, xmm2          ; combine into two dwords
-        punpcklqdq xmm0, xmm1          ; multipliers, shift1, shift2
-        pop     rbx
-        ret
-; setdivisor8us end
-
-
-;extern "C" void setdivisorV8u16(__m128i buf[2], uint16_t d);
-; 8 x 16 bit unsigned 
-
-global setdivisorV8u16: function
-; Buffer-based wrapper around setdivisor8us.
-; In:  par1 = buf (written with movdqa, so it must be 16-byte aligned), par2 = d
-; Out: [buf]    = multiplier replicated into all 8 words
-;      [buf+16] = shift1, [buf+20] = shift2
-setdivisorV8u16:
-        push    par1                   ; buf
-        mov     par1d, par2d           ; d becomes the first argument of setdivisor8us
-        call    setdivisor8us
-        pop     rax                    ; buf
-        punpcklqdq xmm0, xmm0          ; copy multiplier into upper 4 words        
-        movdqa  [rax], xmm0            ; multiplier
-        movdqa  [rax+16], xmm1         ; shift counts are still in xmm1
-        ret
-; setdivisorV8u16 end
-
-        
-;extern "C" __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
-global dividefixedV8u16: function
-
-; Unsigned division of 8 x 16-bit elements by the divisor stored in buf.
-; Per element, with t = high16(m*x): result = (((x - t) >> shift1) + t) >> shift2. Result in xmm0.
-align 16
-dividefixedV8u16:
-; buf = par1
-; x = xmm0 (UNIX) or [par2] (Windows)
-%IFDEF  WINDOWS
-        movdqa  xmm0, [par2]           ; x
-%ENDIF
-        movdqa  xmm1, xmm0             ; x
-        pmulhuw xmm0, [par1]           ; multiply high unsigned words
-        psubw   xmm1, xmm0             ; x - t
-        movd    xmm2, [par1+16]        ; shift1
-        psrlw   xmm1, xmm2
-        paddw   xmm0, xmm1             ; ((x - t) >> shift1) + t
-        movd    xmm2, [par1+20]        ; shift2
-        psrlw   xmm0, xmm2
-        ret
-;dividefixedV8u16 end
-
-
-
-;******************************************************************************
-;                    32 bit signed integers
-;******************************************************************************
-
-; extern "C" __m128i setdivisor4i(int32_t d);
-; vector of 4 x 32 bit signed integers
-
-align 16
-global setdivisor4i: function
-; Precomputes the constants for signed division of 32-bit vector elements by d.
-; In:  d in the 32-bit parameter register 1
-; Out: xmm0 = multiplier in the two low dwords, shift count in the upper 64 bits
-;      xmm1 = shift count (setdivisorV4i32 relies on this)
-; rbx is saved and restored. d < 0 jumps to K120, which deliberately raises a divide error.
-setdivisor4i:
-        push    rbx
-        mov     ebx, par1d             ; d
-        dec     ebx
-        mov     ecx, -1                ; value for bsr if ebx = 0
-        bsr     ecx, ebx               ; floor(log2(d-1))
-        inc     ebx                    ; restore d; the sign flag now reflects d
-        js      K120                   ; Generate error if d < 0. (error for d=0 will come in the div instruction)
-        inc     ecx                    ; L = ceil(log2(d))        
-        sub     ecx, 1                 ; shift count = L - 1
-        adc     ecx, 0                 ; avoid negative shift count
-        xor     eax, eax
-        mov     edx, 1
-        cmp     ebx, edx
-        je      K110                   ; avoid division overflow when d = 1
-        shl     edx, cl                ; edx:eax = 2^(32+L-1)
-        div     ebx                    ; 2^(32+L-1)/d
-K110:   inc     eax
-        movd    xmm0, eax              ; multiplier
-        pshufd  xmm0, xmm0, 0          ; broadcast into 4 dwords
-        movd    xmm1, ecx              ; shift count
-        punpcklqdq xmm0, xmm1          ; insert shift count into upper half
-        pop     rbx
-        ret
-        
-K120:   ; d < 0 not supported. Generate error
-        mov     edx, 1
-        div     edx                    ; quotient does not fit in eax: raises a divide error
-        ud2
-; setdivisor4i end
-
-
-; extern "C" void setdivisorV4i32(__m128i buf[2], int32_t d);
-; vector of 4 x 32 bit signed integers
-
-global setdivisorV4i32: function
-; Buffer-based wrapper around setdivisor4i.
-; In:  par1 = buf (written with movdqa, so it must be 16-byte aligned), par2 = d
-; Out: [buf]    = multiplier replicated into all 4 dwords
-;      [buf+16] = shift count
-setdivisorV4i32:
-        push    par1                   ; buf
-        mov     par1d, par2d           ; d becomes the first argument of setdivisor4i
-        call    setdivisor4i
-        pop     rax                    ; buf
-        punpcklqdq xmm0, xmm0          ; copy multiplier into upper 2 dwords
-        movdqa  [rax], xmm0            ; multiplier
-        movdqa  [rax+16], xmm1         ; shift count is still in xmm1
-        ret
-; setdivisorV4i32 end
-
-        
-; extern "C" int dividefixedV4i32(const __m128i buf[2], __m128i x);
-global dividefixedV4i32: function
-
-; Direct entries to CPU-specific versions
-global dividefixedV4i32SSE2:  function
-global dividefixedV4i32SSE41: function
-
-; Public entry point: an indirect jump through dividefixedV4i32Dispatch.
-; That pointer initially targets dividefixedV4i32CPUDispatch, which replaces it with the
-; address of the SSE2 or SSE4.1 version and then continues in that version.
-align 8
-dividefixedV4i32: ; function dispatching
-        jmp     near [dividefixedV4i32Dispatch] ; Go to appropriate version, depending on instruction set
-
-align 16
-; SSE4.1 version: signed division of 4 x 32-bit elements by the divisor stored in buf.
-; pmuldq multiplies only dwords 0 and 2, so the odd elements are shifted down and multiplied
-; in a second pass; the two sets of high dwords are then merged.
-; Per element: q = ((x + high32(m*x)) >> shift) - (x < 0 ? -1 : 0). Result in xmm0.
-dividefixedV4i32SSE41: 
-; buf = par1
-; x = xmm0 (UNIX) or [par2] (Windows)
-%IFDEF  WINDOWS
-        movdqa  xmm0,[par2]            ; x
-%ENDIF
-        movdqa  xmm1, xmm0             ; x
-        movdqa  xmm2, xmm0             ; x        
-        movdqa  xmm3, [par1]           ; multiplier
-        pmuldq  xmm0, xmm3             ; 32 x 32 -> 64 bit signed multiplication of x[0] and x[2]
-        psrlq   xmm0, 32               ; high dword of result 0 and 2
-        psrlq   xmm1, 32               ; get x[1] and x[3] into position for multiplication
-        pmuldq  xmm1, xmm3             ; 32 x 32 -> 64 bit signed multiplication of x[1] and x[3]
-        pcmpeqd xmm3, xmm3             ; all ones
-        psllq   xmm3, 32               ; generate mask of dword 1 and 3
-        pand    xmm1, xmm3             ; high dword of result 1 and 3
-        por     xmm0, xmm1             ; combine all four results into one vector
-        paddd   xmm0, xmm2             ; x + high part
-        movd    xmm3, [par1+16]        ; shift count
-        psrad   xmm0, xmm3             ; shift right arithmetic
-        psrad   xmm2, 31               ; sign of x
-        psubd   xmm0, xmm2
-        ret
-;dividefixedV4i32SSE41 end
-
-dividefixedV4i32SSE2:
-; SSE2 version: signed division of 4 x 32-bit elements by the divisor stored in buf.
-; x is spilled to an aligned stack slot, each element is multiplied with a scalar imul,
-; and the high halves are written back to the same slot and reloaded as a vector.
-; rbp is used as a frame pointer so that rsp can be restored after the alignment.
-; I have tried to change sign and use pmuludq, but get rounding error (gives 9/10 = 1).
-; This solution, with 4 separate multiplications, is probably faster anyway despite store forwarding stall
-        push    rbp
-        mov     rbp, rsp
-%IFDEF  WINDOWS
-        movdqa  xmm0,[par2]            ; x
-        mov     buf, par1              ; ecx is overwritten with the multiplier below
-%ENDIF
-        sub     rsp, 16                ; allocate stack space
-        and     rsp, -16               ; stack should be aligned already. align anyway to be safe
-        movdqa  [rsp], xmm0            ; store x
-        movdqa  xmm2, xmm0             ; x        
-        mov     ecx, [buf]             ; multiplier
-        ; do four signed high multiplications
-        mov     eax, [rsp]
-        imul    ecx                    ; edx = high32(m * x[0])
-        mov     [rsp], edx
-        mov     eax, [rsp+4]
-        imul    ecx
-        mov     [rsp+4], edx
-        mov     eax, [rsp+8]
-        imul    ecx
-        mov     [rsp+8], edx
-        mov     eax, [rsp+12]
-        imul    ecx
-        mov     [rsp+12], edx
-        movdqa  xmm0, [rsp]            ; x*m vector
-        paddd   xmm0, xmm2             ; x + high part
-        movd    xmm3, [buf+16]         ; shift count
-        psrad   xmm0, xmm3             ; shift right arithmetic
-        psrad   xmm2, 31               ; sign of x
-        psubd   xmm0, xmm2
-        mov     rsp, rbp
-        pop     rbp        
-        ret
-;dividefixedV4i32SSE2 end
-
-
-; ********************************************************************************
-; CPU dispatching for dividefixedV4i32. This is executed only once
-; ********************************************************************************
-
-dividefixedV4i32CPUDispatch:
-        ; Selects the implementation of dividefixedV4i32, stores its address in
-        ; dividefixedV4i32Dispatch and jumps to it. r8 is used as scratch.
-        ; get supported instruction set
-        push    par1                   ; keep the caller's arguments across the call
-        push    par2
-        call    InstructionSet
-        pop     par2
-        pop     par1
-        ; Point to generic version
-        lea     r8, [dividefixedV4i32SSE2]
-        cmp     eax, 8                ; check if PMULDQ supported
-        jb      Q100
-        ; SSE4.1 supported
-        ; Point to SSE4.1 version of dividefixedV4i32
-        lea     r8, [dividefixedV4i32SSE41]
-Q100:   mov     [dividefixedV4i32Dispatch], r8
-        ; Continue in appropriate version 
-        jmp     r8
-
-SECTION .data
-
-; Pointer to appropriate versions. Initially point to dispatcher
-dividefixedV4i32Dispatch Dq dividefixedV4i32CPUDispatch
-
-section .text
-
-
-;******************************************************************************
-;                    32 bit unsigned integers
-;******************************************************************************
-
-; extern "C" __m128i setdivisor4ui(uint32_t d);
-; vector of 4 x 32 bit unsigned integers
-
-align 16
-global setdivisor4ui: function
-; Precomputes the constants for unsigned division of 32-bit vector elements by d.
-; In:  d in the 32-bit parameter register 1
-; Out: xmm0 = multiplier in the two low dwords, shift1 in dword 2, shift2 in dword 3
-;      xmm1 = shift1 in dword 0, shift2 in dword 1 (setdivisorV4u32 relies on this)
-; rbx is saved and restored.
-setdivisor4ui:
-        push    rbx
-        mov     ebx, par1d             ; d
-        dec     ebx
-        mov     ecx, -1                ; value for bsr if ebx = 0
-        bsr     ecx, ebx               ; floor(log2(d-1))
-        inc     ebx                    ; restore d
-        inc     ecx                    ; L = ceil(log2(d))
-        mov     edx, 1
-        shl     rdx, cl                ; 2^L     [64 bit shift to allow overflow]
-        sub     edx, ebx               ; 2^L - d, truncated to 32 bits
-        xor     eax, eax
-        div     ebx                    ; eax = 2^32 * (2^L - d) / d
-        inc     eax
-        movd    xmm0, eax
-        pshufd  xmm0, xmm0, 0          ; broadcast into 4 dwords
-        sub     ecx, 1                 ; L - 1; the flags set here drive setae and seta below
-        setae   dl                     ; 1 unless the subtraction borrowed (L = 0)
-        movzx   edx, dl                ; shift1
-        seta    al                     ; 1 only if L - 1 > 0
-        neg     al                     ; 0FFH or 0: mask for the next instruction
-        and     al,cl                  ; L - 1 if positive, otherwise 0
-        movzx   eax, al                ; shift 2
-        movd    xmm1, edx              ; shift 1
-        movd    xmm2, eax              ; shift 2
-        punpckldq  xmm1, xmm2          ; combine into two dwords
-        punpcklqdq xmm0, xmm1          ; multipliers, shift1, shift2
-        pop     rbx
-        ret
-; setdivisor4ui end
-
-;extern "C" void setdivisorV4u32(__m128i buf[2], uint32_t d);
-; 4 x 32 bit unsigned 
-
-global setdivisorV4u32: function
-; Buffer-based wrapper around setdivisor4ui.
-; In:  par1 = buf (written with movdqa, so it must be 16-byte aligned), par2 = d
-; Out: [buf]    = multiplier replicated into all 4 dwords
-;      [buf+16] = shift1, [buf+20] = shift2
-setdivisorV4u32:
-        push    par1                   ; buf
-        mov     par1d, par2d           ; d becomes the first argument of setdivisor4ui
-        call    setdivisor4ui
-        pop     rax                    ; buf
-        punpcklqdq xmm0, xmm0          ; copy multiplier into upper 2 dwords
-        movdqa  [rax], xmm0            ; multiplier
-        movdqa  [rax+16], xmm1         ; shift counts are still in xmm1
-        ret
-; setdivisorV4u32 end
-        
-;extern "C" __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
-global dividefixedV4u32: function
-
-; Unsigned division of 4 x 32-bit elements by the divisor stored in buf.
-; pmuludq multiplies only dwords 0 and 2, so the odd elements are shifted down and multiplied
-; in a second pass; the two sets of high dwords are then merged into t.
-; Per element: result = (((x - t) >> shift1) + t) >> shift2. Result in xmm0.
-align 16
-dividefixedV4u32:
-; buf = par1
-; x = xmm0 (UNIX) or [par2] (Windows)
-%IFDEF  WINDOWS
-        movdqa  xmm0,[par2]            ; x
-%ENDIF
-        movdqa  xmm1, xmm0             ; x
-        movdqa  xmm2, xmm0             ; x
-        movdqa  xmm3, [par1]           ; multiplier
-        pmuludq xmm0, xmm3             ; 32 x 32 -> 64 bit unsigned multiplication of x[0] and x[2]
-        psrlq   xmm0, 32               ; high dword of result 0 and 2
-        psrlq   xmm1, 32               ; get x[1] and x[3] into position for multiplication
-        pmuludq xmm1, xmm3             ; 32 x 32 -> 64 bit unsigned multiplication of x[1] and x[3]
-        pcmpeqd xmm3, xmm3             ; all ones
-        psllq   xmm3, 32               ; generate mask of dword 1 and 3
-        pand    xmm1, xmm3             ; high dword of result 1 and 3
-        por     xmm0, xmm1             ; combine all four results into one vector
-        psubd   xmm2, xmm0             ; x - t
-        movd    xmm3, [par1+16]        ; shift1
-        psrld   xmm2, xmm3
-        paddd   xmm0, xmm2             ; ((x - t) >> shift1) + t
-        movd    xmm3, [par1+20]        ; shift2
-        psrld   xmm0, xmm3
-        ret
-;dividefixedV4u32 end
diff --git a/contrib/libs/asmlib/dummy.c b/contrib/libs/asmlib/dummy.c
deleted file mode 100644
index e69de29bb2..0000000000
--- a/contrib/libs/asmlib/dummy.c
+++ /dev/null
diff --git a/contrib/libs/asmlib/instrset64.asm b/contrib/libs/asmlib/instrset64.asm
deleted file mode 100644
index c8cdd34a19..0000000000
--- a/contrib/libs/asmlib/instrset64.asm
+++ /dev/null
@@ -1,184 +0,0 @@
-%include "defs.asm"
-
-;*************************  instrset64.asm  **********************************
-; Author:           Agner Fog
-; Date created:     2003-12-12
-; Last modified:    2013-09-11
-; Source URL:       www.agner.org/optimize
-; Project:          asmlib.zip
-; Language:         assembly, NASM/YASM syntax, 64 bit
-;
-; C++ prototype:
-; extern "C" int InstructionSet (void);
-;
-; Description:
-; This function returns an integer indicating which instruction set is
-; supported by the microprocessor and operating system. A program can
-; call this function to determine if a particular set of instructions can
-; be used.
-;
-; The method used here for detecting whether XMM instructions are enabled by
-; the operating system is different from the method recommended by Intel.
-; The method used here has the advantage that it is independent of the 
-; ability of the operating system to catch invalid opcode exceptions. The
-; method used here has been thoroughly tested on many different versions of
-; Intel and AMD microprocessors, and is believed to work reliably. For further
-; discussion of this method, see my manual "Optimizing subroutines in assembly
-; language" (www.agner.org/optimize/).
-; 
-; Copyright (c) 2003-2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-;
-; ********** InstructionSet function **********
-; C++ prototype:
-; extern "C" int InstructionSet (void);
-;
-; return value:
-;  0 =  80386 instruction set only
-;  1 or above = MMX instructions supported
-;  2 or above = conditional move and FCOMI supported
-;  3 or above = SSE (XMM) supported by processor and operating system
-;  4 or above = SSE2 supported
-;  5 or above = SSE3 supported
-;  6 or above = Supplementary SSE3
-;  8 or above = SSE4.1 supported
-;  9 or above = POPCNT supported
-; 10 or above = SSE4.2 supported
-; 11 or above = AVX supported by processor and operating system
-; 12 or above = PCLMUL and AES supported
-; 13 or above = AVX2 supported
-; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
-; 15 or above = HLE + RTM supported (not detected by the code below, which returns at most 14)
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-global InstructionSet: function
-global IInstrSet
-
-
-SECTION .data
-align 16
-
-; Cached result of InstructionSet. Both labels name the same dword.
-; The initial value -1 marks "not computed yet"; InstructionSet tests the sign bit.
-IInstrSet@:                            ; local name to avoid problems in shared objects
-IInstrSet:  dd      -1                 ; this global variable is valid after first call
-
-
-SECTION .text  align=16
-
-; ********** InstructionSet function **********
-; C++ prototype:
-; extern "C" int InstructionSet (void);
-
-; return value:
-;  4 or above = SSE2 supported
-;  5 or above = SSE3 supported
-;  6 or above = Supplementary SSE3 supported
-;  8 or above = SSE4.1 supported
-;  9 or above = POPCNT supported
-; 10 or above = SSE4.2 supported
-; 11 or above = AVX supported by processor and operating system
-; 12 or above = PCLMUL and AES supported
-
-
-InstructionSet:
-        ; Returns the supported instruction set level in eax.
-        ; The level is computed on the first call and cached in IInstrSet@; later calls
-        ; return the cached value. The checks form a ladder: the first missing feature
-        ; ends the detection at the level reached so far. Level 7 is never produced
-        ; (the code goes from 6 straight to 8) and the highest level returned is 14.
-        ; rbx is preserved; rcx and rdx are overwritten by cpuid on the first call.
-        ; Check if this function has been called before
-        mov     eax, [IInstrSet@]
-        test    eax, eax
-        js      FirstTime              ; Negative means first time
-        ; Early return. Has been called before
-        ret                            ; Return value is in eax
-
-FirstTime:
-        push    rbx                    ; cpuid overwrites ebx
-
-        mov     eax, 1
-        cpuid                          ; get features into edx and ecx
-        
-        mov     eax, 4                 ; at least SSE2 supported in 64 bit mode
-        test    ecx, 1                 ; SSE3 support by microprocessor
-        jz      ISEND
-        inc     eax                    ; 5
-        
-        bt      ecx, 9                 ; Suppl-SSE3 support by microprocessor
-        jnc     ISEND
-        inc     eax                    ; 6
-        
-        bt      ecx, 19                ; SSE4.1 support by microprocessor
-        jnc     ISEND
-        mov     al, 8                  ; 8        
-        
-        bt      ecx, 23                ; POPCNT support by microprocessor
-        jnc     ISEND
-        inc     eax                    ; 9
-        
-        bt      ecx, 20                ; SSE4.2 support by microprocessor
-        jnc     ISEND
-        inc     eax                    ; 10
-
-        ; check OS support for YMM registers (AVX)
-        bt      ecx, 27                ; OSXSAVE: XGETBV supported
-        jnc     ISEND
-        push    rax                    ; XGETBV overwrites eax and edx; ecx selects the register
-        push    rcx
-        push    rdx
-        xor     ecx, ecx
-        db      0FH, 01H, 0D0H         ; XGETBV
-        and     eax, 6
-        cmp     eax, 6                 ; AVX support by OS
-        pop     rdx                    ; pop leaves the flags from cmp intact for the jne below
-        pop     rcx
-        pop     rax
-        jne     ISEND
-
-        bt      ecx, 28                ; AVX support by microprocessor
-        jnc     ISEND
-        inc     eax                    ; 11
-        
-        bt      ecx, 1                 ; PCLMUL support
-        jnc     ISEND
-        bt      ecx, 25                ; AES support
-        jnc     ISEND
-        inc     eax                    ; 12
-        
-        push    rax                    ; keep the level and the leaf 1 feature bits in ecx
-        push    rcx
-        mov     eax, 7
-        xor     ecx, ecx
-        cpuid                          ; check for AVX2
-        bt      ebx, 5
-        pop     rcx                    ; ebx is not restored: it keeps the leaf 7 bits tested below
-        pop     rax
-        jnc     ISEND
-        inc     eax                    ; 13
-        
-; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
-        bt      ecx, 12                ; FMA3
-        jnc     ISEND
-        bt      ecx, 29                ; F16C
-        jnc     ISEND
-        bt      ebx, 3                 ; BMI1
-        jnc     ISEND
-        bt      ebx, 8                 ; BMI2
-        jnc     ISEND
-        
-        push    rax
-        push    rbx
-        push    rcx
-        mov     eax, 80000001H
-        cpuid
-        bt      ecx, 5                 ; LZCNT
-        pop     rcx                    ; pop leaves the carry flag from bt intact for the jnc below
-        pop     rbx
-        pop     rax
-        jnc     ISEND
-        
-        inc     eax                    ; 14
-       
-ISEND:  mov     [IInstrSet@], eax      ; save value in global variable
-
-        pop     rbx
-        ret                            ; return value is in eax
-
-;InstructionSet ENDP
diff --git a/contrib/libs/asmlib/memcmp64.asm b/contrib/libs/asmlib/memcmp64.asm
deleted file mode 100644
index b8a8ab5fbc..0000000000
--- a/contrib/libs/asmlib/memcmp64.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-%include "defs.asm"
-
-;*************************  memcmp64.asm  *************************************
-; Author:           Agner Fog
-; Date created:     2013-10-03
-; Last modified:    2013-10-03
-; Description:
-; Faster version of the standard memcmp function:
-;
-; int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
-;
-; Compares two memory blocks of size count.
-; The return value is zero if the two memory blocks ptr1 and ptr2 are equal
-; The return value is positive if the first differing byte of ptr1 is bigger 
-; than ptr2 when compared as unsigned bytes.
-; The return value is negative if the first differing byte of ptr1 is smaller 
-; than ptr2 when compared as unsigned bytes.
-;
-; Overriding standard function memcmp:
-; The alias ?OVR_memcmp is changed to _memcmp in the object file if
-; it is desired to override the standard library function memcmp.
-;
-; Optimization:
-; Uses XMM registers if SSE2 is available, uses YMM registers if AVX2.
-;
-; The latest version of this file is available at:
-; www.agner.org/optimize/asmexamples.zip
-; Copyright (c) 2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-global A_memcmp: function              ; Public entry point; jumps through memcmpDispatch
-global EXP(memcmp): function           ; ?OVR_ removed if standard function memcmp overridden (EXP is not defined in this file; presumably from defs.asm)
-; Direct entries to CPU-specific versions
-global memcmpSSE2: function            ; SSE2 version
-global memcmpAVX2: function            ; AVX2 version
-
-; Imported from instrset64.asm
-extern InstructionSet                 ; Called by memcmpCPUDispatch; its eax result selects the version
-
-default rel                            ; make [label] memory operands RIP-relative by default
-
-; Map par1-par3 to the argument registers and par4/par4d to a scratch register
-%IFDEF  WINDOWS
-%define par1   rcx                     ; function parameter 1 (ptr1)
-%define par2   rdx                     ; function parameter 2 (ptr2)
-%define par3   r8                      ; function parameter 3 (count)
-%define par4   r9                      ; scratch register
-%define par4d  r9d                     ; low 32 bits of the scratch register
-%ENDIF
-%IFDEF  UNIX
-%define par1   rdi                     ; function parameter 1 (ptr1)
-%define par2   rsi                     ; function parameter 2 (ptr2)
-%define par3   rdx                     ; function parameter 3 (count)
-%define par4   rcx                     ; scratch register
-%define par4d  ecx                     ; low 32 bits of the scratch register
-%ENDIF
-
-
-
-SECTION .text  align=16
-
-; extern "C" int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
-; Function entry: both labels below share one address and consist of a single indirect jump
-A_memcmp:
-EXP(memcmp):
-        jmp     qword [memcmpDispatch] ; Go to appropriate version, depending on instruction set (memcmpCPUDispatch until the pointer has been rewritten)
-
-
-align 16
-memcmpAVX2:    ; AVX2 version: 32 bytes per iteration in ymm registers, then a 16/8/4/2/1-byte tail
-memcmpAVX2@:   ; internal reference, loaded by memcmpCPUDispatch
-
-        add     par1, par3                       ; par1 = end of block 1: use negative index from end of memory block
-        add     par2, par3                       ; par2 = end of block 2
-        neg     par3                             ; par3 = -count, counts up towards zero
-        jz      A900                             ; count == 0: report equal
-        mov     par4d, 0FFFFH                    ; constant used below to invert the low 16 bits of a byte mask
-        cmp     par3, -32
-        ja      A100                             ; unsigned compare: fewer than 32 bytes, skip the ymm loop
-        
-A000:   ; loop comparing 32 bytes
-        vmovdqu   ymm1, [par1+par3]
-        vpcmpeqb  ymm0, ymm1, [par2+par3]        ; compare 32 bytes
-        vpmovmskb eax, ymm0                      ; get byte mask
-        xor     eax, -1                          ; not eax would not set flags
-        jnz     A700                             ; difference found
-        add     par3, 32
-        jz      A900                             ; finished, equal
-        cmp     par3, -32
-        jna     A000                             ; next 32 bytes
-        vzeroupper                               ; end ymm state
-        
-A100:   ; less than 32 bytes left
-        cmp     par3, -16
-        ja      A200                             ; fewer than 16 bytes left
-        movdqu  xmm1, [par1+par3]
-        movdqu  xmm2, [par2+par3]
-        pcmpeqb xmm1, xmm2                       ; compare 16 bytes
-        pmovmskb eax, xmm1                       ; get byte mask
-        xor     eax, par4d                       ; invert lower 16 bits
-        jnz     A701                             ; difference found
-        add     par3, 16
-        jz      A901                             ; finished, equal
-        
-A200:   ; less than 16 bytes left
-        cmp     par3, -8
-        ja      A300                             ; fewer than 8 bytes left
-        ; compare 8 bytes
-        movq    xmm1, [par1+par3]
-        movq    xmm2, [par2+par3]
-        pcmpeqb xmm1, xmm2                       ; compare 8 bytes
-        pmovmskb eax, xmm1                       ; get byte mask
-        xor     eax, par4d                       ; invert lower 16 bits
-        jnz     A701                             ; difference found
-        add     par3, 8
-        jz      A901                             ; finished, equal
-        
-A300:   ; less than 8 bytes left
-        cmp     par3, -4
-        ja      A400                             ; fewer than 4 bytes left
-        ; compare 4 bytes
-        movd    xmm1, [par1+par3]
-        movd    xmm2, [par2+par3]
-        pcmpeqb xmm1, xmm2                       ; compare 4 bytes
-        pmovmskb eax, xmm1                       ; get byte mask
-        xor     eax, par4d                         ; not ax
-        jnz     A701                             ; difference found
-        add     par3, 4
-        jz      A901                             ; finished, equal
-
-A400:   ; less than 4 bytes left
-        cmp     par3, -2
-        ja      A500                             ; fewer than 2 bytes left
-        movzx   eax, word [par1+par3]            ; load 2 bytes from each block
-        movzx   par4d, word [par2+par3]          ; par4d now holds data; the 0FFFFH mask is not used below
-        sub     eax, par4d                       ; nonzero if the two words differ
-        jnz     A800                             ; difference in byte 0 or 1
-        add     par3, 2
-        jz      A901                             ; finished, equal
-        
-A500:   ; less than 2 bytes left
-        test    par3, par3
-        jz      A901                             ; no bytes left
-        
-A600:   ; one byte left
-        movzx   eax, byte [par1+par3]
-        movzx   par4d, byte [par2+par3]
-        sub     eax, par4d                         ; return result: byte of block 1 minus byte of block 2
-        ret
-
-A700:   ; difference found. find position
-        vzeroupper                               ; end ymm state (this label is reached only from the 32-byte loop)
-A701:   ; entry used by the xmm paths: eax = inverted byte mask, set bits mark differing bytes
-        bsf     eax, eax                         ; lowest set bit = offset of the first differing byte
-        add     par3, rax                        ; index of that byte
-        movzx   eax, byte [par1+par3]
-        movzx   par4d, byte [par2+par3]
-        sub     eax, par4d                         ; return result: byte of block 1 minus byte of block 2
-        ret
-
-A800:   ; difference in byte 0 or 1
-        neg     al                               ; carry set if al != 0, i.e. byte 0 already differs
-        sbb     par3, -1                           ; add 1 to par3 if al == 0
-        movzx   eax, byte [par1+par3]
-        movzx   par4d, byte [par2+par3]
-        sub     eax, par4d                         ; return result: byte of block 1 minus byte of block 2
-        ret
-
-A900:   ; equal
-        vzeroupper                               ; end ymm state
-A901:   xor     eax, eax                         ; return 0
-        ret
-        
-
-memcmpSSE2:    ; SSE2 version: 16 bytes per iteration in xmm registers, then an 8/4/2/1-byte tail
-memcmpSSE2@:   ; internal reference, loaded by memcmpCPUDispatch
-
-        add     par1, par3                         ; par1 = end of block 1: use negative index from end of memory block
-        add     par2, par3                         ; par2 = end of block 2
-        neg     par3                               ; par3 = -count, counts up towards zero
-        jz      S900                               ; count == 0: report equal
-        mov     par4d, 0FFFFH                      ; constant used below to invert the low 16 bits of a byte mask
-        cmp     par3, -16
-        ja      S200                               ; unsigned compare: fewer than 16 bytes, skip the loop
-        
-S100:   ; loop comparing 16 bytes
-        movdqu  xmm1, [par1+par3]
-        movdqu  xmm2, [par2+par3]
-        pcmpeqb xmm1, xmm2                       ; compare 16 bytes
-        pmovmskb eax, xmm1                       ; get byte mask
-        xor     eax, par4d                         ; not ax
-        jnz     S700                             ; difference found
-        add     par3, 16
-        jz      S900                             ; finished, equal
-        cmp     par3, -16
-        jna     S100                             ; next 16 bytes
-        
-S200:   ; less than 16 bytes left
-        cmp     par3, -8
-        ja      S300                             ; fewer than 8 bytes left
-        ; compare 8 bytes
-        movq    xmm1, [par1+par3]
-        movq    xmm2, [par2+par3]
-        pcmpeqb xmm1, xmm2                       ; compare 8 bytes
-        pmovmskb eax, xmm1                       ; get byte mask
-        xor     eax, par4d                         ; not ax
-        jnz     S700                             ; difference found
-        add     par3, 8
-        jz      S900                             ; finished, equal
-        
-S300:   ; less than 8 bytes left
-        cmp     par3, -4
-        ja      S400                             ; fewer than 4 bytes left
-        ; compare 4 bytes
-        movd    xmm1, [par1+par3]
-        movd    xmm2, [par2+par3]
-        pcmpeqb xmm1, xmm2                       ; compare 4 bytes
-        pmovmskb eax, xmm1                       ; get byte mask
-        xor     eax, par4d                         ; not ax
-        jnz     S700                             ; difference found
-        add     par3, 4
-        jz      S900                             ; finished, equal
-
-S400:   ; less than 4 bytes left
-        cmp     par3, -2
-        ja      S500                             ; fewer than 2 bytes left
-        movzx   eax, word [par1+par3]            ; load 2 bytes from each block
-        movzx   par4d, word [par2+par3]          ; par4d now holds data; the 0FFFFH mask is not used below
-        sub     eax, par4d                       ; nonzero if the two words differ
-        jnz     S800                             ; difference in byte 0 or 1
-        add     par3, 2
-        jz      S900                             ; finished, equal
-        
-S500:   ; less than 2 bytes left
-        test    par3, par3
-        jz      S900                             ; no bytes left
-        
-        ; one byte left
-        movzx   eax, byte [par1+par3]
-        movzx   par4d, byte [par2+par3]
-        sub     eax, par4d                         ; return result: byte of block 1 minus byte of block 2
-        ret
-
-S700:   ; difference found. find position
-        bsf     eax, eax                         ; lowest set bit = offset of the first differing byte
-        add     par3, rax                        ; index of that byte
-        movzx   eax, byte [par1+par3]
-        movzx   par4d, byte [par2+par3]
-        sub     eax, par4d                         ; return result: byte of block 1 minus byte of block 2
-        ret
-
-S800:   ; difference in byte 0 or 1
-        neg     al                                ; carry set if al != 0, i.e. byte 0 already differs
-        sbb     par3, -1                          ; add 1 to par3 if al == 0
-S820:   movzx   eax, byte [par1+par3]
-        movzx   par4d, byte [par2+par3]
-        sub     eax, par4d                         ; return result: byte of block 1 minus byte of block 2
-        ret
-
-S900:   ; equal
-        xor     eax, eax                         ; return 0
-        ret
-
-        
-; CPU dispatching for memcmp. This is executed only once: it stores the chosen version in memcmpDispatch and then jumps to it
-memcmpCPUDispatch:
-        push    par1                           ; keep the caller's three arguments across the call below
-        push    par2
-        push    par3        
-        call    InstructionSet                         ; get supported instruction set
-        ; SSE2 always supported
-        lea     par4, [memcmpSSE2@]            ; default choice
-        cmp     eax, 13                ; check AVX2
-        jb      Q100                           ; level below 13: keep the SSE2 version
-        ; AVX2 supported
-        lea     par4, [memcmpAVX2@]        
-Q100:   ; save pointer
-        mov     qword [memcmpDispatch], par4   ; later calls jump straight to the chosen version
-; Continue in appropriate version of memcmp
-        pop     par3                           ; restore the arguments in reverse push order
-        pop     par2
-        pop     par1
-        jmp     par4                           ; tail-jump: the chosen version returns to the original caller
-
-
-SECTION .data
-align 16
-
-
-; Pointer to appropriate version.
-; This initially points to memcmpCPUDispatch. memcmpCPUDispatch will
-; change this to the appropriate version of memcmp, so that
-; memcmpCPUDispatch is only executed once:
-memcmpDispatch DQ memcmpCPUDispatch    ; 64-bit code pointer read by the jmp at A_memcmp
-
diff --git a/contrib/libs/asmlib/memcpy64.asm b/contrib/libs/asmlib/memcpy64.asm
deleted file mode 100644
index d590990b99..0000000000
--- a/contrib/libs/asmlib/memcpy64.asm
+++ /dev/null
@@ -1,1332 +0,0 @@
-%include "defs.asm"
-
-;*************************  memcpy64.asm  ************************************
-; Author:           Agner Fog
-; Date created:     2008-07-19
-; Last modified:    2016-11-12 (patched version with AVX512 support removed)
-;
-; Description:
-; Faster version of the standard memcpy function:
-; void * A_memcpy(void *dest, const void *src, size_t count);
-; Copies 'count' bytes from 'src' to 'dest'
-;
-; Overriding standard function memcpy:
-; The alias ?OVR_memcpy is changed to _memcpy in the object file if
-; it is desired to override the standard library function memcpy.
-;
-; The function uses non-temporal writes to bypass the cache when the size is
-; bigger than half the size of the largest_level cache. This limit can be
-; read with GetMemcpyCacheLimit and changed with SetMemcpyCacheLimit
-; C++ prototypes:
-; extern "C" size_t GetMemcpyCacheLimit();  // in memcpy64.asm
-; extern "C" void SetMemcpyCacheLimit();    // in memmove64.asm
-; extern "C" void SetMemcpyCacheLimit1();   // used internally
-;
-; Position-independent code is generated if POSITIONINDEPENDENT is defined.
-;
-; CPU dispatching includes SSE2, Suppl-SSE3 and AVX instruction sets.
-;
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel                            ; make [label] memory operands RIP-relative by default
-
-global A_memcpy: function              ; Function A_memcpy: public entry, jumps through memcpyDispatch
-global EXP(memcpy): function           ; ?OVR removed if standard function memcpy overridden
-global memcpySSE2: function            ; Version for processors with only SSE2
-global memcpySSSE3: function           ; Version for processors with SSSE3
-global memcpyU: function               ; Version for processors with fast unaligned read
-global memcpyU256: function            ; Version for processors with fast 256-bit read/write
-
-global GetMemcpyCacheLimit: function   ; Get the size limit for bypassing cache when copying with memcpy and memmove
-global SetMemcpyCacheLimit1: function  ; Set the size limit for bypassing cache when copying with memcpy
-
-
-; Imported from instrset64.asm
-extern InstructionSet                  ; Instruction set for CPU dispatcher
-
-; Imported from unalignedisfaster64.asm:
-extern UnalignedIsFaster               ; Tells if unaligned read is faster than PALIGNR
-extern Store256BitIsFaster             ; Tells if a 256 bit store is faster than two 128 bit stores
-
-; Imported from cachesize32.asm (NOTE(review): presumably cachesize64.asm in this 64-bit build; confirm):
-extern DataCacheSize                   ; Gets size of data cache
-
-
-; Define prolog for this function: afterwards rdi = dest, rsi = src, rcx = count, r9 = copy of dest
-%MACRO  PROLOGM  0
-%IFDEF  WINDOWS
-        push    rsi                    ; popped again by RETURNM
-        push    rdi                    ; popped again by RETURNM
-        mov     rdi, rcx               ; dest
-        mov     r9,  rcx               ; dest, kept for the return value
-        mov     rsi, rdx               ; src
-        mov     rcx, r8                ; count
-%ELSE   ; Unix
-        mov     rcx, rdx               ; count
-        mov     r9,  rdi               ; dest, kept for the return value
-%ENDIF
-%ENDM
-
-; Define return from this function: undoes PROLOGM's pushes (Windows only) and returns dest in rax
-%MACRO  RETURNM  0
-%IFDEF  WINDOWS
-        pop     rdi                    ; reverse order of the pushes in PROLOGM
-        pop     rsi
-%ENDIF
-        mov     rax, r9                ; Return value = dest
-        ret
-%ENDM
-
-
-SECTION .text  align=16
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;                          Common entry for dispatch
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
-; Function entry: both labels below share one address and consist of a single indirect jump
-A_memcpy:
-EXP(memcpy):
-        jmp     qword [memcpyDispatch] ; Go to appropriate version, depending on instruction set (memcpyDispatch is defined outside this excerpt)
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; AVX Version for processors with fast unaligned read and fast 32 bytes write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memcpyU256:   ; global label: aligns dest to 32, then copies 32 bytes per iteration through ymm0
-memcpyU256@:  ; local label
-        PROLOGM                        ; rdi = dest, rsi = src, rcx = count, r9 = dest
-        cmp     rcx, 40H
-        jb      A1000                  ; Use simpler code if count < 64
-
-        ; count >= 64
-        ; Calculate size of first block up to first regular boundary of dest
-        mov     edx, edi
-        neg     edx
-        and     edx, 1FH               ; edx = (-dest) mod 32
-        jz      B3100                    ; Skip if dest aligned by 32
-
-        ; edx = size of first partial block, 1 - 31 bytes
-        test    dl, 3
-        jz      B3030                  ; neither a 1-byte nor a 2-byte piece is needed
-        test    dl, 1
-        jz      B3020
-        ; move 1 byte
-        movzx   eax, byte [rsi]
-        mov     [rdi], al
-        inc     rsi
-        inc     rdi
-B3020:  test    dl, 2
-        jz      B3030
-        ; move 2 bytes
-        movzx   eax, word [rsi]
-        mov     [rdi], ax
-        add     rsi, 2
-        add     rdi, 2
-B3030:  test    dl, 4
-        jz      B3040
-        ; move 4 bytes
-        mov     eax, [rsi]
-        mov     [rdi], eax
-        add     rsi, 4
-        add     rdi, 4
-B3040:  test    dl, 8
-        jz      B3050
-        ; move 8 bytes
-        mov     rax, [rsi]
-        mov     [rdi], rax
-        add     rsi, 8
-        add     rdi, 8
-B3050:  test    dl, 16
-        jz      B3060
-        ; move 16 bytes
-        movups  xmm0, [rsi]
-        movaps  [rdi], xmm0
-        add     rsi, 16
-        add     rdi, 16
-B3060:  sub     rcx, rdx               ; count minus the bytes just moved
-
-B3100:  ; Now dest is aligned by 32. Any partial block has been moved
-
-        ; Set up for loop moving 32 bytes per iteration:
-        mov     rdx, rcx               ; Save count
-        and     rcx, -20H              ; Round down to nearest multiple of 32
-        add     rsi, rcx               ; Point to the end
-        add     rdi, rcx               ; Point to the end
-        sub     rdx, rcx               ; Remaining data after loop
-        ; Check if count very big
-        cmp     rcx, [CacheBypassLimit]
-        ja      I3100                  ; Use non-temporal store if count > CacheBypassLimit
-        neg     rcx                    ; Negative index from the end
-
-H3100:  ; copy -rcx bytes in blocks of 32 bytes.
-
-        ; Check for false memory dependence: The CPU may falsely assume
-        ; a partial overlap between the written destination and the following
-        ; read source if source is unaligned and
-        ; (src-dest) modulo 4096  is close to 4096
-        test    sil, 1FH
-        jz      H3110                  ; aligned
-        mov     eax, esi
-        sub     eax, edi
-        and     eax, 0FFFH             ; modulo 4096
-        cmp     eax, 1000H - 200H
-        ja      J3100                  ; less than 512 below 4096: take the false-dependence path
-
-align 16
-H3110:  ; main copy loop, 32 bytes at a time
-        ; rcx has negative index from the end, counting up to zero
-        vmovups ymm0, [rsi+rcx]
-        vmovaps [rdi+rcx], ymm0
-        add     rcx, 20H
-        jnz     H3110
-        sfence
-        vzeroupper                     ; end of AVX mode
-
-H3120:  ; Move the remaining edx bytes (0 - 31):
-        add     rsi, rdx
-        add     rdi, rdx
-        neg     rdx
-        jz      H3500                  ; Skip if no more data
-        ; move 16-8-4-2-1 bytes, aligned
-        cmp     edx, -10H
-        jg      H3200                  ; signed compare: fewer than 16 bytes left
-        ; move 16 bytes
-        movups  xmm0, [rsi+rdx]
-        movaps  [rdi+rdx], xmm0
-        add     rdx, 10H
-H3200:  cmp     edx, -8
-        jg      H3210
-        ; move 8 bytes
-        movq    xmm0, qword [rsi+rdx]
-        movq    qword [rdi+rdx], xmm0
-        add     rdx, 8
-        jz      H500                   ; Early skip if count divisible by 8. NOTE(review): targets H500 in memcpyU, not the local H3500; both labels are followed by RETURNM
-H3210:  cmp     edx, -4
-        jg      H3220
-        ; move 4 bytes
-        mov     eax, [rsi+rdx]
-        mov     [rdi+rdx], eax
-        add     rdx, 4
-H3220:  cmp     edx, -2
-        jg      H3230
-        ; move 2 bytes
-        movzx   eax, word [rsi+rdx]
-        mov     [rdi+rdx], ax
-        add     rdx, 2
-H3230:  cmp     edx, -1
-        jg      H3500
-        ; move 1 byte
-        movzx   eax, byte [rsi+rdx]
-        mov     [rdi+rdx], al
-H3500:  ; finished
-        RETURNM
-
-I3100:   ; non-temporal move
-        neg     rcx                    ; Negative index from the end
-
-align 16
-I3110:  ; main copy loop, 32 bytes at a time
-        ; rcx has negative index from the end, counting up to zero
-        vmovups ymm0, [rsi+rcx]
-        vmovntps [rdi+rcx], ymm0
-        add     rcx, 20H
-        jnz     I3110
-        sfence
-        vzeroupper                      ; end of AVX mode
-        jmp     H3120                  ; Move the remaining edx bytes (0 - 31)
-
-
-align 16
-J3100:  ; There is a false memory dependence.
-        ; check if src and dest overlap, if not then it is safe
-        ; to copy backwards to avoid false memory dependence
-%if 1
-        ; Use this version if you want consistent behavior in the case
-        ; where dest > src and overlap. However, this case is undefined
-        ; anyway because part of src is overwritten before copying
-        push    rdx        ; cqo below overwrites rdx, which holds the remaining byte count
-        mov     rax, rsi
-        sub     rax, rdi
-        cqo                ; rdx = sign of rax (0 or -1)
-        xor     rax, rdx
-        sub     rax, rdx   ; abs(src-dest)
-        neg     rcx        ; size
-        pop     rdx        ; restore rdx
-        cmp     rax, rcx
-        jnb     J3110      ; distance >= size: no overlap, copy backwards
-        neg     rcx        ; restore rcx
-        jmp     H3110      ; overlap between src and dest. Can't copy backwards
-%else
-        ; save time by not checking the case that is undefined anyway
-        mov     rax, rsi
-        sub     rax, rdi
-        neg     rcx        ; size
-        cmp     rax, rcx
-        jnb     J3110      ; OK to copy backwards
-        ; must copy forwards
-        neg     rcx        ; restore ecx
-        jmp     H3110      ; copy forwards
-
-%endif
-
-J3110:   ; copy backwards, rcx = size. rsi, rdi = end of src, dest
-        push    rsi        ; keep the end pointers for the tail code at H3120
-        push    rdi
-        sub     rsi, rcx   ; rsi, rdi = start of the region copied by this loop
-        sub     rdi, rcx
-J3120:  ; loop backwards
-        vmovups ymm0, [rsi+rcx-20H]
-        vmovaps [rdi+rcx-20H], ymm0
-        sub     rcx, 20H
-        jnz     J3120      ; until the index reaches zero
-        sfence
-        vzeroupper
-        pop     rdi        ; back to the end pointers
-        pop     rsi
-        jmp     H3120
-
-align 16
-        ; count < 64. Move 32-16-8-4-2-1 bytes
-        ; multiple CPU versions (SSSE3 and above): jumped to from memcpyU256, memcpyU and memcpySSSE3
-A1000:  add     rsi, rcx               ; end of src
-        add     rdi, rcx               ; end of dest
-        neg     rcx                    ; negative index from the end
-        cmp     ecx, -20H
-        jg      A1100                  ; signed compare: fewer than 32 bytes left
-        ; move 32 bytes
-        ; movdqu is faster than 64-bit moves on processors with SSSE3
-        movups  xmm0, [rsi+rcx]
-        movups  xmm1, [rsi+rcx+10H]
-        movups  [rdi+rcx], xmm0
-        movups  [rdi+rcx+10H], xmm1
-        add     rcx, 20H
-A1100:  cmp     ecx, -10H
-        jg      A1200                  ; fewer than 16 bytes left
-        ; move 16 bytes
-        movups  xmm0, [rsi+rcx]
-        movups  [rdi+rcx], xmm0
-        add     rcx, 10H
-A1200:  cmp     ecx, -8
-        jg      A1300                  ; fewer than 8 bytes left
-        ; move 8 bytes
-        mov     rax, qword [rsi+rcx]
-        mov     qword [rdi+rcx], rax
-        add     rcx, 8
-A1300:  cmp     ecx, -4
-        jg      A1400                  ; fewer than 4 bytes left
-        ; move 4 bytes
-        mov     eax, [rsi+rcx]
-        mov     [rdi+rcx], eax
-        add     rcx, 4
-        jz      A1900                     ; early out if count divisible by 4
-A1400:  cmp     ecx, -2
-        jg      A1500                  ; fewer than 2 bytes left
-        ; move 2 bytes
-        movzx   eax, word [rsi+rcx]
-        mov     [rdi+rcx], ax
-        add     rcx, 2
-A1500:  cmp     ecx, -1
-        jg      A1900                  ; nothing left
-        ; move 1 byte
-        movzx   eax, byte [rsi+rcx]
-        mov     [rdi+rcx], al
-A1900:  ; finished
-        RETURNM
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;  Version for processors with fast unaligned read and fast 16 bytes write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memcpyU:   ; global label: aligns dest to 16, then copies 32 bytes per iteration through xmm0/xmm1
-memcpyU@:  ; local label
-        PROLOGM                        ; rdi = dest, rsi = src, rcx = count, r9 = dest
-        cmp     rcx, 40H
-        jb      A1000                  ; Use simpler code if count < 64
-
-        ; count >= 64
-        ; Calculate size of first block up to first regular boundary of dest
-        mov     edx, edi
-        neg     edx
-        and     edx, 0FH               ; edx = (-dest) mod 16
-        jz      B2100                    ; Skip if dest aligned by 16
-
-        ; edx = size of first partial block, 1 - 15 bytes
-        test    dl, 3
-        jz      B2030                  ; neither a 1-byte nor a 2-byte piece is needed
-        test    dl, 1
-        jz      B2020
-        ; move 1 byte
-        movzx   eax, byte [rsi]
-        mov     [rdi], al
-        inc     rsi
-        inc     rdi
-B2020:  test    dl, 2
-        jz      B2030
-        ; move 2 bytes
-        movzx   eax, word [rsi]
-        mov     [rdi], ax
-        add     rsi, 2
-        add     rdi, 2
-B2030:  test    dl, 4
-        jz      B2040
-        ; move 4 bytes
-        mov     eax, [rsi]
-        mov     [rdi], eax
-        add     rsi, 4
-        add     rdi, 4
-B2040:  test    dl, 8
-        jz      B2050
-        ; move 8 bytes
-        mov     rax, [rsi]
-        mov     [rdi], rax
-        add     rsi, 8
-        add     rdi, 8
-B2050:  sub     rcx, rdx               ; count minus the bytes just moved
-B2100:  ; Now dest is aligned by 16. Any partial block has been moved
-
-        ; Set up for loop moving 32 bytes per iteration:
-        mov     rdx, rcx               ; Save count
-        and     rcx, -20H              ; Round down to nearest multiple of 32
-        add     rsi, rcx               ; Point to the end
-        add     rdi, rcx               ; Point to the end
-        sub     rdx, rcx               ; Remaining data after loop
-
-        ; Check if count very big
-        cmp     rcx, [CacheBypassLimit]
-        ja      I100                   ; Use non-temporal store if count > CacheBypassLimit
-        neg     rcx                    ; Negative index from the end
-
-H100:   ; copy -rcx bytes in blocks of 32 bytes.
-
-        ; Check for false memory dependence: The CPU may falsely assume
-        ; a partial overlap between the written destination and the following
-        ; read source if source is unaligned and
-        ; (src-dest) modulo 4096 is close to 4096
-        test    sil, 0FH
-        jz      H110                   ; aligned
-        mov     eax, esi
-        sub     eax, edi
-        and     eax, 0FFFH             ; modulo 4096
-        cmp     eax, 1000H - 200H
-        ja      J100                   ; less than 512 below 4096: take the false-dependence path
-
-H110:   ; main copy loop, 32 bytes at a time
-        ; rcx has negative index from the end, counting up to zero
-        movups  xmm0, [rsi+rcx]
-        movups  xmm1, [rsi+rcx+10H]
-        movaps  [rdi+rcx], xmm0
-        movaps  [rdi+rcx+10H], xmm1
-        add     rcx, 20H
-        jnz     H110
-
-H120:   ; Move the remaining edx bytes (0 - 31):
-        add     rsi, rdx
-        add     rdi, rdx
-        neg     rdx
-        jz      H500                   ; Skip if no more data
-        ; move 16-8-4-2-1 bytes, aligned
-        cmp     edx, -10H
-        jg      H200                   ; signed compare: fewer than 16 bytes left
-        ; move 16 bytes
-        movups  xmm0, [rsi+rdx]
-        movaps  [rdi+rdx], xmm0
-        add     rdx, 10H
-H200:   cmp     edx, -8
-        jg      H210
-        ; move 8 bytes
-        movq    xmm0, qword [rsi+rdx]
-        movq    qword [rdi+rdx], xmm0
-        add     rdx, 8
-        jz      H500                   ; Early skip if count divisible by 8
-H210:   cmp     edx, -4
-        jg      H220
-        ; move 4 bytes
-        mov     eax, [rsi+rdx]
-        mov     [rdi+rdx], eax
-        add     rdx, 4
-H220:   cmp     edx, -2
-        jg      H230
-        ; move 2 bytes
-        movzx   eax, word [rsi+rdx]
-        mov     [rdi+rdx], ax
-        add     rdx, 2
-H230:   cmp     edx, -1
-        jg      H500
-        ; move 1 byte
-        movzx   eax, byte [rsi+rdx]
-        mov     [rdi+rdx], al
-H500:   ; finished (also the target of a jz in memcpyU256's tail code)
-        RETURNM
-
-I100:   ; non-temporal move
-        neg     rcx                    ; Negative index from the end
-
-align 16
-I110:   ; main copy loop, 32 bytes at a time
-        ; rcx has negative index from the end, counting up to zero
-        movups  xmm0, [rsi+rcx]
-        movups  xmm1, [rsi+rcx+10H]
-        movntps [rdi+rcx], xmm0
-        movntps [rdi+rcx+10H], xmm1
-        add     rcx, 20H
-        jnz     I110
-        sfence
-        jmp     H120                  ; Move the remaining edx bytes (0 - 31):
-
-
-align 16
-J100:   ; There is a false memory dependence.
-        ; check if src and dest overlap, if not then it is safe
-        ; to copy backwards to avoid false memory dependence
-%if 1
-        ; Use this version if you want consistent behavior in the case
-        ; where dest > src and overlap. However, this case is undefined
-        ; anyway because part of src is overwritten before copying
-        push    rdx        ; cqo below overwrites rdx, which holds the remaining byte count
-        mov     rax, rsi
-        sub     rax, rdi
-        cqo                ; rdx = sign of rax (0 or -1)
-        xor     rax, rdx
-        sub     rax, rdx   ; abs(src-dest)
-        neg     rcx        ; size
-        pop     rdx        ; restore rdx
-        cmp     rax, rcx
-        jnb     J110       ; distance >= size: no overlap, copy backwards
-        neg     rcx        ; restore rcx
-        jmp     H110       ; overlap between src and dest. Can't copy backwards
-%else
-        ; save time by not checking the case that is undefined anyway
-        mov     rax, rsi
-        sub     rax, rdi
-        neg     rcx        ; size
-        cmp     rax, rcx
-        jnb     J110       ; OK to copy backwards
-        ; must copy forwards
-        neg     rcx        ; restore ecx
-        jmp     H110       ; copy forwards
-
-%endif
-
-J110:   ; copy backwards, rcx = size. rsi, rdi = end of src, dest
-        push    rsi        ; keep the end pointers for the tail code at H120
-        push    rdi
-        sub     rsi, rcx   ; rsi, rdi = start of the region copied by this loop
-        sub     rdi, rcx
-J120:   ; loop backwards
-        movups  xmm1, [rsi+rcx-20H]
-        movups  xmm0, [rsi+rcx-10H]
-        movaps  [rdi+rcx-20H], xmm1
-        movaps  [rdi+rcx-10H], xmm0
-        sub     rcx, 20H
-        jnz     J120       ; until the index reaches zero
-        pop     rdi        ; back to the end pointers
-        pop     rsi
-        jmp     H120
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;  Version for processors with SSSE3. Aligned read + shift + aligned write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memcpySSSE3:     ; global label: aligns dest to 16, then jumps through a table indexed by src alignment
-memcpySSSE3@:    ; local label
-        PROLOGM                        ; rdi = dest, rsi = src, rcx = count, r9 = dest
-        cmp     rcx, 40H
-        jb      A1000                  ; Use simpler code if count < 64
-
-        ; count >= 64
-        ; Calculate size of first block up to first regular boundary of dest
-        mov     edx, edi
-        neg     edx
-        and     edx, 0FH               ; edx = (-dest) mod 16
-        jz      B1200                    ; Skip if dest aligned by 16
-
-        ; edx = size of first partial block, 1 - 15 bytes
-        test    dl, 3
-        jz      B1030                  ; neither a 1-byte nor a 2-byte piece is needed
-        test    dl, 1
-        jz      B1020
-        ; move 1 byte
-        movzx   eax, byte [rsi]
-        mov     [rdi], al
-        inc     rsi
-        inc     rdi
-B1020:  test    dl, 2
-        jz      B1030
-        ; move 2 bytes
-        movzx   eax, word [rsi]
-        mov     [rdi], ax
-        add     rsi, 2
-        add     rdi, 2
-B1030:  test    dl, 4
-        jz      B1040
-        ; move 4 bytes
-        mov     eax, [rsi]
-        mov     [rdi], eax
-        add     rsi, 4
-        add     rdi, 4
-B1040:  test    dl, 8
-        jz      B1050
-        ; move 8 bytes
-        mov     rax, [rsi]
-        mov     [rdi], rax
-        add     rsi, 8
-        add     rdi, 8
-B1050:  sub     rcx, rdx               ; count minus the bytes just moved
-B1200:  ; Now dest is aligned by 16. Any partial block has been moved
-        ; Find alignment of src modulo 16 at this point:
-        mov     eax, esi
-        and     eax, 0FH               ; eax = src mod 16 (0 - 15), the table index used below
-
-        ; Set up for loop moving 32 bytes per iteration:
-        mov     edx, ecx               ; Save count (lower 32 bits)
-        and     rcx, -20H              ; Round down count to nearest multiple of 32
-        add     rsi, rcx               ; Point to the end
-        add     rdi, rcx               ; Point to the end
-        sub     edx, ecx               ; Remaining data after loop (0-31)
-        sub     rsi, rax               ; Nearest preceding aligned block of src
-
-        ; Check if count very big
-        cmp     rcx, [CacheBypassLimit]
-        ja      B1400                   ; Use non-temporal store if count > CacheBypassLimit
-        neg     rcx                    ; Negative index from the end
-
-        ; Dispatch to different codes depending on src alignment
-        lea     r8, [AlignmentDispatchSSSE3]   ; table is defined outside this excerpt
-        jmp     near [r8+rax*8]        ; entries are 8-byte code addresses, one per alignment value
-
-B1400:  neg     rcx                    ; Negative index from the end
-        ; Dispatch to different codes depending on src alignment
-        lea     r8, [AlignmentDispatchNT]      ; table is defined outside this excerpt
-        jmp     near [r8+rax*8]        ; entries are 8-byte code addresses, one per alignment value
-
-
-align   16
-C100:   ; Code for aligned src. SSE2 and SSSE3 versions
-        ; The nice case, src and dest have same alignment.
-
-        ; Loop. rcx has negative index from the end, counting up to zero
-        movaps  xmm0, [rsi+rcx]        ; aligned loads: this path is for 16-byte aligned src
-        movaps  xmm1, [rsi+rcx+10H]
-        movaps  [rdi+rcx], xmm0
-        movaps  [rdi+rcx+10H], xmm1
-        add     rcx, 20H
-        jnz     C100
-
-        ; Move the remaining edx bytes (0 - 31):
-        add     rsi, rdx
-        add     rdi, rdx
-        neg     rdx
-        jz      C500                   ; Skip if no more data
-        ; move 16-8-4-2-1 bytes, aligned
-        cmp     edx, -10H
-        jg      C200                   ; signed compare: fewer than 16 bytes left
-        ; move 16 bytes
-        movaps  xmm0, [rsi+rdx]
-        movaps  [rdi+rdx], xmm0
-        add     rdx, 10H
-C200:   cmp     edx, -8
-        jg      C210                   ; fewer than 8 bytes left
-        ; move 8 bytes
-        mov     rax, [rsi+rdx]
-        mov     [rdi+rdx], rax
-        add     rdx, 8
-        jz      C500                   ; Early skip if count divisible by 8
-C210:   cmp     edx, -4
-        jg      C220                   ; fewer than 4 bytes left
-        ; move 4 bytes
-        mov     eax, [rsi+rdx]
-        mov     [rdi+rdx], eax
-        add     rdx, 4
-C220:   cmp     edx, -2
-        jg      C230                   ; fewer than 2 bytes left
-        ; move 2 bytes
-        movzx   eax, word [rsi+rdx]
-        mov     [rdi+rdx], ax
-        add     rdx, 2
-C230:   cmp     edx, -1
-        jg      C500                   ; nothing left
-        ; move 1 byte
-        movzx   eax, byte [rsi+rdx]
-        mov     [rdi+rdx], al
-C500:   ; finished
-        RETURNM
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;  Version for processors with SSE2. Aligned read + shift + aligned write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-memcpySSE2:     ; global label
-memcpySSE2@:    ; local label
-        PROLOGM
-        cmp     rcx, 40H
-        jae     B0100                   ; count >= 64: block-move code. Fall through to simpler code if count < 64
-
-        ; count < 64. Move 32-16-8-4-2-1 bytes
-        add     rsi, rcx               ; end of src
-        add     rdi, rcx               ; end of dest
-        neg     rcx                    ; negative index from the end
-        cmp     ecx, -20H
-        jg      A100                   ; fewer than 32 bytes left
-        ; move 32 bytes
-        ; mov r64 is faster than movdqu on Intel Pentium M and Core 1
-        ; movdqu is fast on Nehalem and later
-        mov     rax, [rsi+rcx]
-        mov     rdx, [rsi+rcx+8]
-        mov     [rdi+rcx], rax
-        mov     [rdi+rcx+8], rdx
-        mov     rax, qword [rsi+rcx+10H]
-        mov     rdx, qword [rsi+rcx+18H]
-        mov     qword [rdi+rcx+10H], rax
-        mov     qword [rdi+rcx+18H], rdx
-        add     rcx, 20H
-A100:   cmp     ecx, -10H
-        jg      A200                   ; fewer than 16 bytes left
-        ; move 16 bytes
-        mov     rax, [rsi+rcx]
-        mov     rdx, [rsi+rcx+8]
-        mov     [rdi+rcx], rax
-        mov     [rdi+rcx+8], rdx
-        add     rcx, 10H
-A200:   cmp     ecx, -8
-        jg      A300                   ; fewer than 8 bytes left
-        ; move 8 bytes
-        mov     rax, qword [rsi+rcx]
-        mov     qword [rdi+rcx], rax
-        add     rcx, 8
-A300:   cmp     ecx, -4
-        jg      A400                   ; fewer than 4 bytes left
-        ; move 4 bytes
-        mov     eax, [rsi+rcx]
-        mov     [rdi+rcx], eax
-        add     rcx, 4
-        jz      A900                     ; early out if count divisible by 4
-A400:   cmp     ecx, -2
-        jg      A500                   ; fewer than 2 bytes left
-        ; move 2 bytes
-        movzx   eax, word [rsi+rcx]
-        mov     [rdi+rcx], ax
-        add     rcx, 2
-A500:   cmp     ecx, -1
-        jg      A900                   ; nothing left
-        ; move 1 byte
-        movzx   eax, byte [rsi+rcx]
-        mov     [rdi+rcx], al
-A900:   ; finished
-        RETURNM
-
-B0100:  ; count >= 64
-        ; Calculate size of first block up to first regular boundary of dest
-        mov     edx, edi
-        neg     edx
-        and     edx, 0FH               ; edx = (-dest) modulo 16
-        jz      B0200                    ; Skip if dest aligned by 16
-
-        ; edx = size of first partial block, 1 - 15 bytes
-        test    dl, 3
-        jz      B0030                  ; Neither a 1-byte nor a 2-byte piece needed
-        test    dl, 1
-        jz      B0020
-        ; move 1 byte
-        movzx   eax, byte [rsi]
-        mov     [rdi], al
-        inc     rsi
-        inc     rdi
-B0020:  test    dl, 2
-        jz      B0030
-        ; move 2 bytes
-        movzx   eax, word [rsi]
-        mov     [rdi], ax
-        add     rsi, 2
-        add     rdi, 2
-B0030:  test    dl, 4
-        jz      B0040
-        ; move 4 bytes
-        mov     eax, [rsi]
-        mov     [rdi], eax
-        add     rsi, 4
-        add     rdi, 4
-B0040:  test    dl, 8
-        jz      B0050
-        ; move 8 bytes
-        mov     rax, [rsi]
-        mov     [rdi], rax
-        add     rsi, 8
-        add     rdi, 8
-B0050:  sub     rcx, rdx               ; Count the bytes of the partial block as done
-B0200:  ; Now dest is aligned by 16. Any partial block has been moved
-
-        ; This part will not always work if count < 64
-        ; Calculate size of first block up to first regular boundary of dest
-        mov     edx, edi
-        neg     edx
-        and     edx, 0FH
-        jz      B300                    ; Skip if dest aligned by 16. NOTE(review): dest was aligned above, so this looks always taken and the code down to B300 unreachable - confirm
-
-        ; rdx = size of first partial block, 1 - 15 bytes
-        add     rsi, rdx
-        add     rdi, rdx
-        sub     rcx, rdx
-        neg     rdx                    ; negative index from the aligned position
-        cmp     edx, -8
-        jg      B200
-        ; move 8 bytes
-        mov     rax, [rsi+rdx]
-        mov     [rdi+rdx], rax
-        add     rdx, 8
-B200:   cmp     edx, -4
-        jg      B210
-        ; move 4 bytes
-        mov     eax, [rsi+rdx]
-        mov     [rdi+rdx], eax
-        add     rdx, 4
-        jz      B300              ; early out if aligned by 4
-B210:   cmp     edx, -2
-        jg      B220
-        ; move 2 bytes
-        movzx   eax, word [rsi+rdx]
-        mov     [rdi+rdx], ax
-        add     rdx, 2
-B220:   cmp     edx, -1
-        jg      B300
-        ; move 1 byte
-        movzx   eax, byte [rsi+rdx]
-        mov     [rdi+rdx], al
-
-B300:   ; Now dest is aligned by 16. Any partial block has been moved
-        ; Find alignment of src modulo 16 at this point:
-        mov     eax, esi
-        and     eax, 0FH               ; eax = src misalignment, used as jump table index below
-
-        ; Set up for loop moving 32 bytes per iteration:
-        mov     edx, ecx               ; Save count (lower 32 bits)
-        and     rcx, -20H              ; Round down count to nearest multiple of 32
-        add     rsi, rcx               ; Point to the end
-        add     rdi, rcx               ; Point to the end
-        sub     edx, ecx               ; Remaining data after loop (0-31)
-        sub     rsi, rax               ; Nearest preceding aligned block of src
-
-        ; Check if count very big
-        cmp     rcx, [CacheBypassLimit]
-        ja      B400                   ; Use non-temporal store if count > CacheBypassLimit
-        neg     rcx                    ; Negative index from the end
-
-        ; Dispatch to different codes depending on src alignment
-        lea     r8, [AlignmentDispatchSSE2]
-        jmp     near [r8+rax*8]        ; Table entries are 8-byte code pointers (DQ)
-
-B400:   neg     rcx                    ; Negative index from the end
-        ; Dispatch to different codes depending on src alignment
-        lea     r8, [AlignmentDispatchNT]
-        jmp     near [r8+rax*8]        ; Non-temporal variant for the same src alignment
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;  Macros and alignment jump tables
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Macros for each src alignment, SSE2 instruction set:
-; Make separate code for each alignment u because the shift instructions
-; have the shift count as a constant:
-
-%MACRO  MOVE_UNALIGNED_SSE2  2 ; u, nt
-; Move rcx + rdx bytes of data
-; Source is misaligned. (src-dest) modulo 16 = %1
-; %2 = 1 if non-temporal store desired
-; eax = %1
-; rsi = src - %1 = nearest preceding 16-bytes boundary
-; rdi = dest (aligned)
-; rcx = - (count rounded down to nearest divisible by 32)
-; edx = remaining bytes to move after loop
-        movdqa  xmm0, [rsi+rcx]        ; Read from nearest preceding 16B boundary
-%%L1:  ; Loop. rcx has negative index from the end, counting up to zero
-        movdqa  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
-        movdqa  xmm2, [rsi+rcx+20H]
-        movdqa  xmm3, xmm1             ; Copy because used twice
-        psrldq  xmm0, %1               ; shift right
-        pslldq  xmm1, 16-%1            ; shift left
-        por     xmm0, xmm1             ; combine blocks
-        %IF %2 == 0
-        movdqa  [rdi+rcx], xmm0        ; Save aligned
-        %ELSE
-        movntdq [rdi+rcx], xmm0        ; non-temporal save
-        %ENDIF
-        movdqa  xmm0, xmm2             ; Save for next iteration
-        psrldq  xmm3, %1               ; shift right
-        pslldq  xmm2, 16-%1            ; shift left
-        por     xmm3, xmm2             ; combine blocks
-        %IF %2 == 0
-        movdqa  [rdi+rcx+10H], xmm3    ; Save aligned
-        %ELSE
-        movntdq [rdi+rcx+10H], xmm3    ; non-temporal save
-        %ENDIF
-        add     rcx, 20H               ; Loop through negative values up to zero
-        jnz     %%L1
-
-        ; Set up for edx remaining bytes
-        add     rsi, rdx
-        add     rdi, rdx
-        neg     rdx                    ; rdx = negative index from the new end
-        cmp     edx, -10H
-        jg      %%L2                   ; Fewer than 16 bytes remain
-        ; One more 16-bytes block to move
-        movdqa  xmm1, [rsi+rdx+10H]
-        psrldq  xmm0, %1               ; shift right
-        pslldq  xmm1, 16-%1            ; shift left
-        por     xmm0, xmm1             ; combine blocks
-        %IF %2 == 0
-        movdqa  [rdi+rdx], xmm0        ; Save aligned
-        %ELSE
-        movntdq [rdi+rdx], xmm0        ; non-temporal save
-        %ENDIF
-        add     rdx, 10H
-%%L2:   ; Get src pointer back to misaligned state
-        add     rsi, rax               ; Undo the earlier rsi -= rax (eax = %1)
-        %IF %2 == 1
-        sfence                         ; Fence the non-temporal stores before the ordinary tail stores
-        %ENDIF
-        ; Move remaining 0 - 15 bytes, unaligned
-        jmp     C200
-%ENDMACRO
-
-
-%MACRO  MOVE_UNALIGNED_SSE2_4  1 ; nt
-; Special case for u = 4
-; %1 = 1 if non-temporal store desired
-        movaps  xmm0, [rsi+rcx]        ; Read from nearest preceding 16B boundary
-%%L1:   ; Loop. rcx has negative index from the end, counting up to zero
-        movaps  xmm1, [rsi+rcx+10H]    ; Read next block aligned
-        movss   xmm0, xmm1             ; Moves 4 bytes, leaves remaining bytes unchanged
-        shufps  xmm0, xmm0, 00111001B  ; Rotate
-        %IF %1 == 0
-        movaps  [rdi+rcx], xmm0        ; Save aligned
-        %ELSE
-        movntps [rdi+rcx], xmm0        ; Non-temporal save
-        %ENDIF
-        movaps  xmm0, [rsi+rcx+20H]    ; Read the block after that; also kept for the next iteration
-        movss   xmm1, xmm0             ; Moves 4 bytes, leaves remaining bytes unchanged
-        shufps  xmm1, xmm1, 00111001B  ; Rotate
-        %IF %1 == 0
-        movaps  [rdi+rcx+10H], xmm1    ; Save aligned
-        %ELSE
-        movntps [rdi+rcx+10H], xmm1    ; Non-temporal save
-        %ENDIF
-        add     rcx, 20H               ; Loop through negative values up to zero
-        jnz     %%L1
-        ; Set up for edx remaining bytes
-        add     rsi, rdx
-        add     rdi, rdx
-        neg     rdx                    ; rdx = negative index from the new end
-        cmp     edx, -10H
-        jg      %%L2                   ; Fewer than 16 bytes remain
-        ; One more 16-bytes block to move
-        movaps  xmm1, [rsi+rdx+10H]    ; Read next block aligned
-        movss   xmm0, xmm1
-        shufps  xmm0, xmm0, 00111001B
-        %IF %1 == 0
-        movaps  [rdi+rdx], xmm0        ; Save aligned
-        %ELSE
-        movntps [rdi+rdx], xmm0        ; Non-temporal save
-        %ENDIF
-        add     rdx, 10H
-%%L2:   ; Get src pointer back to misaligned state
-        add     rsi, rax
-        %IF %1 == 1
-        sfence                         ; Fence the non-temporal stores before the ordinary tail stores
-        %ENDIF
-        ; Move remaining 0 - 15 bytes, unaligned
-        jmp     C200
-%ENDMACRO
-
-
-%MACRO  MOVE_UNALIGNED_SSE2_8  1 ; nt
-; Special case for u = 8
-; %1 = 1 if non-temporal store desired
-        movaps  xmm0, [rsi+rcx]        ; Read from nearest preceding 16B boundary
-%%L1:   ; Loop. rcx has negative index from the end, counting up to zero
-        movaps  xmm1, [rsi+rcx+10H]    ; Read next block aligned
-        movsd   xmm0, xmm1             ; Moves 8 bytes, leaves remaining bytes unchanged
-        shufps  xmm0, xmm0, 01001110B  ; Rotate
-        %IF %1 == 0
-        movaps  [rdi+rcx], xmm0        ; Save aligned
-        %ELSE
-        movntps [rdi+rcx], xmm0        ; Non-temporal save
-        %ENDIF
-        movaps  xmm0, [rsi+rcx+20H]    ; Read the block after that; also kept for the next iteration
-        movsd   xmm1, xmm0             ; Moves 8 bytes, leaves remaining bytes unchanged
-        shufps  xmm1, xmm1, 01001110B  ; Rotate
-        %IF %1 == 0
-        movaps  [rdi+rcx+10H], xmm1    ; Save aligned
-        %ELSE
-        movntps [rdi+rcx+10H], xmm1    ; Non-temporal save
-        %ENDIF
-        add     rcx, 20H               ; Loop through negative values up to zero
-        jnz     %%L1
-        ; Set up for edx remaining bytes
-        add     rsi, rdx
-        add     rdi, rdx
-        neg     rdx                    ; rdx = negative index from the new end
-        cmp     edx, -10H
-        jg      %%L2                   ; Fewer than 16 bytes remain
-        ; One more 16-bytes block to move
-        movaps  xmm1, [rsi+rdx+10H]    ; Read next block aligned
-        movsd   xmm0, xmm1
-        shufps  xmm0, xmm0, 01001110B
-        %IF %1 == 0
-        movaps  [rdi+rdx], xmm0        ; Save aligned
-        %ELSE
-        movntps [rdi+rdx], xmm0        ; Non-temporal save
-        %ENDIF
-        add     rdx, 10H
-%%L2:   ; Get src pointer back to misaligned state
-        add     rsi, rax
-        %IF %1 == 1
-        sfence                         ; Fence the non-temporal stores before the ordinary tail stores
-        %ENDIF
-        ; Move remaining 0 - 15 bytes, unaligned
-        jmp     C200
-%ENDMACRO
-
-
-%MACRO  MOVE_UNALIGNED_SSE2_12  1 ; nt
-; Special case for u = 12
-; %1 = 1 if non-temporal store desired
-        movaps  xmm0, [rsi+rcx]        ; Read from nearest preceding 16B boundary
-        shufps  xmm0, xmm0, 10010011B  ; Rotate (same shuffle is applied to every block read below)
-%%L1:   ; Loop. rcx has negative index from the end, counting up to zero
-        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
-        movaps  xmm2, [rsi+rcx+20H]
-        shufps  xmm1, xmm1, 10010011B
-        shufps  xmm2, xmm2, 10010011B
-        movaps  xmm3, xmm2             ; Keep rotated copy for the next iteration
-        movss   xmm2, xmm1             ; Moves 4 bytes, leaves remaining bytes unchanged
-        movss   xmm1, xmm0             ; Moves 4 bytes, leaves remaining bytes unchanged
-        %IF %1 == 0
-        movaps  [rdi+rcx], xmm1        ; Save aligned
-        movaps  [rdi+rcx+10H], xmm2    ; Save aligned
-        %ELSE
-        movntps [rdi+rcx], xmm1        ; Non-temporal save
-        movntps [rdi+rcx+10H], xmm2    ; Non-temporal save
-        %ENDIF
-        movaps  xmm0, xmm3             ; Save for next iteration
-        add     rcx, 20H               ; Loop through negative values up to zero
-        jnz     %%L1
-        ; Set up for edx remaining bytes
-        add     rsi, rdx
-        add     rdi, rdx
-        neg     rdx                    ; rdx = negative index from the new end
-        cmp     edx, -10H
-        jg      %%L2                   ; Fewer than 16 bytes remain
-        ; One more 16-bytes block to move
-        movaps  xmm1, [rsi+rdx+10H]    ; Read next block aligned
-        shufps  xmm1, xmm1, 10010011B
-        movss   xmm1, xmm0             ; Moves 4 bytes, leaves remaining bytes unchanged
-        %IF %1 == 0
-        movaps  [rdi+rdx], xmm1        ; Save aligned
-        %ELSE
-        movntps [rdi+rdx], xmm1        ; Non-temporal save
-        %ENDIF
-        add     rdx, 10H
-%%L2:   ; Get src pointer back to misaligned state
-        add     rsi, rax
-        %IF %1 == 1
-        sfence                         ; Fence the non-temporal stores before the ordinary tail stores
-        %ENDIF
-        ; Move remaining 0 - 15 bytes, unaligned
-        jmp     C200
-%ENDMACRO
-
-
-; Macros for each src alignment, Suppl.SSE3 instruction set:
-; Make separate code for each alignment u because the palignr instruction
-; has the shift count as a constant:
-
-%MACRO MOVE_UNALIGNED_SSSE3  1 ; u
-; Move rcx + rdx bytes of data
-; Source is misaligned. (src-dest) modulo 16 = %1
-; eax = %1
-; rsi = src - %1 = nearest preceding 16-bytes boundary
-; rdi = dest (aligned)
-; rcx = - (count rounded down to nearest divisible by 32)
-; edx = remaining bytes to move after loop
-        movdqa  xmm0, [rsi+rcx]        ; Read from nearest preceding 16B boundary
-
-%%L1:   ; Loop. rcx has negative index from the end, counting up to zero
-        movdqa  xmm2, [rsi+rcx+10H]    ; Read next two blocks
-        movdqa  xmm3, [rsi+rcx+20H]
-        movdqa  xmm1, xmm0             ; Save xmm0
-        movdqa  xmm0, xmm3             ; Save for next iteration
-        palignr xmm3, xmm2, %1         ; Combine parts into aligned block
-        palignr xmm2, xmm1, %1         ; Combine parts into aligned block
-        movdqa  [rdi+rcx], xmm2        ; Save aligned
-        movdqa  [rdi+rcx+10H], xmm3    ; Save aligned
-        add     rcx, 20H               ; Loop through negative values up to zero
-        jnz     %%L1
-
-        ; Set up for edx remaining bytes
-        add     rsi, rdx
-        add     rdi, rdx
-        neg     rdx                    ; rdx = negative index from the new end
-        cmp     edx, -10H
-        jg      %%L2                   ; Fewer than 16 bytes remain
-        ; One more 16-bytes block to move
-        movdqa  xmm2, [rsi+rdx+10H]    ; Read next block aligned
-        palignr xmm2, xmm0, %1         ; Combine with the block saved from the loop
-        movdqa  [rdi+rdx], xmm2        ; Save aligned
-        add     rdx, 10H
-%%L2:   ; Get src pointer back to misaligned state
-        add     rsi, rax               ; Undo the earlier rsi -= rax (eax = %1)
-        ; Move remaining 0 - 15 bytes
-        jmp     C200
-%ENDMACRO
-
-
-; Make 15 instances of the SSE2 macros, one for each nonzero value of the alignment u.
-; These are pointed to by the jump table AlignmentDispatchSSE2 below
-; (alignments and fillers are inserted manually to minimize the number
-; of 16-bytes boundaries inside loops)
-
-align 16
-D104:   MOVE_UNALIGNED_SSE2_4    0     ; u = 4, special-case macro, normal stores
-times 4 nop                            ; manual filler
-D108:   MOVE_UNALIGNED_SSE2_8    0     ; u = 8, special-case macro
-times 4 nop                            ; manual filler
-D10C:   MOVE_UNALIGNED_SSE2_12   0     ; u = 12, special-case macro
-times 1 nop                            ; manual filler
-D101:   MOVE_UNALIGNED_SSE2 1,   0     ; remaining alignments use the generic shift macro
-D102:   MOVE_UNALIGNED_SSE2 2,   0
-D103:   MOVE_UNALIGNED_SSE2 3,   0
-D105:   MOVE_UNALIGNED_SSE2 5,   0
-D106:   MOVE_UNALIGNED_SSE2 6,   0
-D107:   MOVE_UNALIGNED_SSE2 7,   0
-D109:   MOVE_UNALIGNED_SSE2 9,   0
-times 1 nop                            ; manual filler
-D10A:   MOVE_UNALIGNED_SSE2 0AH, 0
-D10B:   MOVE_UNALIGNED_SSE2 0BH, 0
-D10D:   MOVE_UNALIGNED_SSE2 0DH, 0
-D10E:   MOVE_UNALIGNED_SSE2 0EH, 0
-D10F:   MOVE_UNALIGNED_SSE2 0FH, 0
-
-; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
-; These are pointed to by the jump table AlignmentDispatchSSSE3 below
-
-align   16
-E104:   MOVE_UNALIGNED_SSSE3 4
-E108:   MOVE_UNALIGNED_SSSE3 8
-E10C:   MOVE_UNALIGNED_SSSE3 0CH
-E101:   MOVE_UNALIGNED_SSSE3 1
-E102:   MOVE_UNALIGNED_SSSE3 2
-E103:   MOVE_UNALIGNED_SSSE3 3
-E105:   MOVE_UNALIGNED_SSSE3 5
-E106:   MOVE_UNALIGNED_SSSE3 6
-E107:   MOVE_UNALIGNED_SSSE3 7
-E109:   MOVE_UNALIGNED_SSSE3 9
-times 1 nop                            ; manual filler
-E10A:   MOVE_UNALIGNED_SSSE3 0AH
-E10B:   MOVE_UNALIGNED_SSSE3 0BH
-E10D:   MOVE_UNALIGNED_SSSE3 0DH
-E10E:   MOVE_UNALIGNED_SSSE3 0EH
-E10F:   MOVE_UNALIGNED_SSSE3 0FH
-
-; Codes for non-temporal move. Aligned case first
-
-align 16
-F100:   ; Non-temporal move, src and dest have same alignment.
-        ; Loop. rcx has negative index from the end, counting up to zero
-        movaps  xmm0, [rsi+rcx]        ; Read
-        movaps  xmm1, [rsi+rcx+10H]
-        movntps [rdi+rcx], xmm0        ; Write non-temporal (bypass cache)
-        movntps [rdi+rcx+10H], xmm1
-        add     rcx, 20H
-        jnz     F100                   ; Loop through negative rcx up to zero
-
-        ; Move the remaining edx bytes (0 - 31):
-        add     rsi, rdx
-        add     rdi, rdx
-        neg     rdx                    ; rdx = negative index from the new end
-        jz      C500                   ; Skip if no more data
-        ; Check if we can move one more 16-bytes block
-        cmp     edx, -10H
-        jg      C200                   ; Fewer than 16 bytes left: go to the shared tail
-        ; move 16 bytes, aligned
-        movaps  xmm0, [rsi+rdx]
-        movntps [rdi+rdx], xmm0
-        add     rdx, 10H
-        sfence                         ; NOTE(review): the jz C500 / jg C200 exits above skip this fence, unlike %%L2 in the NT macros - confirm intended
-        ; move the remaining 0 - 15 bytes
-        jmp     C200
-
-; Make 15 instances of the MOVE_UNALIGNED_SSE2 macros (nt = 1, non-temporal
-; stores), one for each nonzero value of the alignment u.
-; These are pointed to by the jump table AlignmentDispatchNT below
-
-;align 16
-F104:   MOVE_UNALIGNED_SSE2_4    1     ; u = 4, special-case macro
-F108:   MOVE_UNALIGNED_SSE2_8    1     ; u = 8, special-case macro
-F10C:   MOVE_UNALIGNED_SSE2_12   1     ; u = 12, special-case macro
-F101:   MOVE_UNALIGNED_SSE2 1,   1     ; remaining alignments use the generic shift macro
-F102:   MOVE_UNALIGNED_SSE2 2,   1
-F103:   MOVE_UNALIGNED_SSE2 3,   1
-F105:   MOVE_UNALIGNED_SSE2 5,   1
-F106:   MOVE_UNALIGNED_SSE2 6,   1
-F107:   MOVE_UNALIGNED_SSE2 7,   1
-F109:   MOVE_UNALIGNED_SSE2 9,   1
-F10A:   MOVE_UNALIGNED_SSE2 0AH, 1
-F10B:   MOVE_UNALIGNED_SSE2 0BH, 1
-F10D:   MOVE_UNALIGNED_SSE2 0DH, 1
-F10E:   MOVE_UNALIGNED_SSE2 0EH, 1
-F10F:   MOVE_UNALIGNED_SSE2 0FH, 1
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;                   CPU dispatcher
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-memcpyCPUDispatch:   ; CPU dispatcher, check for instruction sets and which method is fastest
-        ; This part is executed only once
-        push    rbx                    ; rbx holds the chosen function pointer below
-        push    rcx                    ; Save registers that may hold the caller's arguments
-        push    rdx
-        push    rsi
-        push    rdi
-        push    r8
-        ; set CacheBypassLimit to half the size of the largest level cache
-        call    GetMemcpyCacheLimit@
-        mov     eax, 1
-        cpuid                          ; Get feature flags
-        lea     rbx, [memcpySSE2@]     ; Default choice: SSE2 version
-        bt      ecx, 9                 ; Test bit for SupplSSE3
-        jnc     Q100                   ; No Suppl-SSE3: keep SSE2 version
-        lea     rbx, [memcpySSSE3@]
-        call    UnalignedIsFaster      ; Test if unaligned read is faster than aligned read and shift
-        test    eax, eax
-        jz      Q100                   ; Not faster: keep Suppl-SSE3 version
-        lea     rbx, [memcpyU@]
-        call    Store256BitIsFaster    ; Test if 256-bit read/write is available and faster than 128-bit read/write
-        test    eax, eax
-        jz      Q100                   ; Not faster: keep unaligned 128-bit version
-        lea     rbx, [memcpyU256@]
-Q100:
-        ; Insert appropriate pointer
-        mov     [memcpyDispatch], rbx
-        mov     rax, rbx               ; Keep the target in rax while rbx is restored
-        pop     r8                     ; Restore in reverse order of the pushes
-        pop     rdi
-        pop     rsi
-        pop     rdx
-        pop     rcx
-        pop     rbx
-        ; Jump according to the replaced function pointer
-        jmp     rax
-
-; extern "C" size_t GetMemcpyCacheLimit();
-GetMemcpyCacheLimit:
-GetMemcpyCacheLimit@:  ; local label
-        mov     rax, [CacheBypassLimit]
-        test    rax, rax
-        jnz     U200                   ; Already set: return the stored value
-        ; Get half the size of the largest level cache
-%ifdef  WINDOWS
-        xor     ecx, ecx               ; 0 means largest level cache
-%else
-        xor     edi, edi               ; 0 means largest level cache
-%endif
-        call    DataCacheSize          ; get cache size
-        shr     rax, 1                 ; half the size
-        jnz     U100                   ; Nonzero result: use it
-        mov     eax, 400000H           ; cannot determine cache size. use 4 Mbytes
-U100:   mov     [CacheBypassLimit], rax ; Store the limit so later calls return it directly
-U200:   ret
-
-; Note: SetMemcpyCacheLimit is defined in memmove64.asm, calling SetMemcpyCacheLimit1
-SetMemcpyCacheLimit1:
-%ifdef  WINDOWS
-        mov     rax, rcx               ; New limit = first argument (Windows)
-%else
-        mov     rax, rdi               ; New limit = first argument (Unix)
-%endif
-        test    rax, rax
-        jnz     U400                   ; Nonzero: store it as given
-        ; zero, means default
-        mov     [CacheBypassLimit], rax ; Clear the stored limit so GetMemcpyCacheLimit@ recomputes it
-        call    GetMemcpyCacheLimit@
-U400:   mov     [CacheBypassLimit], rax
-        ret
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;                   getDispatch, for testing only
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-getDispatch:                           ; Returns the current value of the memcpyDispatch pointer in rax
-mov rax,[memcpyDispatch]
-ret
-
-global getDispatch
-
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;    data section. jump tables, dispatch function pointer, cache size
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Data segment must be included in function namespace
-SECTION .data
-align 16
-
-; Jump tables for alignments 0 - 15:
-; memcpySSE2 indexes AlignmentDispatchSSE2 and the Suppl-SSE3 code path
-; indexes AlignmentDispatchSSSE3; both switch to AlignmentDispatchNT
-; when the count exceeds CacheBypassLimit.
-
-; Code pointer for each alignment for SSE2 instruction set
-AlignmentDispatchSSE2:
-DQ C100, D101, D102, D103, D104, D105, D106, D107
-DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
-
-; Code pointer for each alignment for Suppl-SSE3 instruction set
-AlignmentDispatchSSSE3:
-DQ C100, E101, E102, E103, E104, E105, E106, E107
-DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
-
-; Code pointer for each alignment for non-temporal store
-AlignmentDispatchNT:
-DQ F100, F101, F102, F103, F104, F105, F106, F107
-DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
-
-; Pointer to appropriate version.
-; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
-; change this to the appropriate version of memcpy, so that
-; memcpyCPUDispatch is only executed once:
-memcpyDispatch DQ memcpyCPUDispatch
-
-; Bypass cache by using non-temporal moves if count > CacheBypassLimit
-; The optimal value of CacheBypassLimit is difficult to estimate, but
-; a reasonable value is half the size of the largest cache:
-CacheBypassLimit: DQ 0
diff --git a/contrib/libs/asmlib/memmove64.asm b/contrib/libs/asmlib/memmove64.asm
deleted file mode 100644
index 1c61032541..0000000000
--- a/contrib/libs/asmlib/memmove64.asm
+++ /dev/null
@@ -1,1090 +0,0 @@
-%include "defs.asm"
-
-;*************************  memmove64.asm  ***********************************
-; Author:           Agner Fog
-; Date created:     2008-07-18
-; Last modified:    2016-11-16 (patched version with AVX512 support removed)
-; Description:
-; Faster version of the standard memmove function:
-; void * A_memmove(void *dest, const void *src, size_t count);
-; Moves 'count' bytes from 'src' to 'dest'. src and dest may overlap.
-;
-; Overriding standard function memmove:
-; The alias ?OVR_memmove is changed to _memmove in the object file if
-; it is desired to override the standard library function memmove.
-;
-; CPU dispatching included for different CPUs
-;
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global A_memmove: function             ; Function A_memmove
-global EXP(memmove): function          ; ?OVR removed if standard function memmove overridden
-global memmoveSSE2: function           ; Version for processors with only SSE2
-global memmoveSSSE3: function          ; Version for processors with SSSE3
-global memmoveU: function              ; Version for processors with fast unaligned read
-global memmoveU256: function           ; Version for processors with fast 256-bit read/write
-global SetMemcpyCacheLimit             ; Change limit for bypassing cache
-
-; Imported from memcpy64.asm:
-extern A_memcpy                        ; function entry
-extern memcpySSE2                      ; CPU specific function entry
-extern memcpySSSE3                     ; CPU specific function entry
-extern memcpyU                         ; CPU specific function entry
-extern memcpyU256                      ; CPU specific function entry
-
-; Imported from instrset64.asm
-extern InstructionSet                  ; Instruction set for CPU dispatcher
-
-; Imported from unalignedisfaster64.asm:
-extern UnalignedIsFaster               ; Tells if unaligned read is faster than PALIGNR
-extern Store256BitIsFaster             ; Tells if a 256 bit store is faster than two 128 bit stores
-
-; Imported from memcpy64.asm
-extern GetMemcpyCacheLimit             ; Get the size limit for bypassing cache when copying with memcpy and memmove
-extern SetMemcpyCacheLimit1            ; Set the size limit for bypassing cache when copying with memcpy
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;      Prolog macro. Determine if we should move forwards or backwards
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Define prolog for this function. Falls through only when a backward move is needed, with rcx = count, r9 = dest
-; Parameter 1 is forward function label (jumped to with registers untouched when moving forwards is safe)
-%MACRO  PROLOGM  1
-%IFDEF  WINDOWS
-        ; Check if dest overlaps src
-        mov     rax, rcx
-        sub     rax, rdx               ; rax = dest - src
-        cmp     rax, r8
-        ; We can avoid testing for dest < src by using unsigned compare:
-        ; (Assume that the memory block cannot span across address 0)
-        ; Must move backwards if unsigned(dest-src) < count
-        jae     %1                     ; Jump to memcpy if we can move forwards
-        push    rsi                    ; Saved here, restored by RETURNM
-        push    rdi
-        mov     rdi, rcx               ; dest
-        mov     r9,  rcx               ; dest
-        mov     rsi, rdx               ; src
-        mov     rcx, r8                ; count
-%ELSE   ; Unix
-        ; Check if dest overlaps src
-        mov     rax, rdi
-        sub     rax, rsi               ; rax = dest - src
-        cmp     rax, rdx
-        ; Must move backwards if unsigned(dest-src) < count
-        jae     %1                     ; Jump to memcpy if we can move forwards
-        mov     rcx, rdx               ; count
-        mov     r9,  rdi               ; dest
-%ENDIF
-%ENDM
-
-
-; Define return from this function (counterpart of PROLOGM: r9 holds dest)
-%MACRO  RETURNM  0
-%IFDEF  WINDOWS
-        pop     rdi                    ; Restore the registers pushed by PROLOGM
-        pop     rsi
-%ENDIF
-        mov     rax, r9                ; Return value = dest
-        ret
-%ENDMACRO
-
-
-SECTION .text  align=16
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;                          Common entry for dispatch
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; extern "C" void * A_memmove(void * dest, const void * src, size_t count);
-; Function entry (both labels share the same indirect jump):
-A_memmove:
-EXP(memmove):
-        jmp     qword [memmoveDispatch] ; Go to appropriate version, depending on instruction set
-
-        
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; AVX Version for processors with fast unaligned read and fast 32 bytes write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memmoveU256:   ; Version for processors with fast 256-bit read/write
-memmoveU256@:  ; local label
-        PROLOGM memcpyU256
-      
-        cmp     rcx, 40H
-        jb      A1000                    ; Use simpler code if count < 64
-        
-        ; count >= 64
-        ; Note: this part will not always work if count < 64
-        ; Calculate size of last block after last regular boundary of dest
-        lea     edx, [rdi+rcx]          ; end of dext
-        and     edx, 1FH
-        jz      B4300                   ; Skip if end of dest aligned by 32
-        
-        ; edx = size of last partial block, 1 - 31 bytes
-        test    dl, 3
-        jz      B4210
-        test    dl, 1
-        jz      B4201      ; B4200 if we haven't tested edx,3
-        ; move 1 byte
-        dec     rcx
-        movzx   eax, byte [rsi+rcx]
-        mov     [rdi+rcx], al        
-B4200:  test    dl, 2
-        jz      B4210
-B4201:  ; move 2 bytes
-        sub     rcx, 2
-        movzx   eax, word [rsi+rcx]
-        mov     [rdi+rcx], ax        
-B4210:  test    dl, 4
-        jz      B4220
-        ; move 4 bytes
-        sub     rcx, 4
-        mov     eax, [rsi+rcx]
-        mov     [rdi+rcx], eax
-B4220:  test    dl, 8
-        jz      B4230
-        ; move 8 bytes
-        sub     rcx, 8
-        mov     rax, [rsi+rcx]
-        mov     [rdi+rcx], rax
-B4230:  test    dl, 16
-        jz      B4300
-        ; move 16 bytes
-        sub     rcx, 16
-        movups  xmm0, [rsi+rcx]
-        movaps  [rdi+rcx], xmm0
-        
-B4300:  ; Now end of dest is aligned by 32. Any partial block has been moved        
-        mov     rdx, rcx
-        and     ecx, 1FH              ; remaining size after 32 bytes blocks moved
-        and     rdx, -20H             ; number of 32 bytes blocks
-        jz      H4100
-        add     rsi, rcx
-        add     rdi, rcx
-        
-        ; Check if count very big
-        cmp     rdx, [CacheBypassLimit]
-        ja      H4800                   ; Use non-temporal store if count > _CacheBypassLimit
-
-align   16 
-H4000:  ; 32 bytes move loop
-        vmovups  ymm0, [rsi+rdx-20H]
-        vmovaps  [rdi+rdx-20H], ymm0
-        sub      rdx, 20H
-        jnz      H4000
-        vzeroupper
-        
-H4090:  sub      rsi, rcx
-        sub      rdi, rcx
-
-H4100:  ; remaining 0-31 bytes
-        test    ecx, ecx
-        jz      H4600        
-        test    cl, 10H
-        jz      H4200
-        ; move 16 bytes
-        sub     ecx, 10H
-        movups  xmm0, [rsi+rcx]
-        movaps  [rdi+rcx], xmm0
-        jz      H4600                     ; early out if count divisible by 16
-H4200:  test    cl, 8
-        jz      H4300
-        ; move 8 bytes
-        sub     ecx, 8
-        mov     rax, [rsi+rcx]
-        mov     [rdi+rcx], rax
-H4300:  test    cl, 4
-        jz      H4400
-        ; move 4 bytes
-        sub     ecx, 4
-        mov     eax, [rsi+rcx]
-        mov     [rdi+rcx], eax
-        jz      H4600                     ; early out if count divisible by 4
-H4400:  test    cl, 2
-        jz      H4500
-        ; move 2 bytes
-        sub     ecx, 2
-        movzx   eax, word [rsi+rcx]
-        mov     [rdi+rcx], ax
-H4500:  test    cl, 1
-        jz      H4600
-        ; move 1 byte
-        movzx   eax, byte [rsi]   ; rcx-1 = 0
-        mov     [rdi], al
-H4600:  ; finished
-        RETURNM
-
-align 16
-H4800:  ; 32 bytes move loop, bypass cache
-        vmovups  ymm0, [rsi+rdx-20H]
-        vmovntps [rdi+rdx-20H], ymm0
-        sub      rdx, 20H
-        jnz      H4800        
-        sfence
-        vzeroupper
-        jmp      H4090
-        
-A1000:  ; count < 64. Move 32-16-8-4-2-1 bytes
-        ; Shared small-count path: memmoveU and memmoveSSSE3 both branch here
-        ; when count < 64. Each set bit of the count in rcx selects one chunk.
-        ; rcx is reduced by the chunk size first, so [rsi+rcx] / [rdi+rcx]
-        ; address the highest bytes not yet moved: the copy runs from the end
-        ; of the buffers towards the start.
-        ; Assumes rsi = src, rdi = dest, rcx = count on entry (set up by
-        ; PROLOGM, which is defined outside this excerpt) - TODO confirm.
-        test    cl, 20H
-        jz      A1100
-        ; move 32 bytes
-        ; movups is faster on processors with SSSE3
-        sub     ecx, 20H
-        movups     xmm0, [rsi+rcx+10H]     ; both halves are loaded ...
-        movups     xmm1, [rsi+rcx]
-        movups     [rdi+rcx+10H], xmm0     ; ... before either half is stored
-        movups     [rdi+rcx], xmm1
-A1100:  test    cl, 10H
-        jz      A1200
-        ; move 16 bytes
-        sub     ecx, 10H
-        movups     xmm0, [rsi+rcx]
-        movups     [rdi+rcx], xmm0
-A1200:  test    cl, 8
-        jz      A1300
-        ; move 8 bytes
-        sub     ecx, 8
-        mov     rax, [rsi+rcx]
-        mov     [rdi+rcx], rax
-A1300:  test    cl, 4
-        jz      A1400
-        ; move 4 bytes
-        sub     ecx, 4
-        mov     eax, [rsi+rcx]
-        mov     [rdi+rcx], eax
-        ; ZF below is still the result of "sub ecx, 4": mov does not change flags
-        jz      A1900                     ; early out if count divisible by 4
-A1400:  test    cl, 2
-        jz      A1500
-        ; move 2 bytes
-        sub     ecx, 2
-        movzx   eax, word [rsi+rcx]
-        mov     [rdi+rcx], ax
-A1500:  test    cl, 1
-        jz      A1900
-        ; move 1 byte
-        ; Only bit 0 of the count is left here, so the last byte is at offset 0
-        movzx   eax, byte [rsi]   ; rcx-1 = 0
-        mov     [rdi], al
-A1900:  ; finished
-        RETURNM
-        
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;  Version for processors with fast unaligned read and fast 16 bytes write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-                
-align 16
-memmoveU:  ; Version for processors with fast unaligned read
-memmoveU@: ; local label
-        ; Backward block move using unaligned 16-byte loads and aligned 16-byte
-        ; stores. memmoveCPUDispatch selects this entry when UnalignedIsFaster
-        ; returns nonzero and Store256BitIsFaster returns zero.
-        ; PROLOGM is defined outside this excerpt; the code below assumes it
-        ; leaves rsi = src, rdi = dest, rcx = count - TODO confirm.
-        PROLOGM memcpyU
-      
-        cmp     rcx, 40H
-        jb      A1000                    ; Use simpler code if count < 64
-        
-        ; count >= 64
-        ; Note: this part will not always work if count < 64
-        ; Calculate size of last block after last regular boundary of dest
-        lea     edx, [rdi+rcx]          ; end of dest (only the low 4 bits are used)
-        and     edx, 0FH
-        jz      B3300                   ; Skip if end of dest aligned by 16
-        
-        ; edx = size of last partial block, 1 - 15 bytes
-        ; Move that partial block in 1/2/4/8-byte pieces, highest address first,
-        ; lowering rcx as we go so that dest+rcx ends up 16-byte aligned.
-        test    dl, 3
-        jz      B3210
-        test    dl, 1
-        jz      B3201      ; B3200 if we haven't tested edx,3
-        ; move 1 byte
-        dec     rcx
-        movzx   eax, byte [rsi+rcx]
-        mov     [rdi+rcx], al        
-B3200:  test    dl, 2
-        jz      B3210
-B3201:  ; move 2 bytes
-        sub     rcx, 2
-        movzx   eax, word [rsi+rcx]
-        mov     [rdi+rcx], ax        
-B3210:  test    dl, 4
-        jz      B3220
-        ; move 4 bytes
-        sub     rcx, 4
-        mov     eax, [rsi+rcx]
-        mov     [rdi+rcx], eax
-B3220:  test    dl, 8
-        jz      B3300
-        ; move 8 bytes
-        sub     rcx, 8
-        mov     rax, [rsi+rcx]
-        mov     [rdi+rcx], rax        
-        
-B3300:  ; Now end of dest is aligned by 16. Any partial block has been moved        
-        ; Split the rest: rdx = bytes handled by the 32-byte loop,
-        ; ecx = 0-31 bytes left for the tail code at H1100.
-        mov     rdx, rcx
-        and     ecx, 1FH              ; remaining size after 32 bytes blocks moved
-        and     rdx, -20H             ; number of bytes in whole 32 bytes blocks
-        jz      H1100
-        ; Bias both pointers by the tail size so the loop can index with rdx
-        ; alone; the bias is removed again at H1090.
-        add     rsi, rcx
-        add     rdi, rcx
-        
-        ; Check if count very big
-        cmp     rdx, [CacheBypassLimit] ; 8-byte read of CacheBypassLimit
-        ja      H1800                   ; Use non-temporal store if count > _CacheBypassLimit
-
-align   16    ; minimize 16-bytes boundaries in H1000 loop
-H1000:  ; 32 bytes move loop
-        ; Both loads precede both stores; stores are aligned (movaps)
-        movups   xmm1, [rsi+rdx-20H]
-        movups   xmm0, [rsi+rdx-10H]
-        movaps   [rdi+rdx-20H], xmm1
-        movaps   [rdi+rdx-10H], xmm0
-        sub      rdx, 20H
-        jnz      H1000
-        
-H1090:  sub      rsi, rcx             ; undo the pointer bias applied before the loop
-        sub      rdi, rcx
-
-H1100:  ; remaining 0-31 bytes
-        test    ecx, ecx
-        jz      H1600        
-        test    cl, 10H
-        jz      H1200
-        ; move 16 bytes
-        sub     ecx, 10H
-        movups  xmm0, [rsi+rcx]
-        movaps  [rdi+rcx], xmm0
-        ; ZF is still from "sub ecx, 10H": the SSE moves do not change flags
-        jz      H1600                     ; early out if count divisible by 16
-H1200:  test    cl, 8
-        jz      H1300
-        ; move 8 bytes
-        sub     ecx, 8
-        mov     rax, [rsi+rcx]
-        mov     [rdi+rcx], rax
-H1300:  test    cl, 4
-        jz      H1400
-        ; move 4 bytes
-        sub     ecx, 4
-        mov     eax, [rsi+rcx]
-        mov     [rdi+rcx], eax
-        jz      H1600                     ; early out if count divisible by 4
-H1400:  test    cl, 2
-        jz      H1500
-        ; move 2 bytes
-        sub     ecx, 2
-        movzx   eax, word [rsi+rcx]
-        mov     [rdi+rcx], ax
-H1500:  test    cl, 1
-        jz      H1600
-        ; move 1 byte
-        movzx   eax, byte [rsi]   ; rcx-1 = 0
-        mov     [rdi], al
-H1600:  ; finished
-        RETURNM
-
-align 16
-H1800:  ; 32 bytes move loop, bypass cache
-        ; Same as H1000 but with non-temporal stores; sfence follows the loop
-        ; before control rejoins the common tail at H1090.
-        movups   xmm1, [rsi+rdx-20H]
-        movups   xmm0, [rsi+rdx-10H]
-        movntps  [rdi+rdx-20H], xmm1
-        movntps  [rdi+rdx-10H], xmm0
-        sub      rdx, 20H
-        jnz      H1800        
-        sfence
-        jmp      H1090
-        
-        
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;  Version for processors with SSSE3. Aligned read + shift + aligned write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-        
-align 16
-memmoveSSSE3:   ; SSSE3 version begins here
-memmoveSSSE3@:  ; local label
-        ; Backward block move that aligns dest, then jumps through a table to
-        ; a copy loop specialised for the misalignment of src (0 - 15).
-        ; PROLOGM is defined outside this excerpt; the code below assumes it
-        ; leaves rsi = src, rdi = dest, rcx = count - TODO confirm.
-        PROLOGM memcpySSSE3
-
-        ; Cannot use memcpy. Must move backwards because of overlap between src and dest
-        cmp     rcx, 40H
-        jb      A1000                    ; Use simpler code if count < 64
-        ; count >= 64
-        ; Note: this part will not always work if count < 64
-        ; Calculate size of last block after last regular boundary of dest
-        lea     edx, [rdi+rcx]         ; end of dest (only the low 4 bits are used)
-        and     edx, 0FH
-        jz      B1300                   ; Skip if end of dest aligned by 16
-        
-        ; edx = size of last partial block, 1 - 15 bytes
-        ; Move it in 1/2/4/8-byte pieces, highest address first, lowering rcx
-        ; so that dest+rcx ends up 16-byte aligned.
-        test    dl, 3
-        jz      B1210
-        test    dl, 1
-        jz      B1201      ; B1200 if we haven't tested edx,3
-        ; move 1 byte
-        dec     rcx
-        movzx   eax, byte [rsi+rcx]
-        mov     [rdi+rcx], al        
-B1200:  test    dl, 2
-        jz      B1210
-B1201:  ; move 2 bytes
-        sub     rcx, 2
-        movzx   eax, word [rsi+rcx]
-        mov     [rdi+rcx], ax        
-B1210:  test    dl, 4
-        jz      B1220
-        ; move 4 bytes
-        sub     rcx, 4
-        mov     eax, [rsi+rcx]
-        mov     [rdi+rcx], eax
-B1220:  test    dl, 8
-        jz      B1300
-        ; move 8 bytes
-        sub     rcx, 8
-        mov     rax, [rsi+rcx]
-        mov     [rdi+rcx], rax
-              
-B1300:  ; Now end of dest is aligned by 16. Any partial block has been moved        
-        ; Find alignment of end of src modulo 16 at this point:
-        lea     eax, [rsi+rcx]
-        and     eax, 0FH               ; rax = 0 - 15, later used as jump table index
-        
-        ; Set up for loop moving 32 bytes per iteration:
-        mov     edx, ecx               ; Save count
-        and     rcx, -20H              ; Round down to nearest multiple of 32
-        sub     edx, ecx               ; Remaining data after loop (0 - 31)
-        sub     rsi, rax               ; Nearest preceding aligned block of src
-        ; Add the same to rsi and rdi as we have subtracted from rcx
-        add     rsi, rdx
-        add     rdi, rdx
-        
-        ; Check if count very big
-        cmp     rcx, [CacheBypassLimit] ; 8-byte read of CacheBypassLimit
-        ja      B1400                   ; Use non-temporal store if count > CacheBypassLimit
-        
-        ; Dispatch to different codes depending on src alignment
-        ; Table entries are 8-byte code pointers, hence the scale factor 8
-        lea     r8, [MAlignmentDispatchSSSE3]
-        jmp     near [r8+rax*8]
-
-B1400:  ; Dispatch to different codes depending on src alignment
-        ; Same index, but into the table of non-temporal-store variants
-        lea     r8, [MAlignmentDispatchNT]
-        jmp     near [r8+rax*8]
-        
-
-align   16
-C100:   ; Code for aligned src. SSE2 and later CPUs
-        ; The nice case, src and dest have same alignment.
-        ; Entry 0 of both MAlignmentDispatchSSE2 and MAlignmentDispatchSSSE3.
-        ; On entry: rcx = byte count of whole 32-byte blocks, edx = remaining
-        ; bytes (0 - 31), rsi/rdi already advanced by edx.
-
-        ; Loop. rcx has positive index from the beginning, counting down to zero
-        movaps  xmm0, [rsi+rcx-10H]
-        movaps  xmm1, [rsi+rcx-20H]
-        movaps  [rdi+rcx-10H], xmm0
-        movaps  [rdi+rcx-20H], xmm1
-        sub     rcx, 20H
-        jnz     C100
-        
-        ; Move the remaining edx bytes (0 - 31):
-        ; move 16-8-4-2-1 bytes, aligned
-        ; From here on rcx is zero or negative and serves as an offset below
-        ; the (biased) rsi/rdi.
-        test    edx, edx
-        jz      C500                   ; Early out if no more data
-        test    dl, 10H
-        jz      C200
-        ; move 16 bytes
-        sub     rcx, 10H
-        movaps  xmm0, [rsi+rcx]
-        movaps  [rdi+rcx], xmm0
-        
-C200:   ; Other branches come in here, rcx may contain arbitrary offset
-        ; Common tail for the alignment-specific loops: they jump here after
-        ; handling any 16-byte remainder and restoring rsi to the true source.
-        test    edx, edx
-        jz      C500                   ; Early out if no more data
-        test    dl, 8
-        jz      C210        
-        ; move 8 bytes
-        sub     rcx, 8 
-        mov     rax, [rsi+rcx]
-        mov     [rdi+rcx], rax
-C210:   test    dl, 4
-        jz      C220        
-        ; move 4 bytes
-        sub     rcx, 4        
-        mov     eax, [rsi+rcx]
-        mov     [rdi+rcx], eax
-        ; NOTE(review): ZF here reflects "sub rcx, 4", i.e. the offset in rcx,
-        ; not the count in edx. On the paths visible in this excerpt rcx is
-        ; <= 0 before the subtraction, so this branch looks never taken - confirm.
-        jz      C500                   ; Early out if count divisible by 4
-C220:   test    dl, 2
-        jz      C230        
-        ; move 2 bytes
-        sub     rcx, 2
-        movzx   eax, word [rsi+rcx]
-        mov     [rdi+rcx], ax
-C230:   test    dl, 1
-        jz      C500        
-        ; move 1 byte
-        movzx   eax, byte [rsi+rcx-1]   ; rcx-1 is not always 0 here
-        mov     [rdi+rcx-1], al
-C500:   ; finished     
-        RETURNM
-        
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;  Version for processors with SSE2. Aligned read + shift + aligned write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-        
-memmoveSSE2:   ; SSE2 version begins here
-memmoveSSE2@:  ; local label
-        ; Fallback entry: memmoveCPUDispatch keeps this one when the SSSE3
-        ; feature bit is not set.
-        ; PROLOGM is defined outside this excerpt; the code below assumes it
-        ; leaves rsi = src, rdi = dest, rcx = count - TODO confirm.
-        PROLOGM  memcpySSE2
-
-        ; Cannot use memcpy. Must move backwards because of overlap between src and dest
-        cmp     rcx, 40H
-        jae     B0100                    ; count >= 64: use the aligned block-move code
-        
-        ; count < 64. Move 32-16-8-4-2-1 bytes
-        ; Each set bit of the count selects one chunk; rcx is reduced first so
-        ; [rsi+rcx] / [rdi+rcx] address the highest bytes not yet moved.
-        test    cl, 20H
-        jz      A100
-        ; move 32 bytes
-        ; mov is faster than movdqu on SSE2 processors,
-        ; movdqu is faster on later processors
-        sub     ecx, 20H
-        mov     rax, [rsi+rcx+18H]
-        mov     rdx, [rsi+rcx+10H]
-        mov     [rdi+rcx+18H], rax
-        mov     [rdi+rcx+10H], rdx
-        mov     rax, [rsi+rcx+8]
-        mov     rdx, [rsi+rcx]
-        mov     [rdi+rcx+8], rax
-        mov     [rdi+rcx], rdx
-A100:   test    cl, 10H
-        jz      A200
-        ; move 16 bytes
-        sub     ecx, 10H
-        mov     rax, [rsi+rcx+8]
-        mov     rdx, [rsi+rcx]
-        mov     [rdi+rcx+8], rax
-        mov     [rdi+rcx], rdx
-A200:   test    cl, 8
-        jz      A300
-        ; move 8 bytes
-        sub     ecx, 8
-        mov     rax, [rsi+rcx]
-        mov     [rdi+rcx], rax
-A300:   test    cl, 4
-        jz      A400
-        ; move 4 bytes
-        sub     ecx, 4
-        mov     eax, [rsi+rcx]
-        mov     [rdi+rcx], eax
-        ; ZF is still from "sub ecx, 4": mov does not change flags
-        jz      A900                     ; early out if count divisible by 4
-A400:   test    cl, 2
-        jz      A500
-        ; move 2 bytes
-        sub     ecx, 2
-        movzx   eax, word [rsi+rcx]
-        mov     [rdi+rcx], ax
-A500:   test    cl, 1
-        jz      A900
-        ; move 1 byte
-        movzx   eax, byte [rsi]       ; rcx-1 = 0
-        mov     [rdi], al
-A900:   ; finished
-        RETURNM
-        
-B0100:  ; count >= 64
-        ; Note: this part will not always work if count < 64
-        ; Calculate size of last block after last regular boundary of dest
-        lea     edx, [rdi+rcx]         ; end of dest (only the low 4 bits are used)
-        and     edx, 0FH
-        jz      B0300                   ; Skip if end of dest aligned by 16
-        
-        ; edx = size of last partial block, 1 - 15 bytes
-        ; Move it in 1/2/4/8-byte pieces, highest address first, lowering rcx
-        ; so that dest+rcx ends up 16-byte aligned.
-        test    dl, 3
-        jz      B0210
-        test    dl, 1
-        jz      B0201      ; B0200 if we haven't tested edx,3
-        ; move 1 byte
-        dec     rcx
-        movzx   eax, byte [rsi+rcx]
-        mov     [rdi+rcx], al        
-B0200:  test    dl, 2
-        jz      B0210
-B0201:  ; move 2 bytes
-        sub     rcx, 2
-        movzx   eax, word [rsi+rcx]
-        mov     [rdi+rcx], ax        
-B0210:  test    dl, 4
-        jz      B0220
-        ; move 4 bytes
-        sub     rcx, 4
-        mov     eax, [rsi+rcx]
-        mov     [rdi+rcx], eax
-B0220:  test    dl, 8
-        jz      B0300
-        ; move 8 bytes
-        sub     rcx, 8
-        mov     rax, [rsi+rcx]
-        mov     [rdi+rcx], rax
-              
-B0300:  ; Now end of dest is aligned by 16. Any partial block has been moved        
-        ; Find alignment of end of src modulo 16 at this point:
-        lea     eax, [rsi+rcx]
-        and     eax, 0FH               ; rax = 0 - 15, later used as jump table index
-        
-        ; Set up for loop moving 32 bytes per iteration:
-        mov     edx, ecx               ; Save count
-        and     rcx, -20H              ; Round down to nearest multiple of 32
-        sub     edx, ecx               ; Remaining data after loop (0 - 31)
-        sub     rsi, rax               ; Nearest preceding aligned block of src
-        ; Add the same to rsi and rdi as we have subtracted from rcx
-        add     rsi, rdx
-        add     rdi, rdx
-        
-        ; Check if count very big
-        cmp     rcx, [CacheBypassLimit] ; 8-byte read of CacheBypassLimit
-        ja      B0400                   ; Use non-temporal store if count > CacheBypassLimit
-        
-        ; Dispatch to different codes depending on src alignment
-        ; Table entries are 8-byte code pointers, hence the scale factor 8
-        lea     r8, [MAlignmentDispatchSSE2]
-        jmp     near [r8+rax*8]
-
-B0400:   ; Dispatch to different codes depending on src alignment
-        ; Same index, but into the table of non-temporal-store variants
-        lea     r8, [MAlignmentDispatchNT]
-        jmp     near [r8+rax*8]
-        
-        
-        
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;  Macros and alignment jump tables
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Macros for each src alignment, SSE2 instruction set:
-; Make separate code for each alignment u because the shift instructions
-; have the shift count as a constant:
-
-%MACRO MOVE_REVERSE_UNALIGNED_SSE2  2 ; u, nt
-; Move rcx + rdx bytes of data
-; Source is misaligned. (src-dest) modulo 16 = %1
-; %2 = 1 if non-temporal store desired
-; eax = %1
-; rsi = src - %1 = nearest preceding 16-bytes boundary
-; rdi = dest (aligned)
-; rcx = count rounded down to nearest divisible by 32
-; edx = remaining bytes to move after loop
-; Each output block is built from two neighbouring aligned source blocks:
-; the higher one shifted left by 16-%1 bytes OR the lower one shifted right
-; by %1 bytes. xmm0 carries the higher block from one step to the next.
-        movdqa  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary        
-%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
-        sub     rcx, 20H               ; sets ZF for the jnz at the bottom of the loop
-        movdqa  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
-        movdqa  xmm2, [rsi+rcx]
-        movdqa  xmm3, xmm1             ; Copy because used twice
-        pslldq  xmm0, 16-%1            ; shift left
-        psrldq  xmm1, %1               ; shift right
-        por     xmm0, xmm1             ; combine blocks
-        %IF %2 == 0
-        movdqa  [rdi+rcx+10H], xmm0    ; Save aligned
-        %ELSE
-        movntdq [rdi+rcx+10H], xmm0    ; Save aligned, non-temporal
-        %ENDIF
-        movdqa  xmm0, xmm2             ; Save for next iteration
-        pslldq  xmm3, 16-%1            ; shift left
-        psrldq  xmm2, %1                ; shift right
-        por     xmm3, xmm2             ; combine blocks
-        %IF %2 == 0
-        movdqa  [rdi+rcx], xmm3        ; Save aligned
-        %ELSE
-        movntdq [rdi+rcx], xmm3        ; Save aligned, non-temporal
-        %ENDIF
-        ; None of the SSE instructions above change flags, so ZF is still the
-        ; result of "sub rcx, 20H"
-        jnz     %%L1
-                
-        ; Move edx remaining bytes
-        test    dl, 10H
-        jz      %%L2
-        ; One more 16-bytes block to move
-        sub     rcx, 10H
-        movdqa  xmm1, [rsi+rcx]
-        pslldq  xmm0, 16-%1            ; shift left
-        psrldq  xmm1, %1               ; shift right
-        por     xmm0, xmm1             ; combine blocks
-        %IF %2 == 0
-        movdqa  [rdi+rcx], xmm0        ; Save aligned
-        %ELSE
-        movntdq [rdi+rcx], xmm0        ; Save aligned, non-temporal
-        %ENDIF        
-%%L2:   ; Get src pointer back to misaligned state
-        add     rsi, rax
-        %IF %2 == 1
-        sfence                         ; reached on both paths into %%L2
-        %ENDIF
-        ; Move remaining 0 - 15 bytes, unaligned
-        jmp     C200
-%ENDMACRO
-
-
-%MACRO  MOVE_REVERSE_UNALIGNED_SSE2_4  1 ; nt
-; Special case: u = 4
-; %1 = 1 if non-temporal store desired
-; Register contract on entry is the same as the one documented on
-; MOVE_REVERSE_UNALIGNED_SSE2 (rsi aligned src, rdi aligned dest, rcx = bytes
-; in whole 32-byte blocks, edx = remainder, rax = misalignment).
-; Instead of byte shifts, the low dword is replaced with movss and the four
-; dwords are then rotated with shufps.
-        movaps  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
-%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
-        sub     rcx, 20H               ; sets ZF for the jnz at the bottom of the loop
-        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
-        movaps  xmm2, [rsi+rcx]
-        movaps  xmm3, xmm0             ; higher neighbour of xmm1
-        movaps  xmm0, xmm2             ; keep the lowest block for the next iteration
-        movss   xmm2, xmm1             ; low dword of xmm2 := low dword of xmm1
-        shufps  xmm2, xmm2, 00111001B  ; Rotate right
-        movss   xmm1, xmm3             ; low dword of xmm1 := low dword of xmm3
-        shufps  xmm1, xmm1, 00111001B  ; Rotate right
-        %IF %1 == 0
-        movaps  [rdi+rcx+10H], xmm1    ; Save aligned
-        movaps  [rdi+rcx], xmm2        ; Save aligned
-        %ELSE
-        movntps [rdi+rcx+10H], xmm1    ; Non-temporal save
-        movntps [rdi+rcx], xmm2        ; Non-temporal save
-        %ENDIF
-        jnz     %%L1                   ; ZF still from "sub rcx, 20H"
-                
-        ; Move edx remaining bytes
-        test    dl, 10H
-        jz      %%L2
-        ; One more 16-bytes block to move
-        sub     rcx, 10H
-        movaps  xmm1, [rsi+rcx]
-        movss   xmm1, xmm0
-        shufps  xmm1, xmm1, 00111001B  ; Rotate right
-        %IF %1 == 0
-        movaps  [rdi+rcx], xmm1        ; Save aligned
-        %ELSE
-        movntps [rdi+rcx], xmm1        ; Non-temporal save
-        %ENDIF        
-%%L2:     ; Get src pointer back to misaligned state
-        add     rsi, rax
-        %IF %1 == 1
-        sfence                         ; reached on both paths into %%L2
-        %ENDIF
-        ; Move remaining 0 - 15 bytes, unaligned
-        jmp     C200
-%ENDMACRO
-
-
-%MACRO  MOVE_REVERSE_UNALIGNED_SSE2_8  1 ; nt
-; Special case: u = 8
-; %1 = 1 if non-temporal store desired
-; Register contract on entry is the same as the one documented on
-; MOVE_REVERSE_UNALIGNED_SSE2.
-; Every block is loaded with its two qword halves swapped (shufps 01001110B);
-; movsd then overwrites the low qword with the low qword of the next block.
-; xmm0 and xmm1 alternate as "previous" and "current" block.
-        movaps  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
-        shufps  xmm0, xmm0, 01001110B  ; Rotate
-%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
-        sub     rcx, 20H               ; sets ZF for the jnz at the bottom of the loop
-        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
-        shufps  xmm1, xmm1, 01001110B  ; Rotate
-        movsd   xmm0, xmm1
-        %IF %1 == 0
-        movaps  [rdi+rcx+10H], xmm0    ; Save aligned
-        %ELSE
-        movntps [rdi+rcx+10H], xmm0    ; Non-temporal save
-        %ENDIF
-        movaps  xmm0, [rsi+rcx]
-        shufps  xmm0, xmm0, 01001110B  ; Rotate
-        movsd   xmm1, xmm0
-        %IF %1 == 0
-        movaps  [rdi+rcx], xmm1        ; Save aligned
-        %ELSE
-        movntps [rdi+rcx], xmm1        ; Non-temporal save
-        %ENDIF
-        jnz     %%L1                   ; ZF still from "sub rcx, 20H"
-                
-        ; Move edx remaining bytes
-        test    dl, 10H
-        jz      %%L2
-        ; One more 16-bytes block to move
-        sub     rcx, 10H
-        movaps  xmm1, [rsi+rcx]
-        shufps  xmm1, xmm1, 01001110B  ; Rotate 
-        movsd   xmm0, xmm1
-        %IF %1 == 0
-        movaps  [rdi+rcx], xmm0        ; Save aligned
-        %ELSE
-        movntps [rdi+rcx], xmm0        ; Non-temporal save
-        %ENDIF        
-%%L2:   ; Get src pointer back to misaligned state
-        add     rsi, rax
-        %IF %1 == 1
-        sfence                         ; reached on both paths into %%L2
-        %ENDIF
-        ; Move remaining 0 - 15 bytes, unaligned
-        jmp     C200
-%ENDMACRO
-
-
-%MACRO  MOVE_REVERSE_UNALIGNED_SSE2_12  1 ; nt
-; Special case: u = 12
-; %1 = 1 if non-temporal store desired
-; Register contract on entry is the same as the one documented on
-; MOVE_REVERSE_UNALIGNED_SSE2.
-; Every block is loaded and rotated by one dword (shufps 10010011B, the same
-; shuffle in all four places); movss then overwrites the low dword with the
-; low dword of the next block. xmm0 and xmm1 alternate as "previous" and
-; "current" block.
-        movaps  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
-        shufps  xmm0, xmm0, 10010011B  ; Rotate left
-%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
-        sub     rcx, 20H               ; sets ZF for the jnz at the bottom of the loop
-        movaps  xmm1, [rsi+rcx+10H]    ; Read next two blocks aligned
-        shufps  xmm1, xmm1, 10010011B  ; Rotate left
-        movss   xmm0, xmm1
-        %IF %1 == 0
-        movaps  [rdi+rcx+10H], xmm0    ; Save aligned
-        %ELSE
-        movntps [rdi+rcx+10H], xmm0    ; Non-temporal save
-        %ENDIF
-        movaps  xmm0, [rsi+rcx]
-        shufps  xmm0, xmm0, 10010011B  ; Rotate left
-        movss   xmm1, xmm0
-        %IF %1 == 0
-        movaps  [rdi+rcx], xmm1        ; Save aligned
-        %ELSE
-        movntps [rdi+rcx], xmm1        ; Non-temporal save
-        %ENDIF
-        jnz     %%L1                   ; ZF still from "sub rcx, 20H"
-                
-        ; Move edx remaining bytes
-        test    dl, 10H
-        jz      %%L2
-        ; One more 16-bytes block to move
-        sub     rcx, 10H
-        movaps  xmm1, [rsi+rcx]
-        shufps  xmm1, xmm1, 10010011B  ; Rotate left
-        movss   xmm0, xmm1
-        %IF %1 == 0
-        movaps  [rdi+rcx], xmm0        ; Save aligned
-        %ELSE
-        movntps [rdi+rcx], xmm0        ; Non-temporal save
-        %ENDIF        
-%%L2:   ; Get src pointer back to misaligned state
-        add     rsi, rax
-        %IF %1 == 1
-        sfence                         ; reached on both paths into %%L2
-        %ENDIF
-        ; Move remaining 0 - 15 bytes, unaligned
-        jmp     C200
-%ENDMACRO
-
-
-; Macros for each src alignment, Suppl.SSE3 instruction set:
-; Code for unaligned src, Suppl.SSE3 instruction set.
-; Make separate code for each alignment u because the palignr instruction
-; has the shift count as a constant:
-
-%MACRO  MOVE_REVERSE_UNALIGNED_SSSE3  1; u
-; Move rcx + rdx bytes of data
-; Source is misaligned. (src-dest) modulo 16 = %1
-; eax = %1
-; rsi = src - %1 = nearest preceding 16-bytes boundary
-; rdi = dest (aligned)
-; rcx = count rounded down to nearest divisible by 32 (positive, counts down)
-; edx = remaining bytes to move after loop
-; There is no non-temporal variant of this macro: the NT jump table uses the
-; SSE2 macros instead.
-        movdqa  xmm0, [rsi+rcx]        ; Read from nearest following 16B boundary
-        
-%%L1:   ; Loop. rcx has positive index from the beginning, counting down to zero
-        ; xmm0 and xmm1 alternate: each palignr joins a block with its lower
-        ; neighbour, and the lower neighbour becomes the upper one next time.
-        movdqa  xmm1, [rsi+rcx-10H]    ; Read next two blocks        
-        palignr xmm0, xmm1, %1         ; Combine parts into aligned block
-        movdqa  [rdi+rcx-10H], xmm0    ; Save aligned
-        movdqa  xmm0, [rsi+rcx-20H]
-        palignr xmm1, xmm0, %1         ; Combine parts into aligned block
-        movdqa  [rdi+rcx-20H], xmm1    ; Save aligned
-        sub     rcx, 20H
-        jnz     %%L1
-        
-        ; Set up for edx remaining bytes
-        test    dl, 10H
-        jz      %%L2
-        ; One more 16-bytes block to move
-        sub     rcx, 10H
-        movdqa  xmm1, [rsi+rcx]        ; Read one more block
-        palignr xmm0, xmm1, %1         ; Combine parts into aligned block
-        movdqa  [rdi+rcx], xmm0        ; Save aligned
-        
-%%L2:   ; Get src pointer back to misaligned state
-        add     rsi, rax
-        ; Move remaining 0 - 15 bytes
-        jmp     C200
-%ENDMACRO
-
-
-; Make 15 instances of SSE2 macro for each value of the alignment u.
-; These are pointed to by the jump table MAlignmentDispatchSSE2 below
-; (aligns and fillers are inserted manually to minimize the 
-;  number of 16-bytes boundaries inside loops)
-; Label Dxyz handles src misalignment yz (hex); the second macro argument 0
-; selects ordinary (cached) stores. The dword-multiple cases 4, 8 and 12 use
-; their dedicated macros and are emitted first; the jump table, not the
-; emission order, maps alignment to label.
-
-align   16
-D104:   MOVE_REVERSE_UNALIGNED_SSE2_4    0
-D108:   MOVE_REVERSE_UNALIGNED_SSE2_8    0
-D10C:   MOVE_REVERSE_UNALIGNED_SSE2_12   0
-D101:   MOVE_REVERSE_UNALIGNED_SSE2 1,   0
-D102:   MOVE_REVERSE_UNALIGNED_SSE2 2,   0
-D103:   MOVE_REVERSE_UNALIGNED_SSE2 3,   0
-D105:   MOVE_REVERSE_UNALIGNED_SSE2 5,   0
-D106:   MOVE_REVERSE_UNALIGNED_SSE2 6,   0
-D107:   MOVE_REVERSE_UNALIGNED_SSE2 7,   0
-D109:   MOVE_REVERSE_UNALIGNED_SSE2 9,   0
-D10A:   MOVE_REVERSE_UNALIGNED_SSE2 0AH, 0
-D10B:   MOVE_REVERSE_UNALIGNED_SSE2 0BH, 0
-D10D:   MOVE_REVERSE_UNALIGNED_SSE2 0DH, 0
-D10E:   MOVE_REVERSE_UNALIGNED_SSE2 0EH, 0
-D10F:   MOVE_REVERSE_UNALIGNED_SSE2 0FH, 0
-
-; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
-; These are pointed to by the jump table MAlignmentDispatchSSSE3 below
-; Label Exyz handles src misalignment yz (hex); the jump table, not the
-; emission order, maps alignment to label.
-
-align   16
-E104:   MOVE_REVERSE_UNALIGNED_SSSE3 4
-E108:   MOVE_REVERSE_UNALIGNED_SSSE3 8
-E10C:   MOVE_REVERSE_UNALIGNED_SSSE3 0CH
-E101:   MOVE_REVERSE_UNALIGNED_SSSE3 1
-E102:   MOVE_REVERSE_UNALIGNED_SSSE3 2
-E103:   MOVE_REVERSE_UNALIGNED_SSSE3 3
-E105:   MOVE_REVERSE_UNALIGNED_SSSE3 5
-E106:   MOVE_REVERSE_UNALIGNED_SSSE3 6
-E107:   MOVE_REVERSE_UNALIGNED_SSSE3 7
-E109:   MOVE_REVERSE_UNALIGNED_SSSE3 9
-E10A:   MOVE_REVERSE_UNALIGNED_SSSE3 0AH
-E10B:   MOVE_REVERSE_UNALIGNED_SSSE3 0BH
-E10D:   MOVE_REVERSE_UNALIGNED_SSSE3 0DH
-E10E:   MOVE_REVERSE_UNALIGNED_SSSE3 0EH
-E10F:   MOVE_REVERSE_UNALIGNED_SSSE3 0FH
-        
-align   16
-F100:   ; Non-temporal move, src and dest have same alignment.
-        ; Entry 0 of MAlignmentDispatchNT.
-        ; On entry: rcx = byte count of whole 32-byte blocks, edx = remaining
-        ; bytes (0 - 31), rsi/rdi already advanced by edx.
-        ; Loop. rcx has positive index from the beginning, counting down to zero
-        sub     rcx, 20H
-        movaps  xmm0, [rsi+rcx+10H]
-        movaps  xmm1, [rsi+rcx]
-        movntps [rdi+rcx+10H], xmm0
-        movntps [rdi+rcx], xmm1
-        jnz     F100                   ; ZF still from "sub rcx, 20H"
-        
-        ; Move the remaining edx bytes (0 - 31):
-        ; move 16-8-4-2-1 bytes, aligned
-        test    dl, 10H
-        jz      F100_fence
-        ; move 16 bytes
-        sub     rcx, 10H
-        movaps  xmm0, [rsi+rcx]
-        movntps  [rdi+rcx], xmm0
-F100_fence:
-        ; The loop above always issues non-temporal stores, so fence on every
-        ; exit path, not only when an extra 16-byte block was moved. This
-        ; matches the unconditional sfence in the misaligned NT macros.
-        sfence
-        ; move the remaining 0 - 15 bytes
-        jmp     C200
-
-; Non-temporal move, src and dest have different alignment.
-; Make 15 instances of SSE2 macro for each value of the alignment u.
-; These are pointed to by the jump table MAlignmentDispatchNT below
-; Label Fxyz handles src misalignment yz (hex); the trailing macro argument 1
-; selects non-temporal stores followed by sfence.
-
-align 16
-F101:   MOVE_REVERSE_UNALIGNED_SSE2 1,   1
-F102:   MOVE_REVERSE_UNALIGNED_SSE2 2,   1
-F103:   MOVE_REVERSE_UNALIGNED_SSE2 3,   1
-F104:   MOVE_REVERSE_UNALIGNED_SSE2_4    1
-F105:   MOVE_REVERSE_UNALIGNED_SSE2 5,   1
-F106:   MOVE_REVERSE_UNALIGNED_SSE2 6,   1
-F107:   MOVE_REVERSE_UNALIGNED_SSE2 7,   1
-F108:   MOVE_REVERSE_UNALIGNED_SSE2_8    1
-F109:   MOVE_REVERSE_UNALIGNED_SSE2 9,   1
-F10A:   MOVE_REVERSE_UNALIGNED_SSE2 0AH, 1
-F10B:   MOVE_REVERSE_UNALIGNED_SSE2 0BH, 1
-F10C:   MOVE_REVERSE_UNALIGNED_SSE2_12   1
-F10D:   MOVE_REVERSE_UNALIGNED_SSE2 0DH, 1
-F10E:   MOVE_REVERSE_UNALIGNED_SSE2 0EH, 1
-F10F:   MOVE_REVERSE_UNALIGNED_SSE2 0FH, 1
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;    CPU dispatcher
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-memmoveCPUDispatch:   ; CPU dispatcher, check for Suppl-SSE3 instruction set
-        ; This part is executed only once
-        ; memmoveDispatch initially points here. The routine picks the best
-        ; implementation, stores its address in memmoveDispatch, restores the
-        ; caller's registers and jumps to that implementation.
-        ; Selection order: memmoveSSE2 by default; memmoveSSSE3 if the SSSE3
-        ; bit is set; memmoveU if UnalignedIsFaster also returns nonzero;
-        ; memmoveU256 if Store256BitIsFaster also returns nonzero.
-        ; rbx is saved because it holds the chosen pointer and is overwritten
-        ; by cpuid; rcx/rdx/rsi/rdi/r8 are saved so they can be restored
-        ; before the final jump.
-        ; NOTE(review): six 8-byte pushes leave rsp at the same 16-byte phase
-        ; as on entry; whether the callees need an aligned stack cannot be
-        ; told from this excerpt.
-        push    rbx
-        push    rcx
-        push    rdx
-        push    rsi
-        push    rdi
-        push    r8        
-
-        ; set CacheBypassLimit to half the size of the largest level cache
-        ; (first-argument register differs between the Windows and Unix builds)
-%ifdef  WINDOWS
-        xor     ecx, ecx               ; 0 means default
-%else
-        xor     edi, edi               ; 0 means default
-%endif
-        call    SetMemcpyCacheLimit@
-        mov     eax, 1
-        cpuid                          ; Get feature flags
-        lea     rbx, [memmoveSSE2@]
-        bt      ecx, 9                 ; Test bit for SupplSSE3
-        jnc     Q100
-        lea     rbx, [memmoveSSSE3@]
-        call    UnalignedIsFaster
-        test    eax, eax
-        jz      Q100
-        lea     rbx, [memmoveU@]
-        call    Store256BitIsFaster
-        test    eax, eax
-        jz      Q100
-        lea     rbx, [memmoveU256@]
-        
-Q100:   ; Insert appropriate pointer
-        mov     [memmoveDispatch], rbx
-        mov     rax, rbx               ; keep the target in rax: rbx is restored below
-        pop     r8
-        pop     rdi
-        pop     rsi
-        pop     rdx
-        pop     rcx
-        pop     rbx
-        ; Jump according to the replaced function pointer
-        jmp     rax
-        
-; Note: Must call SetMemcpyCacheLimit1 defined in memcpy64.asm
-; Forwards its argument unchanged to SetMemcpyCacheLimit1 and stores the
-; 64-bit value returned in rax into this file's CacheBypassLimit.
-; The store is 8 bytes wide, so CacheBypassLimit must provide 8 bytes of storage.
-SetMemcpyCacheLimit:
-SetMemcpyCacheLimit@:
-        call    SetMemcpyCacheLimit1
-        mov     [CacheBypassLimit], rax
-        ret 
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;    data section. jump tables, dispatch function pointer, cache size
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Data segment must be included in function namespace
-SECTION .data
-align 16
-
-; Jump tables for alignments 0 - 15:
-; Each table holds 16 code pointers (8 bytes each), indexed by the
-; misalignment of the source relative to the 16-byte aligned destination.
-; The move routines pick MAlignmentDispatchSSE2 or MAlignmentDispatchSSSE3
-; for normal stores and MAlignmentDispatchNT when the byte count exceeds
-; CacheBypassLimit.
-
-; Code pointer for each alignment for SSE2 instruction set
-MAlignmentDispatchSSE2:
-DQ C100, D101, D102, D103, D104, D105, D106, D107
-DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
-
-; Code pointer for each alignment for Suppl-SSE3 instruction set
-MAlignmentDispatchSSSE3:
-DQ C100, E101, E102, E103, E104, E105, E106, E107
-DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
-
-; Code pointer for each alignment for non-temporal store
-MAlignmentDispatchNT:
-DQ F100, F101, F102, F103, F104, F105, F106, F107
-DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
-
-; Function pointer used to reach the selected implementation. It starts out
-; pointing at the CPU dispatcher, which overwrites it on first use.
-memmoveDispatch: DQ memmoveCPUDispatch
-
-; Bypass cache by using non-temporal moves if count > _CacheBypassLimit
-; The optimal value of CacheBypassLimit is difficult to estimate, but
-; a reasonable value is half the size of the largest cache:
-; Must be a full quadword: it is written with "mov [CacheBypassLimit], rax"
-; and compared with 64-bit registers. A DD here would make those accesses
-; touch 4 bytes beyond the variable.
-CacheBypassLimit: DQ 0
diff --git a/contrib/libs/asmlib/memset64.asm b/contrib/libs/asmlib/memset64.asm
deleted file mode 100644
index 52d647984d..0000000000
--- a/contrib/libs/asmlib/memset64.asm
+++ /dev/null
@@ -1,372 +0,0 @@
-%include "defs.asm"
-
-;*************************  memset64.asm  *************************************
-; Author:           Agner Fog
-; Date created:     2008-07-19
-; Last modified:    2016-11-12 (patched version with AVX512 support removed)
-; Description:
-; Faster version of the standard memset function:
-; void * A_memset(void * dest, int c, size_t count);
-; Sets 'count' bytes from 'dest' to the 8-bit value 'c'
-;
-; Overriding standard function memset:
-; The alias ?OVR_memset is changed to _memset in the object file if
-; it is desired to override the standard library function memset.
-;
-; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
-; extern "C" void   SetMemsetCacheLimit(size_t limit); // Change limit returned by GetMemsetCacheLimit; 0 restores the default
-;
-; Optimization:
-; Uses XMM registers to set 16 bytes at a time, aligned.
-;
-; The latest version of this file is available at:
-; www.agner.org/optimize/asmexamples.zip
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global A_memset: function              ; Function memset
-global EXP(memset): function           ; ?OVR removed if standard function memset overridden
-global memsetSSE2: function            ; SSE2 version
-global memsetAVX: function             ; version for CPUs with fast 256-bit store
-global GetMemsetCacheLimit: function   ; Data blocks bigger than this will be stored uncached by memset
-global SetMemsetCacheLimit: function   ; Change limit in GetMemsetCacheLimit
-
-; Imported from cachesize64.asm:
-extern DataCacheSize                   ; Get size of data cache
-
-; Imported from unalignedisfaster64.asm:
-extern Store256BitIsFaster             ; Tells if a 256 bit store is faster than two 128 bit stores
-
-; PROLOGM: bind Rdest/Rcount/Rdest2/Rcount2 to registers for the target ABI and load c into eax
-%MACRO  PROLOGM  0
-%IFDEF  WINDOWS
-%define Rdest   rcx                    ; dest (arrives in rcx)
-        movzx   eax, dl                ; c, zero-extended from the low byte of rdx
-        mov     rdx, r8                ; count moved from r8 so that Rcount is rdx on both platforms
-%define Rcount  rdx                    ; count (working register)
-%define Rdest2  r9                     ; copy of dest (filled by the code after the macro, not here)
-%define Rcount2 r8                     ; copy of count (r8 still holds the incoming count)
-
-%ELSE   ; Unix
-%define Rdest   rdi                    ; dest (arrives in rdi)
-        movzx   eax, sil               ; c, zero-extended from the low byte of rsi
-%define Rcount  rdx                    ; count (arrives in rdx)
-%define Rdest2  rcx                    ; copy of dest (filled by the code after the macro, not here)
-%define Rcount2 rsi                    ; copy of count
-        mov     Rcount2, Rcount        ; copy count; rsi is reused after c has been read from sil
-%ENDIF
-%ENDMACRO
-
-
-SECTION .text  align=16
-
-; extern "C" void * memset(void * dest, int c, size_t count);
-; Public entry points; both labels share one indirect jump:
-A_memset:
-EXP(memset):
-        jmp     [memsetDispatch]       ; pointer starts as memsetCPUDispatch, which replaces it with the chosen version
-        
-memsetAVX:  ; AVX version. Uses ymm stores for the 32-byte blocks of large fills
-memsetAVX@: ; local label
-        PROLOGM
-        imul    eax, 01010101H         ; Broadcast c into all 4 bytes of eax
-        mov     Rdest2, Rdest          ; save dest for the return value
-        cmp     Rcount, 16
-        ja      B100                   ; more than 16 bytes: vector path
-B050:   lea     r10, [MemsetJTab]      ; SSE2 version also jumps here for count <= 16
-        jmp     qword [r10+Rcount*8]   ; jump table indexed by count (0..16)
-        
-; Separate code for each count from 0 to 16 (targets of MemsetJTab).
-M16:    mov     [Rdest+12], eax        ; each label falls through the ones below it,
-M12:    mov     [Rdest+8],  eax        ; so entering at Mnn stores exactly nn bytes
-M08:    mov     [Rdest+4],  eax
-M04:    mov     [Rdest],    eax
-M00:    mov     rax, Rdest2            ; return dest
-        ret
-
-M15:    mov     [Rdest+11], eax        ; counts = 3 mod 4: dwords from offset 3 upward
-M11:    mov     [Rdest+7],  eax
-M07:    mov     [Rdest+3],  eax
-M03:    mov     [Rdest+1],  ax         ; bytes 1-2
-M01:    mov     [Rdest],    al         ; byte 0
-        mov     rax, Rdest2            ; return dest
-        ret
-       
-M14:    mov     [Rdest+10], eax        ; counts = 2 mod 4: dwords from offset 2 upward
-M10:    mov     [Rdest+6],  eax
-M06:    mov     [Rdest+2],  eax
-M02:    mov     [Rdest],    ax         ; bytes 0-1
-        mov     rax, Rdest2            ; return dest
-        ret
-
-M13:    mov     [Rdest+9],  eax        ; counts = 1 mod 4: dwords from offset 1 upward
-M09:    mov     [Rdest+5],  eax
-M05:    mov     [Rdest+1],  eax
-        mov     [Rdest],    al         ; byte 0
-        mov     rax, Rdest2            ; return dest
-        ret
-        
-B100:   ; AVX version, Rcount > 16
-        movd    xmm0, eax              ; eax holds c replicated in 4 bytes
-        pshufd  xmm0, xmm0, 0          ; Broadcast c into all bytes of xmm0
-        
-        lea     rax, [Rdest+Rcount]    ; rax = dest + count, one past the last byte
-        
-        cmp     Rcount, 20H
-        jbe     K600                   ; 17..32 bytes: two overlapping 16-byte stores are enough
-        
-        ; Store the first possibly unaligned 16 bytes
-        ; It is faster to always write 16 bytes, possibly overlapping
-        ; with the subsequent regular part, than to make possibly mispredicted
-        ; branches depending on the size of the first part.
-        movups  oword [Rdest], xmm0
-        
-        ; store another 16 bytes at the next 16-byte boundary
-        add     Rdest, 10H
-        and     Rdest, -10H
-        movaps  oword [Rdest], xmm0
-        
-        ; advance to the next 32-byte boundary
-        add     Rdest, 10H
-        and     Rdest, -20H
-        
-        ; Check if count very big
-        cmp     Rcount, [MemsetCacheLimit]        
-        ja      K300                   ; Use non-temporal store if count > MemsetCacheLimit
-        
-        ; find last 32-byte boundary at or before the end
-        mov     Rcount, rax
-        and     Rcount, -20H
-        
-        ; Rdest = start - end of the 32-byte-block part, i.e. minus its size
-        sub     Rdest, Rcount
-        jnb     K200                   ; no borrow: start >= end, no whole blocks to write
-        
-        ; copy the low 128 bits of ymm0 into its high half
-        vinsertf128 ymm0,ymm0,xmm0,1
-        
-align   16        
-K100:   ; Loop through 32-bytes blocks. Register use is swapped
-        ; Rcount = end of 32-bytes blocks part
-        ; Rdest = negative index from the end, counting up to zero
-        vmovaps [Rcount+Rdest], ymm0
-        add     Rdest, 20H
-        jnz     K100
-        vzeroupper                     ; clear upper ymm halves before the non-VEX stores below
-        
-K200:   ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
-        movups  [rax-20H], xmm0
-        movups  [rax-10H], xmm0
-        mov     rax, Rdest2            ; return dest
-        ret
-        
-K300:   ; Use non-temporal moves, same code as above:
-
-        ; find last 32-byte boundary at or before the end
-        mov     Rcount, rax
-        and     Rcount, -20H
-        
-        ; Rdest = start - end of the 32-byte-block part, i.e. minus its size
-        sub     Rdest, Rcount
-        jnb     K500                   ; no borrow: start >= end, no whole blocks to write
-        
-        ; copy the low 128 bits of ymm0 into its high half
-        vinsertf128 ymm0,ymm0,xmm0,1
-        
-align   16        
-K400:   ; Loop through 32-bytes blocks. Register use is swapped
-        ; Rcount = end of 32-bytes blocks part
-        ; Rdest = negative index from the end, counting up to zero
-        vmovntps [Rcount+Rdest], ymm0
-        add     Rdest, 20H
-        jnz     K400
-        sfence                         ; fence after the non-temporal stores
-        vzeroupper                     ; clear upper ymm halves before the non-VEX stores below
-        
-K500:   ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
-        movups  [rax-20H], xmm0
-        movups  [rax-10H], xmm0
-        mov     rax, Rdest2            ; return dest
-        ret
-        
-K600:   ; 16 < count <= 32
-        movups [Rdest], xmm0           ; first 16 bytes
-        movups [rax-10H], xmm0         ; last 16 bytes, overlapping the first store when count < 32
-        mov     rax, Rdest2            ; return dest
-        ret
-        
-
-memsetSSE2:  ; SSE2 version. Counts <= 16 are routed to the shared jump table at B050
-memsetSSE2@: ; local label
-        PROLOGM
-        imul    eax, 01010101H         ; Broadcast c into all 4 bytes of eax
-        mov     Rdest2, Rdest          ; save dest for the return value
-        cmp     Rcount, 16
-        jna     B050                   ; count <= 16: small-count handlers
-
-        movd    xmm0, eax
-        pshufd  xmm0, xmm0, 0          ; Broadcast c into all bytes of xmm0
-        
-        ; Store the first unaligned part.
-        ; The size of this part is 1 - 16 bytes.
-        ; It is faster to always write 16 bytes, possibly overlapping
-        ; with the subsequent regular part, than to make possibly mispredicted
-        ; branches depending on the size of the first part.
-        movq    qword [Rdest],   xmm0
-        movq    qword [Rdest+8], xmm0
-        
-        ; Check if count very big
-M150:   mov     rax, [MemsetCacheLimit]        
-        cmp     Rcount, rax
-        ja      M500                   ; Use non-temporal store if count > MemsetCacheLimit
-        
-        ; Point to end of regular part:
-        ; Round down dest+count-1 to a 16-byte boundary
-        lea     Rcount, [Rdest+Rcount-1]
-        and     Rcount, -10H
-        
-        ; Point to start of regular part:
-        ; Round up dest to next 16-bytes boundary
-        add     Rdest, 10H
-        and     Rdest, -10H
-        
-        ; -(size of regular part)
-        sub     Rdest, Rcount
-        jnb     M300                   ; no borrow: start >= end, regular part is empty
-        
-align 16
-M200:   ; Loop through regular part
-        ; Rcount = end of regular part
-        ; Rdest = negative index from the end, counting up to zero
-        movdqa  [Rcount+Rdest], xmm0
-        add     Rdest, 10H
-        jnz     M200
-        
-M300:   ; Do the last irregular part
-        ; The size of this part is 1 - 16 bytes.
-        ; It is faster to always write 16 bytes, possibly overlapping
-        ; with the preceding regular part, than to make possibly mispredicted
-        ; branches depending on the size of the last part.
-        mov     rax, Rdest2                          ; dest (also the return value)
-        movq    qword [rax+Rcount2-10H], xmm0        ; Rcount2 is the untouched copy of count
-        movq    qword [rax+Rcount2-8], xmm0
-        ret
-
-        
-M500:   ; Use non-temporal moves, same code as above:
-        ; End of regular part:
-        ; Round down dest+count-1 to a 16-byte boundary
-        lea     Rcount, [Rdest+Rcount-1]
-        and     Rcount, -10H
-        
-        ; Start of regular part:
-        ; Round up dest to next 16-bytes boundary
-        add     Rdest, 10H
-        and     Rdest, -10H
-        
-        ; -(size of regular part)
-        sub     Rdest, Rcount
-        jnb     M700                   ; no borrow: start >= end, regular part is empty
-
-align 16        
-M600:   ; Loop through regular part
-        ; Rcount = end of regular part
-        ; Rdest = negative index from the end, counting up to zero
-        movntdq [Rcount+Rdest], xmm0
-        add     Rdest, 10H
-        jnz     M600
-        sfence                         ; fence after the non-temporal stores
-
-M700:   ; Do the last irregular part
-        ; The size of this part is 1 - 16 bytes.
-        ; It is faster to always write 16 bytes, possibly overlapping
-        ; with the preceding regular part, than to make possibly mispredicted
-        ; branches depending on the size of the last part.
-        mov     rax, Rdest2            ; dest (also the return value)
-        movq    qword [rax+Rcount2-10H], xmm0
-        movq    qword [rax+Rcount2-8], xmm0
-        ret
-        
-        
-memsetCPUDispatch:    ; CPU dispatcher: pick memsetSSE2 or memsetAVX, patch memsetDispatch, then run the pick
-        ; Reached through memsetDispatch until that pointer is replaced below
-        push    rbx                    ; rbx carries the chosen entry point across the calls
-        push    rcx                    ; the remaining pushes preserve memset's argument
-        push    rdx                    ; registers (both ABIs) for the final jump
-        push    rsi
-        push    rdi
-        push    r8
-        ; initialise MemsetCacheLimit (half the largest cache, see GetMemsetCacheLimit)
-        call    GetMemsetCacheLimit@
-        lea     rbx, [memsetSSE2@]     ; default choice
-        call    Store256BitIsFaster    ; Test if 256-bit read/write is available and faster than 128-bit read/write
-        test    eax, eax
-        jz      Q100                   ; zero result: keep the SSE2 version
-        lea     rbx, [memsetAVX@]
-Q100:
-        ; Insert appropriate pointer
-        mov     [memsetDispatch], rbx
-        mov     rax, rbx               ; rbx is restored below, so keep the target in rax
-        pop     r8
-        pop     rdi
-        pop     rsi
-        pop     rdx
-        pop     rcx
-        pop     rbx
-        ; Continue in the selected version with the original arguments
-        jmp     rax
-
-        
-; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
-GetMemsetCacheLimit:
-GetMemsetCacheLimit@:                  ; local label used by internal callers
-        mov     rax, [MemsetCacheLimit]
-        test    rax, rax
-        jnz     U200                   ; already set: return the stored value
-        ; Not set yet: compute half the size of the largest level cache
-%ifdef  WINDOWS
-        xor     ecx, ecx               ; 0 means largest level cache
-%else
-        xor     edi, edi               ; 0 means largest level cache
-%endif
-        call    DataCacheSize          ; get cache size
-        shr     eax, 1                 ; half the size (32-bit operation, upper half of rax becomes 0)
-        jnz     U100                   ; non-zero result: use it
-        mov     eax, 400000H           ; cannot determine cache size. use 4 Mbytes
-U100:   mov     [MemsetCacheLimit], eax ; low dword only; the high dword is still 0 on this path
-U200:   ret
-
-; extern "C" void   SetMemsetCacheLimit(); // Takes the new limit as first integer argument; 0 selects the default
-SetMemsetCacheLimit:
-%ifdef  WINDOWS
-        mov     rax, rcx               ; new limit (Windows: first argument in rcx)
-%else
-        mov     rax, rdi               ; new limit (Unix: first argument in rdi)
-%endif
-        test    rax, rax
-        jnz     U400                   ; non-zero: store it as given
-        ; zero means default: clear the variable so GetMemsetCacheLimit recomputes it
-        mov     [MemsetCacheLimit], rax
-        call    GetMemsetCacheLimit@
-U400:   mov     [MemsetCacheLimit], rax ; store the requested or recomputed limit
-        ret
-        
-   
-SECTION .data
-align 16
-; Jump table for count from 0 to 16 (17 qword entries, indexed by count at B050):
-MemsetJTab:DQ M00, M01, M02, M03, M04, M05, M06, M07
-           DQ M08, M09, M10, M11, M12, M13, M14, M15, M16
-           
-; Pointer to appropriate version, used by the jmp at A_memset.
-; This initially points to memsetCPUDispatch. memsetCPUDispatch will
-; change this to memsetSSE2 or memsetAVX, so that
-; memsetCPUDispatch is only executed once:
-memsetDispatch: DQ memsetCPUDispatch           
-
-; Bypass cache by using non-temporal moves if count > MemsetCacheLimit
-; The optimal value of MemsetCacheLimit is difficult to estimate, but
-; a reasonable value is half the size of the largest cache.
-MemsetCacheLimit: DQ 0                 ; 0 = not determined yet; GetMemsetCacheLimit fills it in
diff --git a/contrib/libs/asmlib/mersenne64.asm b/contrib/libs/asmlib/mersenne64.asm
deleted file mode 100644
index 758075d61d..0000000000
--- a/contrib/libs/asmlib/mersenne64.asm
+++ /dev/null
@@ -1,616 +0,0 @@
-%include "defs.asm"
-
-; ----------------------------- MERSENNE64.ASM ---------------------------
-; Author:           Agner Fog
-; Date created:     1998
-; Last modified:    2013-09-13
-; Source URL:       www.agner.org/optimize
-; Project:          asmlib.zip
-; Language:         assembly, NASM/YASM syntax, 64 bit
-; Description:
-; Random Number generator 'Mersenne Twister' type MT11213A (or MT19937)
-;
-;
-;  This random number generator is described in the article by
-;  M. Matsumoto & T. Nishimura, in:
-;  ACM Transactions on Modeling and Computer Simulation,
-;  vol. 8, no. 1, 1998, pp. 3-30. See also:
-;  http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
-;
-;  Initialization:
-;  MersRandomInit must be called before the first call to any of the other
-;  random number functions. The seed is any 32-bit integer.
-;  You may use MersRandomInitByArray instead if you want more
-;  than 32 bits for seed. length is the number of integers in seeds[].
-;  length must be > 0, there is no upper limit for length.
-;
-;  Generating random numbers:
-;  MersRandom returns a floating point number in the interval 0 <= x < 1 with
-;  a resolution of 32 bits.
-;  MersIRandom returns an integer in the interval defined by min and max with
-;  a resolution of 32 bits.
-;  MersIRandomX returns an integer in the interval defined by min and max with
-;  exactly equal probabilities of all values in the interval.
-;  MersBRandom returns 32 random bits.
-;
-;  Error conditions:
-;  If MersRandomInit or MersRandomInitByArray has not been called then MersRandom
-;  and MersBRandom keep returning 0, and MersIRandom and MersIRandomX return min.
-;  MersIRandom and MersIRandomX return a large negative number if max < min.
-;
-;  C++ prototypes in randoma.h:
-;  Thread-safe versions:
-;  extern "C" void   MersRandomInit(void * Pthis, int seed);         // Re-seed
-;  extern "C" void   MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length); // Seed by more than 32 bits
-;  extern "C" int    MersIRandom (void * Pthis, int min, int max);   // Output random integer
-;  extern "C" int    MersIRandomX(void * Pthis, int min, int max);   // Output random integer, exact
-;  extern "C" double MersRandom(void * Pthis);                       // Output random float
-;  extern "C" unsigned int MersBRandom(void * Pthis);                // Output random bits
-;
-;  Single-threaded versions:
-;  extern "C" void   MersenneRandomInit(int seed);                   // Re-seed
-;  extern "C" void   MersenneRandomInitByArray(unsigned int seeds[], int length); // Seed by more than 32 bits
-;  extern "C" int    MersenneIRandom (int min, int max);             // Output random integer
-;  extern "C" int    MersenneIRandomX(int min, int max);             // Output random integer, exact
-;  extern "C" double MersenneRandom();                               // Output random float
-;  extern "C" unsigned int MersenneBRandom();                        // Output random bits
-;
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-; structure definition and constants:
-%INCLUDE "randomah.asi"
-
-global MersenneRandomInit, MersenneRandomInitD, MersRandomInit
-global MersenneRandomInitByArray, MersenneRandomInitByArrayD, MersRandomInitByArray
-global MersenneBRandom, MersenneBRandomD, MersBRandom
-global MersenneRandom, MersenneRandomD, MersRandom
-global MersenneIRandom, MersenneIRandomD, MersIRandom
-global MersenneIRandomX, MersenneIRandomXD, MersIRandomX
-
-
-section .data
-align 16                               ; the code masks instance pointers with -16, so keep this 16-byte aligned
-
-; Static instance used by the single-threaded Mersenne* entry points
-MersenneInstance: ISTRUC CRandomMersenneA
-IEND
-; Size of structure in bytes (Mers_init0 uses it to clear the buffer)
-MersenneSize equ $ - MersenneInstance
-
-
-SECTION .text  ALIGN=16                ; standard code section; a non-standard name such as .CODE gets non-executable defaults in ELF output
-
-MersenneRandomInit: ; single-threaded seed entry: loads the static instance, then joins MersRandomInit
-%IFDEF UNIX
-        mov     edx, edi                                   ; seed (Unix: first argument)
-        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
-        jmp     ?Windows_MersRandomInit                    ; join the common code with rcx = Pthis, edx = seed
-%ENDIF
-%IFDEF WINDOWS
-MersenneRandomInitD:                                       ; alias
-        mov     edx, ecx                                   ; seed (Windows: first argument)
-        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
-        ;jmp     ?Windows_MersRandomInit                   ; not needed: execution falls through into MersRandomInit below
-%ENDIF
-;MersenneRandomInit ENDP
-
-        
-; Thread-safe version:
-;  extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
-MersRandomInit: ;   PROC
-%IFDEF UNIX
-        ; translate calling convention to the internal one (rcx = Pthis, edx = seed)
-        mov     edx, esi                                   ; seed
-        mov     rcx, rdi                                   ; Pthis
-%ENDIF
-        ; parameters: rcx = Pthis, edx = seed
-        and     rcx, -16                                   ; round Pthis down to a 16-byte boundary
-        ?Windows_MersRandomInit:                           ; internal entry, expects rcx = Pthis, edx = seed
-        call    Mers_init0                                 ; initialize mt buffer with seeds
-        
-        ; Number of premade numbers that are lost in the initialization when the  
-        ; SSE2 implementation makes up to 4 premade numbers at a time:
-%IF MERS_N & 3        
-   PREMADELOST equ (MERS_N & 3)
-%ELSE
-   PREMADELOST equ 4
-%ENDIF
-        ; We want the C++ and the assembly implementation to give exactly the same
-        ; sequence. The C++ version discards 37 random numbers after initialization.
-        ; The assembly version generates a sequence that is PREMADELOST + 1 numbers
-        ; behind. Therefore we discard the first 37 + PREMADELOST + 1 numbers
-        ; (the non-SSE2 adjustment below is commented out).
-        
-        push    rbx                                        ; rbx is the loop counter
-        mov     ebx, 37+PREMADELOST+1
-        ; CMP     dword [rcx+CRandomMersenneA.Instset], 4  ; can we use XMM registers and SSE2 ?
-        ; jae     M110
-        ; sub     ebx, PREMADELOST                         ; SSE2 not supported
-        ; mov     dword [rcx+CRandomMersenneA.PreInx], 0   ; reset index to premade list
-M110:   ; target of the disabled branch above
-M120:   call    ?Windows_MersBRandom                       ; generate one number and discard it
-        dec     ebx
-        jnz     M120
-        pop     rbx
-        ret
-;MersRandomInit ENDP
-        
-
-Mers_init0:                                                ; make random seeds from edx and put them into MT buffer
-; Input parameters: 
-; rcx points to CRandomMersenneA
-; edx: seed
-; rcx and rdi are preserved (saved/restored on the stack); eax and edx are overwritten
-
-        push    rdi
-        ; clear the structure, except for its first 16 bytes
-        push    rcx
-        mov     rdi, rcx                                   ; Pthis
-        add     rdi, 16                                    ; start clearing 16 bytes into the structure
-        mov     ecx, (MersenneSize - 16) / 4               ; number of dwords to clear
-        xor     eax, eax
-        cld
-        rep     stosd
-        pop     rcx                                        ; Pthis
-        mov     edi, edx                                   ; seed
-        
-        ; initialize CRandomMersenneA structure
-        mov     dword [rcx+CRandomMersenneA.PreInx], 4*4   ; 16 = the "premade list empty" threshold tested in ?Windows_MersBRandom
-        mov     dword [rcx+CRandomMersenneA.Instset], 4
-        mov     eax, MERS_B
-        mov     [rcx+CRandomMersenneA.TMB], eax            ; MERS_B replicated into 4 dwords
-        mov     [rcx+CRandomMersenneA.TMB+4], eax
-        mov     [rcx+CRandomMersenneA.TMB+8], eax
-        mov     [rcx+CRandomMersenneA.TMB+12], eax
-        mov     eax, MERS_C
-        mov     [rcx+CRandomMersenneA.TMC], eax            ; MERS_C replicated into 4 dwords
-        mov     [rcx+CRandomMersenneA.TMC+4], eax
-        mov     [rcx+CRandomMersenneA.TMC+8], eax
-        mov     [rcx+CRandomMersenneA.TMC+12], eax
-        mov     eax, 3FF00000H                             ; upper dword of 1.0, double precision
-        mov     [rcx+CRandomMersenneA.one+4], eax          ; low dwords stay 0 from the clear above
-        mov     [rcx+CRandomMersenneA.one+12], eax        
-        mov     dword [rcx+CRandomMersenneA.LMASK], LOWER_MASK
-        mov     dword [rcx+CRandomMersenneA.UMASK], UPPER_MASK
-        mov     dword [rcx+CRandomMersenneA.MATA],  MERS_A
-
-        ; fill MT buffer: mt[0] = seed, mt[i] = 1812433253 * (mt[i-1] ^ (mt[i-1] >> 30)) + i
-        xor     eax, eax                                   ; i = 0
-M210:   mov     [rcx+rax*4+CRandomMersenneA.MT], edi       ; mt[i] = current value
-        mov     edx, edi
-        shr     edi, 30
-        xor     edi, edx                                   ; v ^ (v >> 30)
-        imul    edi, 1812433253
-        inc     eax                                        ; i++
-        add     edi, eax                                   ; + i (already incremented)
-        cmp     eax, MERS_N
-        jb      M210
-        
-        ; Set index MTI to end of list, (scaled by 4)
-        ; Round up to multiple of 4 to avoid alignment error
-        mov     dword [rcx+CRandomMersenneA.MTI], ((MERS_N+3) & (-4)) * 4
-        
-        pop     rdi
-        ret      
-
-
-; Single threaded version (operates on the static MersenneInstance):
-; extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length);
-
-MersenneRandomInitByArray: ; PROC                          ; entry for Linux call
-%IFDEF UNIX
-        mov     r8d, esi                                   ; length (Unix: second argument)
-        mov     rdx, rdi                                   ; seeds (Unix: first argument)
-        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
-        jmp     ?Windows_MersRandomInitByArray             ; join common code: rcx = Pthis, rdx = seeds, r8d = length
-%ENDIF
-%IFDEF WINDOWS
-MersenneRandomInitByArrayD: ; LABEL NEAR                   ; alias
-        mov     r8d, edx                                   ; length (Windows: second argument)
-        mov     rdx, rcx                                   ; seeds (Windows: first argument)
-        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
-        jmp     ?Windows_MersRandomInitByArray             ; join common code: rcx = Pthis, rdx = seeds, r8d = length
-%ENDIF        
-;MersenneRandomInitByArray ENDP       
-
-; Thread-safe version (no return value is set up by this routine):
-; extern "C" void MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length);
-MersRandomInitByArray: ; PROC
-%IFDEF UNIX
-        ; translate calling convention to the internal one
-        mov     r8d, edx                                   ; length
-        mov     rdx, rsi                                   ; seeds
-        mov     rcx, rdi                                   ; Pthis
-%ENDIF
-        
-?Windows_MersRandomInitByArray:
-; parameters: rcx = Pthis, rdx = seeds, r8d = length
-
-        and     rcx, -16                                   ; round Pthis down to a 16-byte boundary
-        push    rbx
-        push    rsi
-        push    rdi
-        push    rbp
-        mov     rbx, rdx                                   ; seeds
-        mov     ebp, r8d                                   ; length (saved across the call below)
-        
-        mov     edx, 19650218
-        call    Mers_init0                                 ; init0(19650218); (rcx unchanged)
-        
-        mov     r8d, ebp                                   ; r8d = length, ebp = k
-        test    ebp, ebp
-        jle     M380                                       ; error: length <= 0, skip the mixing loops
-        xor     edi, edi                                   ; j = 0
-        lea     esi, [rdi+1]                               ; i = 1
-        cmp     ebp, MERS_N
-        ja      M310
-        mov     ebp, MERS_N                                ; k = max (MERS_N,length)
-M310:
-
-        ; for (; k; k--) {   first pass: mix the seeds into mt[]
-M320:   mov     eax, [rcx+rsi*4-4+CRandomMersenneA.MT]     ; mt[i-1]
-        mov     edx, eax
-        shr     eax, 30
-        xor     eax, edx                                   ; mt[i-1] ^ (mt[i-1] >> 30)
-        imul    eax, 1664525                               ; * 1664525
-        xor     eax, [rcx+rsi*4+CRandomMersenneA.MT]       ; ^ mt[i]
-        add     eax, [rbx+rdi*4]                           ; + seeds[j]
-        add     eax, edi                                   ; + j
-        mov     [rcx+rsi*4+CRandomMersenneA.MT], eax       ; save in mt[i]
-        inc     esi                                        ; i++
-        inc     edi                                        ; j++
-        cmp     esi, MERS_N
-        jb      M330                                       ; if (i>=MERS_N)
-        mov     eax, [rcx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
-        mov     [rcx+CRandomMersenneA.MT], eax
-        mov     esi, 1                                     ; i=1;
-M330:
-        cmp     edi, r8d                                   ; length
-        jb      M340                                       ; if (j>=length)
-        xor     edi, edi                                   ; j = 0;
-M340:
-        dec     ebp                                        ; k--
-        jnz     M320                                       ; first k loop
-M350:
-        mov     ebp, MERS_N-1                              ; k for the second pass
-M360:   mov     eax, [rcx+rsi*4-4+CRandomMersenneA.MT]     ; mt[i-1]
-        mov     edx, eax
-        shr     eax, 30
-        xor     eax, edx                                   ; mt[i-1] ^ (mt[i-1] >> 30)
-        imul    eax, 1566083941                            ; * 1566083941
-        xor     eax, [rcx+rsi*4+CRandomMersenneA.MT]       ; ^ mt[i]
-        sub     eax, esi                                   ; - i
-        mov     [rcx+rsi*4+CRandomMersenneA.MT], eax       ; save in mt[i]
-        inc     esi                                        ; i++
-        cmp     esi, MERS_N
-        jb      M370                                       ; if (i>=MERS_N)
-        mov     eax, [rcx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
-        mov     [rcx+CRandomMersenneA.MT], eax
-        mov     esi, 1                                     ; i=1;
-M370:
-        dec     ebp                                        ; k--
-        jnz     M360                                       ; second k loop
-        mov     dword [rcx+CRandomMersenneA.MT], 80000000H ; mt[0] = 0x80000000
-M380:
-        mov     dword [rcx+CRandomMersenneA.MTI], 0        ; restart at the beginning of the MT buffer
-        mov     dword [rcx+CRandomMersenneA.PreInx], 0     ; and at the beginning of the premade list
-
-; discard first MERS_N random numbers + PREMADELOST+1 to compensate for lag
-        mov     edi, MERS_N + PREMADELOST+1                ; NOTE(review): edi is the loop counter, so this relies on the callee preserving rdi
-M391:   call    ?Windows_MersBRandom
-        dec     edi
-        jnz     M391
-
-        pop     rbp                                        ; restore registers
-        pop     rdi
-        pop     rsi
-        pop     rbx
-        ret
-;MersRandomInitByArray ENDP
-
-
-; Single threaded version (operates on the static MersenneInstance):
-; extern "C" unsigned int MersenneBRandom(); // Output random bits
-
-MersenneBRandom: ; PROC                                    ; entry for both Windows and Linux call
-%IFDEF WINDOWS
-MersenneBRandomD: ; LABEL NEAR                             ; alias
-%ENDIF
-        lea     rcx, [MersenneInstance]                    ; Point to instance
-        jmp     ?Windows_MersBRandom                       ; continue in the shared generator with rcx = Pthis
-;MersenneBRandom ENDP       
-
-; Thread-safe version:
-; extern "C" unsigned int MersBRandom(void * Pthis);       // Output random bits
-
-MersBRandom: ; PROC
-%IFDEF UNIX
-        mov     rcx, rdi                                   ; translate calling convention
-%ENDIF
-
-?Windows_MersBRandom: ; LABEL NEAR                         ; Label used internally
-        and     rcx, -16                                   ; align buffer
-        mov     edx, [rcx+CRandomMersenneA.PreInx]         ; index into premade numbers
-        mov     eax, [rcx+rdx*1+CRandomMersenneA.PreInt]   ; fetch premade random number
-        add     edx, 4
-        mov     [rcx+CRandomMersenneA.PreInx], edx
-        cmp     edx, 4*4
-        jnb     M410
-        ret                                                ; return premade number
-
-M410:
-; PREMADE list is empty. Make 4 more numbers ready for next call:
-        mov     edx, [rcx+CRandomMersenneA.MTI]            ; fetch 4 numbers from MT buffer
-        movdqa  xmm0, oword [rcx+rdx*1+CRandomMersenneA.MT]
-        
-%IF TEMPERING                                              ; optional tempering algorithm
-        movdqa  xmm1, xmm0
-        psrld   xmm0, MERS_U
-        pxor    xmm0, xmm1
-        movdqa  xmm1, xmm0        
-        pslld   xmm0, MERS_S
-        pand    xmm0, oword [rcx+CRandomMersenneA.TMB]
-        pxor    xmm0, xmm1
-        movdqa  xmm1, xmm0        
-        pslld   xmm0, MERS_T
-        pand    xmm0, oword [rcx+CRandomMersenneA.TMC]
-        pxor    xmm0, xmm1
-        movdqa  xmm1, xmm0        
-        psrld   xmm0, MERS_L
-        pxor    xmm0, xmm1
-%ENDIF   ; tempering
-
-        ; save four premade integers
-        movdqa  oword [rcx+CRandomMersenneA.PreInt], xmm0
-        ; premake four floating point numbers
-        pxor    xmm1, xmm1
-        pxor    xmm2, xmm2
-        punpckldq xmm1, xmm0                               ; get first two numbers into bits 32-63 and 96-127
-        punpckhdq xmm2, xmm0                               ; get next  two numbers into bits 32-63 and 96-127
-        psrlq   xmm1, 12                                   ; get bits into mantissa position
-        psrlq   xmm2, 12                                   ; get bits into mantissa position
-        por     xmm1,oword[rcx+CRandomMersenneA.one]       ; set exponent for interval [1,2)
-        por     xmm2,oword[rcx+CRandomMersenneA.one]       ; set exponent for interval [1,2)
-        movdqa  oword [rcx+CRandomMersenneA.PreFlt], xmm1  ; store two premade numbers
-        movdqa  oword [rcx+CRandomMersenneA.PreFlt+16],xmm2; store two more premade numbers        
-        mov     dword [rcx+CRandomMersenneA.PreInx], 0     ; index to premade numbers 
-        add     edx, 4*4                                   ; increment MTI index into MT buffer by 4
-        mov     [rcx+CRandomMersenneA.MTI], edx
-        cmp     edx, MERS_N*4
-        jae     M420
-        ret                                                ; return random number in eax
-
-; MT buffer exhausted. Make MERS_N new numbers ready for next time
-M420:                                                      ; eax is the random number to return
-%IF     MERS_N & 3                                         ; if MERS_N is not divisible by 4
-        NVALID equ MERS_N & 3                              ; only NVALID of the 4 premade numbers are valid
-        ; Move premade numbers (4-NVALID) positions forward
-        movdqa  xmm0, [rcx+CRandomMersenneA.PreInt]
-        movdqa  xmm1, [rcx+CRandomMersenneA.PreFlt]
-        movdqa  xmm2, [rcx+CRandomMersenneA.PreFlt+16]
-        movdqu  [rcx+CRandomMersenneA.PreInt + (4-NVALID)*4], xmm0
-        movdqu  [rcx+CRandomMersenneA.PreFlt + (4-NVALID)*8], xmm1
-%IF NVALID == 3        
-        movq    [rcx+CRandomMersenneA.PreFlt+16 + 8], xmm2
-%ENDIF        
-        ; save index to first valid premade number
-        mov     [rcx+CRandomMersenneA.PreInx], (4-NVALID)*4  
-%ENDIF
-        
-; MT buffer is empty. Fill it up
-        push    rbx
-        movd    xmm3, [rcx+CRandomMersenneA.UMASK]         ; load constants
-        movd    xmm4, [rcx+CRandomMersenneA.LMASK]
-        movd    xmm5, [rcx+CRandomMersenneA.MATA]
-        pshufd  xmm3, xmm3, 0                              ; broadcast constants
-        pshufd  xmm4, xmm4, 0
-        pshufd  xmm5, xmm5, 0
-        xor     rbx,  rbx                                  ; kk = 0
-        mov     edx,  MERS_M*4                             ; km
-        
-; change rcx from pointing to CRandomMersenneA to pointing to CRandomMersenneA.MT
-        add     rcx, CRandomMersenneA.MT
-
-M430:   ; kk loop
-        movdqa  xmm2, [rcx+rbx]                            ; mt[kk]
-        movd    xmm0, dword [rcx+rbx+16]
-        movdqa  xmm1, [rcx+rbx]                            ; mt[kk]        
-        movss   xmm2, xmm0                                 ; faster than movdqu xmm2,[]
-        pshufd  xmm2, xmm2, 00111001B                      ; mt[kk+1]
-        movdqu  xmm0, oword [rcx+rdx]                      ; mt[km]        
-        ;movq   xmm0, qword [rcx+rdx]                      ; mt[km]
-        ;movhps xmm0, qword [rcx+rdx+8]                    ; faster than movdqu on older processors        
-        pand    xmm1, xmm3                                 ; mt[kk] & UPPER_MASK
-        pand    xmm2, xmm4                                 ; mt[kk+1] & LOWER_MASK
-        por     xmm1, xmm2                                 ; y        
-        movdqa  xmm2, xmm1                                 ; y
-        pslld   xmm1, 31                                   ; copy bit 0 into all bits
-        psrad   xmm1, 31                                   ; -(y & 1)
-        pand    xmm1, xmm5                                 ; & MERS_A
-        psrld   xmm2, 1                                    ; y >> 1
-        pxor    xmm0, xmm1
-        pxor    xmm0, xmm2
-        movdqa  [rcx+rbx], xmm0                            ; result into mt[kk]
-        cmp     ebx, (MERS_N-4)*4
-        jae     M440                                       ; exit loop when kk past end of buffer
-        add     ebx, 16                                    ; kk += 4
-        add     rdx, 16                                    ; km += 4 (signed)
-        cmp     edx, (MERS_N-4)*4
-        jbe     M430                                       ; skip unless km wraparound
-        sub     rdx, MERS_N*4                              ; km wraparound (signed)
-        movdqu  xmm0, [rcx+(MERS_N-4)*4]                   ; copy end to before begin for km wraparound
-        movdqa  [rcx-4*4], xmm0        
-        movdqa  xmm0, [rcx]                                ; copy begin to after end for kk wraparound
-        movdqu  [rcx+MERS_N*4], xmm0
-        jmp     M430
-
-M440:   ; loop finished. discard excess part of last result
-
-; change ecx back to pointing to CRandomMersenneA
-        sub     rcx, CRandomMersenneA.MT        
-
-        mov     dword [rcx+CRandomMersenneA.MTI], 0
-        pop     rbx
-        ret                                                ; random number is still in eax
-        
-;MersBRandom ENDP
-
-
-; Single threaded version:
-; extern "C" unsigned int MersenneRandom();  // Get floating point random number
-
-MersenneRandom: ; PROC                                     ; entry for both Windows and Linux call
-%IFDEF WINDOWS
-MersenneRandomD:                                           ; alias
-        lea     rcx, [MersenneInstance]                    ; Point to instance
-        ; continue in next function
-%ENDIF
-%IFDEF UNIX
-        lea     rdi, [MersenneInstance]                    ; Point to instance
-        ; continue in next function
-%ENDIF
-
-; Thread-safe version:
-; extern "C" double MersRandom(void * Pthis);  // Get floating point random number
-MersRandom: 
-%IFDEF UNIX
-        mov     rcx, rdi                                   ; translate calling convention
-%ENDIF
-        mov     edx, [rcx+CRandomMersenneA.PreInx]         ; index into premade numbers
-        movsd   xmm0, [rcx+rdx*2+CRandomMersenneA.PreFlt]  ; fetch premade floating point random number
-        subsd   xmm0, [rcx+CRandomMersenneA.one]           ; subtract 1.0
-        movsd   [rcx+CRandomMersenneA.TmpFlt], xmm0        ; store random number
-        call    ?Windows_MersBRandom                       ; prepare next random number
-        movsd   xmm0, [rcx+CRandomMersenneA.TmpFlt]        ; recall random number
-        ret        
-;MersenneRandom ENDP       
-
-
-
-; Single threaded version:
-; extern "C" unsigned int MersenneIRandom(int min, int max); // Get integer random number in desired interval
-
-MersenneIRandom: ; PROC 
-%IFDEF UNIX
-        push    rsi                                        ; max
-        push    rdi                                        ; min
-        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
-        jmp     MersIRandom_max_min_on_stack
-%ENDIF
-%IFDEF WINDOWS
-MersenneIRandomD:                                          ; Alias
-        push    rdx                                        ; max
-        push    rcx                                        ; min
-        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
-        jmp     MersIRandom_max_min_on_stack
-%ENDIF
-;MersenneIRandom ENDP       
-
-; Thread-safe version:
-; extern "C" int MersIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
-MersIRandom: ; PROC
-%IFDEF UNIX
-        ; translate calling convention
-        mov     r8d, edx                                   ; max
-        mov     edx, esi                                   ; min
-        mov     rcx, rdi                                   ; Pthis
-%ENDIF
-        push    r8                                         ; max
-        push    rdx                                        ; min
-MersIRandom_max_min_on_stack:
-        
-        call    ?Windows_MersBRandom                       ; random bits
-        pop     rcx                                        ; min
-        pop     rdx                                        ; max
-        sub     edx, ecx
-        js      short M720                                 ; max < min
-        add     edx, 1                                     ; interval = max - min + 1
-        mul     edx                                        ; multiply random number by interval and truncate
-        lea     eax, [rdx+rcx]                             ; add min
-        ret
-M720:   mov     eax, 80000000H                             ; error exit
-        ret
-;MersIRandom ENDP
-
-
-; Single threaded version:
-; extern "C" unsigned int MersenneIRandomX(int min, int max); // Get integer random number in desired interval
-
-MersenneIRandomX: ; PROC
-%IFDEF UNIX
-        mov     r8d, esi                                   ; max
-        mov     edx, edi                                   ; min
-        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
-        jmp     ?Windows_MersIRandomX
-%ENDIF
-%IFDEF WINDOWS
-MersenneIRandomXD:                                         ; alias
-        mov     r8d, edx                                   ; max
-        mov     edx, ecx                                   ; min
-        lea     rcx, [MersenneInstance]                    ; Pthis = point to instance
-        jmp     ?Windows_MersIRandomX
-%ENDIF
-;MersenneIRandomX ENDP       
-
-; Thread-safe version:
-; extern "C" int MersIRandomX(void * Pthis, int min, int max); // Get integer random number in desired interval
-MersIRandomX: ; PROC
-%IFDEF UNIX
-        ; translate calling convention
-        mov     r8d, edx                                   ; max
-        mov     edx, esi                                   ; min
-        mov     rcx, rdi                                   ; Pthis
-%ENDIF
-        
-?Windows_MersIRandomX:
-; parameters: rcx = Pthis, edx = min, r8d = max
-
-        and     rcx, -16                                   ; align buffer
-        push    rdi
-        mov     edi, r8d                                   ; max
-
-        sub     edi, edx                                   ; max - min
-        jle     short M830                                 ; max <= min (signed)
-        inc     edi                                        ; interval = max - min + 1
-        push    rdx                                        ; save min
-        
-        ; if (interval != LastInterval) {
-        cmp     edi, [rcx+CRandomMersenneA.LastInterval]
-        je      M810
-        ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
-        xor     eax, eax                                   ; 0
-        lea     edx, [rax+1]                               ; 1
-        div     edi                                        ; (would give overflow if interval = 1)
-        mul     edi
-        dec     eax
-        mov     [rcx+CRandomMersenneA.RLimit], eax
-        mov     [rcx+CRandomMersenneA.LastInterval], edi
-M810:
-M820:   ; do { // Rejection loop
-        call    ?Windows_MersBRandom                       ; random bits (rcx is preserved)
-        ; longran  = (uint64)BRandom() * interval;
-        mul     edi
-        ; } while (remainder > RLimit);
-        cmp     eax, [rcx+CRandomMersenneA.RLimit]
-        ja      M820
-        
-        ; return (int32)iran + min
-        pop     rax                                        ; min
-        add     eax, edx
-        pop     rdi
-        ret
-        
-M830:   jl      M840
-        ; max = min. Return min
-        mov     eax, edx
-        pop     rdi
-        ret                                                ; max = min exit
-        
-M840:   ; max < min: error
-        mov     eax, 80000000H                             ; error exit
-        pop     rdi
-        ret
-;MersIRandomX ENDP
diff --git a/contrib/libs/asmlib/mother64.asm b/contrib/libs/asmlib/mother64.asm
deleted file mode 100644
index c6fd34ec3b..0000000000
--- a/contrib/libs/asmlib/mother64.asm
+++ /dev/null
@@ -1,242 +0,0 @@
-%include "defs.asm"
-
-; ----------------------------- MOTHER64.ASM -----------------------------
-; Author:           Agner Fog
-; Date created:     1998
-; Last modified:    2013-09-11
-; Source URL:       www.agner.org/optimize
-; Project:          asmlib.zip
-; Language:         assembly, NASM/YASM syntax, 64 bit
-; Description:
-; Mother-of-All random number generator by Agner Fog
-; 64-bit mode version for x86-64 compatible microprocessors.
-;
-;  This is a multiply-with-carry type of random number generator
-;  invented by George Marsaglia.  The algorithm is:             
-;  S = 2111111111*X[n-4] + 1492*X[n-3] + 1776*X[n-2] + 5115*X[n-1] + C
-;  X[n] = S modulo 2^32
-;  C = floor(S / 2^32) 
-;
-; C++ prototypes:
-; extern "C" void         MotRandomInit(void * Pthis, int seed);      // Initialization
-; extern "C" int          MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
-; extern "C" double       MotRandom(void * Pthis);                    // Get floating point random number
-; extern "C" unsigned int MotBRandom(void * Pthis);                   // Output random bits
-;
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-; structure definition and constants:
-%INCLUDE "randomah.asi"
-
-; publics:
-global MotherBRandom, MotBRandom, ?Windows_MotBRandom
-global MotherRandom, MotRandom, MotherIRandom, MotIRandom
-global MotherRandomInit, MotRandomInit
-
-section .data
-align 16
-
-; Data for single instance of random number generator
-MotherInstance: ISTRUC CRandomMotherA
-IEND
-; Size of structure
-MotherSize equ $-MotherInstance
-
-
-SECTION .CODE ALIGN=16   ; code segment
-
-; Single threaded version:
-; extern "C" unsigned int MotherBRandom(); // Output random bits
-
-MotherBRandom: ; PROC                          ; entry for both Windows and Linux call
-        lea     rcx, [MotherInstance]         ; Point to instance
-        jmp     ?Windows_MotBRandom
-;MotherBRandom ENDP       
-
-; Thread-safe version:
-; extern "C" unsigned int MotBRandom(void * Pthis); // Output random bits
-
-MotBRandom: ; PROC 
-%IFDEF UNIX
-        mov     rcx, rdi                    ; translate calling convention
-%ENDIF
-?Windows_MotBRandom:
-        and     rcx, -16                    ; align
-        movdqa  xmm1, oword [rcx+CRandomMotherA.M3]  ; load M3,M2,M1,M0
-        mov     eax,  [rcx+CRandomMotherA.M0]              ; Retrieve previous random number
-        movdqa  xmm2, xmm1                                 ; copy
-        movdqa  xmm3, oword [rcx+CRandomMotherA.MF3] ; factors
-        psrlq   xmm2, 32                                   ; move M2,M0 down
-        movq    qword [rcx+CRandomMotherA.M4], xmm1    ; M4=M3, M3=M2
-        movhps  qword [rcx+CRandomMotherA.M2], xmm1    ; M2=M1, M1=M0
-        pmuludq xmm1, xmm3                                 ; M3*MF3, M1*MF1
-        psrlq   xmm3, 32                                   ; move MF2,MF0 down
-        pmuludq xmm2, xmm3                                 ; M2*MF2, M0*MF0
-        paddq   xmm1, xmm2                                 ; P2+P3, P0+P1
-        movhlps xmm2, xmm1                                 ; Get high qword
-        paddq   xmm1, xmm2                                 ; P0+P1+P2+P3
-        paddq   xmm1, oword [rcx+CRandomMotherA.MC]    ; +carry
-        movq    qword [rcx+CRandomMotherA.M0], xmm1    ; Store new M0 and carry
-        ; convert to double precision float
-        psllq   xmm1, 32                                   ; Discard carry bits
-        psrlq   xmm1, 12                                   ; Get bits into mantissa position
-        por     xmm1, oword [rcx+CRandomMotherA.one] ; Add exponent bits to get number in interval [1,2)
-        movq    [rcx+CRandomMotherA.RanP1], xmm1           ; Store floating point number
-        ret
-        
-;MotBRandom ENDP
-
-        
-; Single threaded version:
-; extern "C" unsigned int MotherRandom();  // Get floating point random number
-
-MotherRandom:
-%IFDEF UNIX
-        lea     rdi, [MotherInstance]         ; Point to instance
-%ENDIF
-%IFDEF WINDOWS
-        lea     rcx, [MotherInstance]         ; Point to instance
-%ENDIF
-
-; Thread-safe version:
-; extern "C" double MotRandom(void * Pthis);  // Get floating point random number
-MotRandom:
-%IFDEF UNIX
-        mov     rcx, rdi                                   ; translate calling convention
-%ENDIF
-        and     rcx, -16                    ; align
-        ; get previously prepared random number
-        movsd   xmm0, [rcx+CRandomMotherA.RanP1]
-        subsd   xmm0, [rcx+CRandomMotherA.one]
-
-        ; make new random number ready for next time
-        call    ?Windows_MotBRandom
-        ret
-;MotherRandom ENDP
-
-
-; Single threaded version:
-; extern "C" unsigned int MotherIRandom(int min, int max); // Get integer random number in desired interval
-
-MotherIRandom: ; PROC
-%IFDEF UNIX
-        mov     r8d, esi                    ; max
-        mov     edx, edi                    ; min
-        lea     rcx, [MotherInstance]       ; Pthis = point to instance
-        jmp     ?Windows_MotIRandom
-%ENDIF
-%IFDEF WINDOWS
-        mov     r8d, edx                    ; max
-        mov     edx, ecx                    ; min
-        lea     rcx, [MotherInstance]       ; Pthis = point to instance
-        jmp     ?Windows_MotIRandom
-%ENDIF
-; MotherIRandom ENDP       
-
-; Thread-safe version:
-; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
-MotIRandom:
-%IFDEF UNIX
-        ; translate calling convention
-        mov     r8d, edx                    ; max
-        mov     edx, esi                    ; min
-        mov     rcx, rdi                    ; Pthis
-%ENDIF
-        
-?Windows_MotIRandom: ;   LABEL NEAR         ; entry for Windows call
-        and     rcx, -16                    ; align
-        push    r8
-        push    rdx
-        call    ?Windows_MotBRandom         ; make random number
-        pop     rcx                         ; min
-        pop     r8                          ; max
-        sub     r8d, ecx
-        js      short rerror                ; max < min
-        inc     r8d                         ; interval = max - min + 1
-        mul     r8d                         ; multiply random number eax by interval and truncate
-        lea     eax, [rdx+rcx]              ; add min to interval*BRandom >> 32
-        ret                                 ; ret 8 if not _cdecl calling
-
-rerror: mov     eax, 80000000h              ; error exit   
-        ret                                 ; ret 8 if not _cdecl calling
-;MotIRandom ENDP
-
-
-; Single threaded version:
-; extern "C" unsigned int MotherRandomInit(int seed);  // Initialization
-
-MotherRandomInit: ; PROC
-%IFDEF UNIX
-        mov     edx, edi                    ; seed
-        lea     rcx, [MotherInstance]       ; Pthis = point to instance
-        jmp     ?Windows_MotRandomInit
-%ENDIF
-%IFDEF WINDOWS
-        mov     edx, ecx                    ; seed
-        lea     rcx, [MotherInstance]       ; Pthis = point to instance
-        jmp     ?Windows_MotRandomInit
-%ENDIF
-;MotherRandomInit ENDP       
-
-; Thread-safe version:
-; extern "C" void MotRandomInit(void * Pthis, int seed);  // Initialization
-MotRandomInit: ; PROC
-%IFDEF UNIX
-        ; translate calling convention
-        mov     edx, esi                    ; seed
-        mov     rcx, rdi                    ; Pthis
-%ENDIF
-        
-?Windows_MotRandomInit: ;   LABEL NEAR         ; entry for Windows call
-        and     rcx, -16                    ; align
-        ; clear my buffer
-        push    rdi
-        push    rcx
-        mov     rdi, rcx                    ; Pthis
-        add     rdi, 16
-        mov     ecx, (MotherSize - 16) / 4
-        xor     eax, eax
-        cld
-        rep     stosd
-        pop     rcx
-        
-        ; insert constants
-        mov     dword [rcx+CRandomMotherA.one+4], 3FF00000H  ; high dword of 1.0       
-        mov     dword [rcx+CRandomMotherA.MF0], 5115             ; factors
-        mov     dword [rcx+CRandomMotherA.MF1], 1776
-        mov     dword [rcx+CRandomMotherA.MF2], 1492
-        mov     dword [rcx+CRandomMotherA.MF3], 2111111111
-        
-        ; initialize from seed
-        mov     eax, edx                                   ; seed        
-        ; make random numbers and put them into buffer
-        mov     edx, 29943829
-        imul    eax, edx
-        dec     eax
-        mov     [rcx+CRandomMotherA.M0], eax
-        imul    eax, edx
-        dec     eax
-        mov     [rcx+CRandomMotherA.M1], eax
-        imul    eax, edx
-        dec     eax
-        mov     [rcx+CRandomMotherA.M2], eax
-        imul    eax, edx
-        dec     eax
-        mov     [rcx+CRandomMotherA.M3], eax
-        imul    eax, edx
-        dec     eax
-        mov     [rcx+CRandomMotherA.MC], eax
-
-        ; randomize some more
-        mov     edi, 20                                    ; loop counter
-r90:    call    ?Windows_MotBRandom                        ; (rcx and rdi unchanged)
-        dec     edi
-        jnz     r90
-        pop     rdi
-        ret
-;MotRandomInit ENDP
-
- ;       END
diff --git a/contrib/libs/asmlib/physseed64.asm b/contrib/libs/asmlib/physseed64.asm
deleted file mode 100644
index b30fc26712..0000000000
--- a/contrib/libs/asmlib/physseed64.asm
+++ /dev/null
@@ -1,396 +0,0 @@
-%include "defs.asm"
-
-;*************************  physseed64.asm  **********************************
-; Author:           Agner Fog
-; Date created:     2010-08-03
-; Last modified:    2013-09-13
-; Source URL:       www.agner.org/optimize
-; Project:          asmlib.zip
-; C++ prototype:
-; extern "C" int PhysicalSeed(int seeds[], int NumSeeds);
-;
-; Description:
-; Generates a non-deterministic random seed from a physical random number generator 
-; which is available on some processors. 
-; Uses the time stamp counter (which is less random) if no physical random number
-; generator is available.
-; The code is not optimized for speed because it is typically called only once.
-;
-; Parameters:
-; int seeds[]       An array which will be filled with random numbers
-; int NumSeeds      Indicates the desired number of 32-bit random numbers
-;
-; Return value:     0   Failure. No suitable instruction available (processor older than Pentium)
-;                   1   No physical random number generator. Used time stamp counter instead
-;                   2   Success. VIA physical random number generator used
-;                   3   Success. Intel physical random number generator used
-;                   4   Success. Intel physical seed generator used
-; 
-; The return value will indicate the availability of a physical random number generator
-; even if NumSeeds = 0.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-%define NUM_TRIES   20                 ; max number of tries for rdseed and rdrand instructions
-
-%define TESTING     0                  ; 1 for test only
-
-global PhysicalSeed
-
-; Direct entries to CPU-specific versions
-global PhysicalSeedNone: function
-global PhysicalSeedRDTSC: function
-global PhysicalSeedVIA: function
-global PhysicalSeedRDRand: function
-global PhysicalSeedRDSeed function
-
-; ***************************************************************************
-; Define registers used for function parameters, used in 64-bit mode only
-; ***************************************************************************
- 
-%IFDEF WINDOWS
-  %define par1     rcx
-  %define par2     rdx
-  %define par3     r8
-  %define par1d    ecx
-  %define par2d    edx
-  %define par3d    r8d
-%ENDIF
-  
-%IFDEF UNIX
-  %define par1     rdi
-  %define par2     rsi
-  %define par3     rdx
-  %define par1d    edi
-  %define par2d    esi
-  %define par3d    edx
-%ENDIF 
-
-
-SECTION .text  align=16
-
-%IFDEF WINDOWS
-global PhysicalSeedD@8                 ; DLL version
-PhysicalSeedD@8:
-%ENDIF
-
-PhysicalSeed:
-        jmp     [PhysicalSeedDispatch] ; Go to appropriate version, depending on instructions available
-
-
-PhysicalSeedRDSeed:
-        push    rbx
-        test    par2d, par2d           ; NumSeeds
-        jz      S300 
-        js      S900
-        mov     par3d, par2d           ; NumSeeds
-        shr     par3d, 1
-        jz      S150
-        ; do 64 bits at a time
-S100:   mov     ebx, NUM_TRIES
-S110:   ; rdseed rax
-%if     TESTING
-        mov     eax, par3d
-        stc
-%ELSE
-        db 48h, 0Fh, 0C7h, 0F8h        ; rdseed rax
-%ENDIF
-        jc      S120
-        ; failed. try again
-        dec     ebx
-        jz      S900
-        jmp     S110
-S120:   mov     [par1], rax
-        add     par1, 8
-        dec     par3d
-        jnz     S100                   ; loop 64 bits
-S150:
-        and     par2d, 1
-        jz      S300
-        ; an odd 32 bit remains
-S200:   mov     ebx, NUM_TRIES
-S210:   ; rdseed rax
-%if     TESTING
-        mov     eax, par3d
-        stc
-%ELSE
-        db 0Fh, 0C7h, 0F8h             ; rdseed eax
-%ENDIF
-        jc      S220
-        ; failed. try again
-        dec     ebx
-        jz      S900
-        jmp     S210
-S220:   mov     [par1], eax
-S300:   mov     eax, 4                 ; return value
-        pop     rbx
-        ret
-S900:   ; failure 
-        xor     eax, eax               ; return 0
-        pop     rbx
-        ret
-                
-
-PhysicalSeedRDRand:
-        push    rbx
-        test    par2d, par2d           ; NumSeeds
-        jz      R300
-        js      R900         
-        mov     par3d, par2d           ; NumSeeds
-        shr     par3d, 1               ; NumSeeds/2
-        jz      R150
-        ; do 64 bits at a time
-R100:   mov     ebx, NUM_TRIES
-R110:   ; rdrand rax
-%if     TESTING
-        mov     eax, par3d
-        stc
-%ELSE
-        db 48h, 0Fh, 0C7h, 0F0h        ; rdrand rax
-%ENDIF
-        jc      R120
-        ; failed. try again
-        dec     ebx
-        jz      R900
-        jmp     R110
-R120:   mov     [par1], rax
-        add     par1, 8
-        dec     par3d
-        jnz     R100                   ; loop 64 bits
-R150:
-        and     par2d, 1
-        jz      R300
-        ; an odd 32 bit remains
-R200:   mov     ebx, NUM_TRIES
-R210:   ; rdrand eax
-%if     TESTING
-        mov     eax, par3d
-        stc
-%ELSE
-        db 0Fh, 0C7h, 0F0h             ; rdrand eax
-%ENDIF
-        jc      R220
-        ; failed. try again
-        dec     ebx
-        jz      R900
-        jmp     R210
-R220:   mov     [par1], eax
-R300:   mov     eax, 4                 ; return value
-        pop     rbx
-        ret
-R900:   ; failure 
-        xor     eax, eax               ; return 0
-        pop     rbx
-        ret
-
-
-PhysicalSeedVIA:
-;       VIA XSTORE  supported
-        push    rbx
-%IFDEF WINDOWS
-        push    rsi
-        push    rdi
-        mov     rdi, rcx               ; seeds
-        mov     esi, edx               ; NumSeeds
-%ENDIF        
-        mov     ecx, esi               ; NumSeeds
-        and     ecx, -2                ; round down to nearest even
-        jz      T200                   ; NumSeeds <= 1
-        ; make an even number of random dwords
-        shl     ecx, 2                 ; number of bytes (divisible by 8)
-        mov     edx, 3                 ; quality factor
-%if     TESTING
-        mov     eax, 1
-        rep stosb
-%ELSE        
-        db 0F3H, 00FH, 0A7H, 0C0H      ; rep xstore instuction
-%ENDIF
-T200:        
-        test    esi, 1
-        jz      T300
-        ; NumSeeds is odd. Make 8 bytes in temporary buffer and store 4 of the bytes
-        mov     rbx, rdi               ; current output pointer
-        mov     ecx, 4                 ; Will generate 4 or 8 bytes, depending on CPU
-        mov     edx, 3                 ; quality factor
-        push    rcx                    ; make temporary space on stack
-        mov     rdi, rsp               ; point to buffer on stack
-%if     TESTING
-        mov     eax, 1
-        rep stosb
-%ELSE        
-        db 0F3H, 00FH, 0A7H, 0C0H      ; rep xstore instuction
-%ENDIF
-        pop     rax
-        mov     [rbx], eax             ; store the last 4 bytes
-T300:
-        mov     eax, 2                 ; return value        
-%IFDEF WINDOWS
-        pop     rdi
-        pop     rsi
-%ENDIF  
-        pop     rbx      
-        ret        
-
-
-PhysicalSeedRDTSC:
-%IFDEF WINDOWS
-        push    rbx
-        push    rcx
-        push    rdx
-        xor     eax, eax
-        cpuid                          ; serialize
-        rdtsc                          ; get time stamp counter
-        pop     rbx                    ; numseeds
-        pop     rcx                    ; seeds
-        test    ebx, ebx
-        jz      U300                   ; zero seeds
-        js      U900                   ; failure
-        mov     [rcx], eax             ; store time stamp counter as seeds[0]
-        add     rcx, 4
-        dec     ebx
-        jz      U300
-        mov     [rcx], edx             ; store upper part of time stamp counter as seeds[1]
-        add     rcx, 4
-        dec     ebx
-        jz      U300
-        xor     eax, eax
-U100:   mov     [rcx], eax             ; store 0 for the rest
-        add     rcx, 4
-        dec     ebx
-        jnz     U100
-U300:   mov     eax, 1                 ; return value        
-        pop     rbx
-        ret
-U900:   ; failure         
-        xor     eax, eax               ; return 0
-        pop     rbx
-        ret
-        
-%ELSE   ; UNIX
-
-        push    rbx
-        xor     eax, eax
-        cpuid                          ; serialize
-        rdtsc                          ; get time stamp counter
-        test    esi, esi               ; numseeds
-        jz      U300                   ; zero seeds
-        js      U900                   ; failure
-        mov     [rdi], eax             ; store time stamp counter as seeds[0]
-        add     rdi, 4
-        dec     esi
-        jz      U300
-        mov     [rdi], edx             ; store upper part of time stamp counter as seeds[1]
-        add     rdi, 4
-        dec     esi
-        jz      U300
-        xor     eax, eax
-U100:   mov     [rdi], eax             ; store 0 for the rest
-        add     rdi, 4
-        dec     esi
-        jnz     U100
-U300:   mov     eax, 1                 ; return value        
-        pop     rbx
-        ret
-U900:   ; failure         
-        xor     eax, eax               ; return 0
-        pop     rbx
-        ret 
-
-%ENDIF  
-
-
-PhysicalSeedNone:                      ; no possible generation
-        xor     eax, eax
-        test    par2d, par2d           ; numseeds
-        jz      N200
-N100:   mov     [par1], eax
-        add     par1, 4
-        dec     par2d
-        jnz     N100
-N200:   ret                            ; return 0
-
-
-PhysicalSeedDispatcher:
-        push    rbx
-%IFDEF WINDOWS
-        push    rcx
-        push    rdx
-%ENDIF
-        ; test if RDSEED supported
-        xor     eax, eax
-        cpuid
-        cmp     eax, 7
-        jb      P200                   ; RDSEED not supported
-        mov     eax, 7
-        xor     ecx, ecx
-        cpuid
-        bt      ebx, 18
-       ; jc      USE_RDSEED             ; not tested yet!!
-
-P200:   ; test if RDRAND supported
-        mov     eax, 1
-        cpuid
-        bt      ecx, 30
-        jc      USE_RDRAND
-
-        ; test if VIA xstore instruction supported
-        mov     eax, 0C0000000H
-        push    rax
-        cpuid
-        pop     rbx
-        cmp     eax, ebx
-        jna     P300                   ; not a VIA processor
-        lea     eax, [rbx+1]
-        cpuid
-        bt      edx, 3
-        jc      VIA_METHOD
-
-P300:   ; test if RDTSC supported
-        mov     eax, 1
-        cpuid
-        bt      edx, 4
-        jc      USE_RDTSC              ; XSTORE instruction not supported or not enabled
-        
-FAILURE: ; No useful instruction supported
-        lea     rax, [PhysicalSeedNone]
-        jmp     P800
-
-USE_RDRAND:     ; Use RDRAND instruction        
-        lea     rax, [PhysicalSeedRDRand]
-        jmp     P800
-
-USE_RDSEED:     ; Use RDSEED instruction (not tested yet)
-        lea     rax, [PhysicalSeedRDSeed]
-        jmp     P800
-
-VIA_METHOD:     ; Use VIA xstore instructions   
-        lea     rax, [PhysicalSeedVIA]
-        jmp     P800
-        
-USE_RDTSC:
-        lea     rax, [PhysicalSeedRDTSC]
-        ;jmp     P800
-        
-P800:   mov     [PhysicalSeedDispatch], rax
-%IFDEF WINDOWS
-        pop     rdx
-        pop     rcx
-%ENDIF
-        pop     rbx
-        jmp     rax                    ; continue in dispatched version
-        
-
-; -----------------------------------------------------------------
-;  Data section for dispatcher
-; -----------------------------------------------------------------
-
-SECTION .data
-
-; Pointer to appropriate versions. Initially point to dispatcher
-PhysicalSeedDispatch  DQ PhysicalSeedDispatcher
-
-%IFDEF POSITIONINDEPENDENT
-; Fix potential problem in Mac linker
-        DD      0, 0
-%ENDIF
diff --git a/contrib/libs/asmlib/popcount64.asm b/contrib/libs/asmlib/popcount64.asm
deleted file mode 100644
index c4ad64e03b..0000000000
--- a/contrib/libs/asmlib/popcount64.asm
+++ /dev/null
@@ -1,112 +0,0 @@
-%include "defs.asm"
-
-;*************************  popcount64.asm  ************************************
-; Author:           Agner Fog
-; Date created:     2011-07-20
-; Last modified:    2011-07-20
-
-; Description:
-; Population count function. Counts the number of 1-bits in a 32-bit integer
-; unsigned int A_popcount (unsigned int x);
-;
-; Position-independent code is generated if POSITIONINDEPENDENT is defined.
-;
-; CPU dispatching included for 386 and SSE4.2 instruction sets.
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-default rel
-
-global A_popcount: function
-
-; Direct entries to CPU-specific versions
-global popcountGeneric: function
-global popcountSSE42: function
-
-; Imported from instrset64.asm:
-extern InstructionSet                 ; Instruction set for CPU dispatcher
-
-section .text
-
-;******************************************************************************
-;                               popcount function
-;******************************************************************************
-
-
-A_popcount: ; Public entry point: tail-jumps through the popcountDispatch pointer
-        jmp     near [popcountDispatch] ; Initially popcountCPUDispatch; afterwards the version it selected
-
-align 16
-popcountSSE42: ; SSE4.2 version: a single popcnt instruction, result returned in eax
-%ifdef  WINDOWS
-        popcnt  eax, ecx               ; x is taken from ecx when WINDOWS is defined
-%else
-        popcnt  eax, edi               ; otherwise x is taken from edi
-%endif        
-        ret
-
-
-;******************************************************************************
-;                               popcount function generic
-;******************************************************************************
-
-popcountGeneric: ; Generic version: sums bits in parallel fields of width 2, 4, 8, 16, 32
-%ifdef  WINDOWS
-        mov     eax, ecx               ; x is taken from ecx when WINDOWS is defined
-%else
-        mov     eax, edi               ; otherwise x is taken from edi
-%endif        
-        mov     edx, eax
-        shr     eax, 1
-        and     eax, 55555555h         ; odd bits in eax, even bits in edx
-        and     edx, 55555555h
-        add     eax, edx               ; every 2-bit field now holds the count of its two bits (0-2)
-        mov     edx, eax
-        shr     eax, 2
-        and     eax, 33333333h         ; upper 2-bit field of each nibble, moved down
-        and     edx, 33333333h         ; lower 2-bit field of each nibble
-        add     eax, edx               ; every nibble now holds a count 0-4
-        mov     edx, eax
-        shr     eax, 4
-        add     eax, edx               ; add neighbouring nibbles (sum <= 8, fits in a nibble)
-        and     eax, 0F0F0F0Fh         ; every byte now holds a count 0-8
-        mov     edx, eax
-        shr     eax, 8
-        add     eax, edx               ; low byte = count of bytes 0+1
-        mov     edx, eax
-        shr     eax, 16
-        add     eax, edx               ; low byte = count of all four bytes; upper bits hold partial sums
-        and     eax, 03FH              ; keep the low 6 bits: the total is at most 32
-        ret
-;popcountGeneric end
-
-; ********************************************************************************
-; CPU dispatching for popcount. This is executed only once
-; ********************************************************************************
-
-%ifdef  WINDOWS
-%define par1      rcx                  ; parameter 1, the integer x to count bits in
-%else
-%define par1      rdi                  ; parameter 1, the integer x to count bits in
-%endif
-
-popcountCPUDispatch:
-        ; get supported instruction set
-        push    par1                   ; keep x alive across the call
-        call    InstructionSet
-        pop     par1
-        ; Point to generic version of popcount
-        lea     rdx, [popcountGeneric]
-        cmp     eax, 9                ; check popcnt supported
-        jb      Q100
-        ; SSE4.2 supported
-        ; Point to SSE4.2 version of popcount
-        lea     rdx, [popcountSSE42]
-Q100:   mov     [popcountDispatch], rdx ; later calls to A_popcount bypass this dispatcher
-        ; Continue in appropriate version 
-        jmp     rdx
-
-SECTION .data
-
-; Pointer to appropriate versions. Initially point to dispatcher
-popcountDispatch  DQ popcountCPUDispatch
diff --git a/contrib/libs/asmlib/procname64.asm b/contrib/libs/asmlib/procname64.asm
deleted file mode 100644
index 1b77b74320..0000000000
--- a/contrib/libs/asmlib/procname64.asm
+++ /dev/null
@@ -1,145 +0,0 @@
-%include "defs.asm"
-
-;                   procname64.asm 
-;
-; Author:           Agner Fog
-; Date created:     2007
-; Last modified:    2011-07-02
-; Description:
-; ProcessorName
-; =============
-; This function produces a zero-terminated ASCII string containing a name
-; for the microprocessor in human-readable format.
-; 
-; Copyright (c) 2007-2011 GNU General Public License www.gnu.org/licenses
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-global  ProcessorName: function
-
-SECTION .data
-align 16
-
-NameBuffer times 50H db 0              ; Static buffer to contain name
-
-
-SECTION .text  align=16
-
-; ********** ProcessorName function **********
-; C++ prototype:
-; extern "C" char * ProcessorName (void);
-
-; This function finds the name of the microprocessor. The name is written to the
-; static NameBuffer (50H bytes) and a pointer to that buffer is returned in rax.
-
-ProcessorName:
-        push    rbx                    ; ebx is overwritten by cpuid
-        push    rdi                    ; rdi is used as the write pointer and restored at PNEND
-        lea     rdi, [NameBuffer]      ; text pointer
-        
-        mov     eax, 80000000H
-        cpuid
-        cmp     eax, 80000004H         ; test if extended vendor string available
-        jb      no_ext_vendor_string
-
-        ; Has extended vendor string
-        mov     eax, 80000002H
-        cpuid
-        mov     [rdi], eax             ; store 16 bytes of extended vendor string
-        mov     [rdi+4], ebx
-        mov     [rdi+8], ecx
-        mov     [rdi+0CH], edx
-        mov     eax, 80000003H
-        cpuid
-        mov     [rdi+10H], eax         ; next 16 bytes
-        mov     [rdi+14H], ebx
-        mov     [rdi+18H], ecx
-        mov     [rdi+1CH], edx
-        mov     eax, 80000004H
-        cpuid
-        mov     [rdi+20H], eax         ; next 16 bytes
-        mov     [rdi+24H], ebx
-        mov     [rdi+28H], ecx
-        mov     [rdi+2CH], edx
-        jmp     get_family_and_model
-        
-no_ext_vendor_string:
-        ; No extended vendor string. Get short vendor string
-        xor     eax, eax
-        cpuid
-        mov     [rdi],ebx              ; store short vendor string
-        mov     [rdi+4],edx
-        mov     [rdi+8],ecx
-        mov     byte [rdi+12],0    ; terminate string
-        
-get_family_and_model:
-        xor     eax, eax               ; al = 0: the byte scasb searches for
-        mov     ecx, 30H               ; scan at most 30H bytes from the start of NameBuffer
-        cld
-        repne   scasb                  ; find end of text
-        dec     rdi                    ; scasb stops one past the byte it compared; step back onto it
-        
-        mov     dword [rdi], ' Fam'   ; Append text " Family "
-        mov     dword [rdi+4], 'ily '
-        add     rdi, 8
-
-        mov     eax, 1
-        cpuid                          ; Get family and model
-        mov     ebx, eax               ; keep the cpuid(1) eax value for the model fields below
-        mov     ecx, eax
-        shr     eax, 8
-        and     eax, 0FH               ; Family
-        shr     ecx, 20
-        and     ecx, 0FFH              ; Extended family
-        add     eax, ecx               ; Family + extended family
-        call    WriteHex               ; Write as hexadecimal
-
-        mov     dword [rdi], 'H Mo' ; Write text "H Model "
-        mov     dword [rdi+4], 'del '
-        add     rdi, 8
-        
-        mov     eax, ebx
-        shr     eax, 4
-        and     eax, 0FH               ; Model
-        mov     ecx, ebx
-        shr     ecx, 12
-        and     ecx, 0F0H              ; Extended model
-        or      eax, ecx               ; Model | extended model
-        call    WriteHex               ; Write as hexadecimal
-
-        mov     dword [rdi], 'H'       ; Write text "H"; the dword store also writes three zero bytes that end the string
-        
-PNEND:  ; finished
-        lea     rax, [NameBuffer]      ; Pointer to result
-        pop     rdi
-        pop     rbx
-        ret
-;ProcessorName ENDP
-
-WriteHex:                              ; Local function: Write 2 hexadecimal digits
-        ; Parameters: AL = number to write (higher bits of EAX are ignored), RDI = text destination; ECX is overwritten
-        mov     ecx, eax
-        shr     ecx, 4
-        and     ecx, 0FH               ; most significant digit first
-        cmp     ecx, 10
-        jnb     W1
-        ; 0 - 9
-        add     ecx, '0'
-        jmp     W2
-W1:     ; A - F
-        add     ecx, 'A' - 10
-W2:     mov     [rdi], cl              ; write digit
-                
-        mov     ecx, eax
-        and     ecx, 0FH               ; next digit
-        cmp     ecx, 10
-        jnb     W3
-        ; 0 - 9
-        add     ecx, '0'
-        jmp     W4
-W3:     ; A - F
-        add     ecx, 'A' - 10
-W4:     mov     [rdi+1], cl            ; write digit
-        add     rdi, 2                 ; advance string pointer
-        ret
diff --git a/contrib/libs/asmlib/randomah.asi b/contrib/libs/asmlib/randomah.asi
deleted file mode 100644
index ed7a0185a4..0000000000
--- a/contrib/libs/asmlib/randomah.asi
+++ /dev/null
@@ -1,290 +0,0 @@
-; ----------------------------- RANDOMAH.ASI ---------------------------
-;
-;  Author:           Agner Fog
-;  Date created:     1998
-;  Last modified:    2013-09-09
-;  Description:
-;  Assembly include file containing
-;  structure/class definitions for random number generators
-;
-; Copyright (c) 1998-2013 GNU General Public License www.gnu.org/licenses
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Definitions for Mersenne Twister:
-
-TEMPERING EQU 1              ; set to 0 if no tempering (improves speed by 25%)
-
-%if 0
-; define constants for MT11213A:
-MERS_N    EQU 351
-MERS_M    EQU 175
-MERS_R    EQU 19
-MERS_A    EQU 0E4BD75F5H
-MERS_U    EQU 11
-MERS_S    EQU 7
-MERS_T    EQU 15
-MERS_L    EQU 17
-MERS_B    EQU 655E5280H
-MERS_C    EQU 0FFD58000H
-
-%ELSE
-; or constants for MT19937:
-MERS_N    EQU 624
-MERS_M    EQU 397
-MERS_R    EQU 31
-MERS_A    EQU 09908B0DFH
-MERS_U    EQU 11
-MERS_S    EQU 7
-MERS_T    EQU 15
-MERS_L    EQU 18
-MERS_B    EQU 9D2C5680H
-MERS_C    EQU 0EFC60000H
-
-%ENDIF
-
-LOWER_MASK EQU (1 << MERS_R) - 1             ; lower MERS_R bits
-UPPER_MASK EQU -1 << MERS_R                  ; upper 32-MERS_R bits
-
-; Define class CRandomMersenneA member data
-; Must be aligned by 16.
-
-STRUC CRandomMersenneA
-.Fill1    RESD      4        ; Alignment filler
-.PreInt:  RESD      4        ; premade tempered integer numbers, ready to use
-.PreFlt:  RESQ      4        ; premade floating point numbers, ready to use (subtract 1.0)
-          RESQ      1        ; last PreFlt unaligned overrun if MERS_N mod 4 = 1
-.TmpFlt:  RESQ      1        ; temporary storage of floating point random number
-.PreInx:  RESD      1        ; index to next PreInt and PreFlt number
-.Instset: RESD      1        ; Instruction set
-.LastInterval: RESD 1        ; Last interval length for IRandomX
-.RLimit:  RESD      1        ; Rejection limit used by IRandomX
-.TMB:     RESD      4        ; 4 copies of MERS_B constant
-.TMC:     RESD      4        ; 4 copies of MERS_C constant
-.one:     RESQ      2        ; 2 copies of 1.0 constant
-.MTI:     RESD      1        ; index into MT buffer
-.UMASK:   RESD      1        ; UPPER_MASK
-.LMASK:   RESD      1        ; LOWER_MASK             ; constants
-.MATA:    RESD      1        ; MERS_A
-.wrap1:   RESD      4        ; MT buffer km wraparound
-.MT:      RESD      MERS_N   ; MT history buffer (aligned by 16)
-.wrap2:   RESD      4        ; MT buffer kk wraparound
-%if MERS_N & 3
-         ; MERS_N not divisible by 4. align by 4
-          RESD      (4 - (MERS_N & 3))
-%ENDIF        
-endstruc ; CRandomMersenneA
-
-
-; Definitions for Mother-of-all generator:
-
-; Define class CRandomMotherA member data
-; Must be aligned by 16. Preferably aligned by 64 to fit a cache line
-STRUC   CRandomMotherA 
-.Fill2   RESD      4         ; Alignment filler
-.one     RESQ      1         ; 1.0
-.Instset RESD      1         ; Instruction set
-.M4      RESD      1         ; x[n-4]
-.M3      RESD      1         ; x[n-3] (aligned)
-.M2      RESD      1         ; x[n-2]
-.M1      RESD      1         ; x[n-1]
-.M0      RESD      1         ; x[n]
-.MC      RESD      1         ; Carry (aligned)
-.zero    RESD      1         ; Zero-extension of carry
-.RanP1   RESQ      1         ; Double random number in interval [1,2)
-.MF3     RESD      1         ; 2111111111 (aligned)
-.MF2     RESD      1         ; 1492
-.MF1     RESD      1         ; 1776
-.MF0     RESD      1         ; 5115
-endstruc ; CRandomMotherA
-
-MOTHERF0 EQU 5115            ; factor 0
-MOTHERF1 EQU 1776            ; factor 1
-MOTHERF2 EQU 1492            ; factor 2
-MOTHERF3 EQU 2111111111      ; factor 3
-
-
-; ***************************************************************************
-; Definitions for SFMT generator
-; ***************************************************************************
-
-; Choose Mersenne exponent.
-; Higher values give longer cycle length and use more memory:
-; MEXP equ    607
-; MEXP equ   1279
-; MEXP equ   2281
-; MEXP equ   4253
-  MEXP equ  11213
-; MEXP equ  19937
-; MEXP equ  44497
-
-%if MEXP == 44497
-SFMT_N      equ  348         ; Size of state vector
-SFMT_M      equ  330         ; Position of intermediate feedback
-SFMT_SL1    equ    5         ; Left shift of W[N-1], 32-bit words
-SFMT_SL2    equ    3         ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1    equ    9         ; Right shift of W[M], 32-bit words
-SFMT_SR2    equ	   3         ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1  equ  0effffffbH  ;first DWORD of AND mask
-; AND mask:
-%define SFMT_MASK   0effffffbH,0dfbebfffH,0bfbf7befH,09ffd7bffH
-; Period certification vector for MEXP 44497 (expanded by "InitParity DD SFMT_PARITY" in the SFMT sources)
-%define SFMT_PARITY 1,0,0a3ac4000H,0ecc1327aH
-
-%elif MEXP == 19937
-SFMT_N      equ  156         ; Size of state vector
-SFMT_M      equ  122         ; Position of intermediate feedback
-SFMT_SL1    equ   18         ; Left shift of W[N-1], 32-bit words
-SFMT_SL2    equ    1         ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1    equ   11         ; Right shift of W[M], 32-bit words
-SFMT_SR2    equ	   1         ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1  equ  0dfffffefH  ;first DWORD of AND mask
-%define SFMT_MASK   0dfffffefH,0ddfecb7fH,0bffaffffH,0bffffff6H
-%define SFMT_PARITY 1,0,0,013c9e684H
-
-%elif MEXP == 11213
-SFMT_N      equ  88          ; Size of state vector
-SFMT_M      equ  68          ; Position of intermediate feedback
-SFMT_SL1	equ  14          ; Left shift of W[N-1], 32-bit words
-SFMT_SL2	equ   3          ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1	equ   7          ; Right shift of W[M], 32-bit words
-SFMT_SR2	equ   3          ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1  equ  0effff7fbH  ;first DWORD of AND mask
-%define SFMT_MASK	0effff7fbH,0ffffffefH,0dfdfbfffH,07fffdbfdH
-%define SFMT_PARITY 1,0,0e8148000H,0d0c7afa3H
-
-%elif MEXP == 4253
-SFMT_N      equ  34          ; Size of state vector
-SFMT_M      equ  17          ; Position of intermediate feedback
-SFMT_SL1	equ  20          ; Left shift of W[N-1], 32-bit words
-SFMT_SL2	equ  1           ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1	equ  7           ; Right shift of W[M], 32-bit words
-SFMT_SR2	equ  1           ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1  equ  09f7bffffH  ;first DWORD of AND mask
-%define SFMT_MASK	09f7bffffH,09fffff5fH,03efffffbH,0fffff7bbH
-%define SFMT_PARITY 0a8000001H,0af5390a3H,0b740b3f8H,06c11486dH
-
-%elif MEXP == 2281
-SFMT_N      equ  18          ; Size of state vector
-SFMT_M      equ  12          ; Position of intermediate feedback
-SFMT_SL1	equ  19          ; Left shift of W[N-1], 32-bit words
-SFMT_SL2	equ   1          ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1	equ   5          ; Right shift of W[M], 32-bit words
-SFMT_SR2	equ   1          ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1  equ   0bff7ffbfH ;first DWORD of AND mask
-%define SFMT_MASK	0bff7ffbfH,0fdfffffeH,0f7ffef7fH,0f2f7cbbfH
-%define SFMT_PARITY 1,0,0,041dfa600H
-
-%elif MEXP == 1279
-SFMT_N      equ  10          ; Size of state vector
-SFMT_M      equ   7          ; Position of intermediate feedback
-SFMT_SL1	equ  14          ; Left shift of W[N-1], 32-bit words
-SFMT_SL2	equ   3          ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1	equ   5          ; Right shift of W[M], 32-bit words
-SFMT_SR2	equ   1          ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1  equ   0f7fefffdH ;first DWORD of AND mask
-%define SFMT_MASK	0f7fefffdH,07fefcfffH,0aff3ef3fH,0b5ffff7fH
-%define SFMT_PARITY 1,0,0,020000000H
-
-%elif MEXP == 607
-SFMT_N      equ   5          ; Size of state vector
-SFMT_M      equ   2          ; Position of intermediate feedback
-SFMT_SL1	equ  15          ; Left shift of W[N-1], 32-bit words
-SFMT_SL2	equ   3          ; Left shift of W[0], *8, 128-bit words
-SFMT_SR1	equ  13          ; Right shift of W[M], 32-bit words
-SFMT_SR2	equ   3          ; Right shift of W[N-2], *8, 128-bit words
-SFMT_MASK1  equ   0fdff37ffH ;first DWORD of AND mask
-%define SFMT_MASK	0fdff37ffH,0ef7f3f7dH,0ff777b7dH,07ff7fb2fH
-%define SFMT_PARITY 1,0,0,05986f054H
-
-%ELSE
-%error MEXP must have one of the predefined values
-%ENDIF
-
-STRUC CRandomSFMTA 
-.Fill3         RESD     4    ; Alignment filler
-
-; Parameters for Mother-Of-All generator:
-.M3:           RESD     1    ; x[n-3] (aligned)
-               RESD     1    ; unused filler to fit the pmuludq instruction
-.M2:           RESD     1    ; x[n-2]
-               RESD     1    ; unused filler to fit the pmuludq instruction
-.M1:           RESD     1    ; x[n-1]
-               RESD     1    ; unused filler to fit the pmuludq instruction
-.M0:           RESD     1    ; x[n]
-.MC:           RESD     1    ; Carry (zero-extends into one)
-.one:          RESQ     1    ; 1.0 (low dword = zero-extension of carry) (aligned)
-.TempRan:      RESQ     1    ; Temporary random number
-.MF3:          RESD     1    ; 2111111111 (aligned)
-.Instset:      RESD     1    ; Instruction set
-.MF2:          RESD     1    ; 1492 (MF3,MF2,MF1,MF0 interleaved with other variables to fit the pmuludq instruction)
-               RESD     1    ; Filler (may be used for read-only parameter, but not for read/write parameter)
-.MF1:          RESD     1    ; 1776
-               RESD     1    ; Filler (may be used for read-only parameter, but not for read/write parameter)
-.MF0:          RESD     1    ; 5115
-               RESD     1    ; Filler (may be used for read-only parameter, but not for read/write parameter)
-
-; Parameters for IRandomX:
-.LASTINTERVAL: RESD     1    ; Last interval length for IRandomX
-.RLIMIT:       RESD     1    ; Rejection limit used by IRandomX
-
-; Parameters for SFMT generator:
-.USEMOTHER:    RESD     1    ; 1 if combine with Mother-Of-All generator
-.IX:           RESD     1    ; Index into state buffer for SFMT
-
-.AMASK:        RESD     4    ; AND mask (aligned)
-.STATE:        RESD SFMT_N*4 ; State vector (aligned)
-endstruc ; CRandomSFMTA 
-
-
-; Load offset of TARGET into ecx. Use position-independent method if necessary
-%macro LOADOFFSET2ECX 1
-%IFNDEF  POSITIONINDEPENDENT
-        mov     ecx, %1
-%ELSE
-        ; get position-independent address of TARGET
-        call    get_thunk_ecx
-        add ecx, %1 - $
-%ENDIF
-%endmacro
-
-; Load offset of TARGET into edi. Use position-independent method if necessary
-%macro LOADOFFSET2EDI 1
-%IFNDEF  POSITIONINDEPENDENT
-        mov     edi, %1
-%ELSE
-        ; get position-independent address of TARGET
-        call    get_thunk_edi
-        add edi, %1 - $
-%ENDIF
-%endmacro
-
-
-; ***************************************************************************
-; Define registers used for function parameters, used in 64-bit mode only
-; ***************************************************************************
- 
-%IFDEF WINDOWS
-  %define par1     rcx
-  %define par2     rdx
-  %define par3     r8
-  %define par4     r9
-  %define par5     qword [rsp+32+8]   ; stack offset including shadow space
-  %define par1d    ecx
-  %define par2d    edx
-  %define par3d    r8d
-  %define par4d    r9d
-  %define par5d    dword [rsp+32+8]
-%ENDIF
-  
-%IFDEF UNIX
-  %define par1     rdi
-  %define par2     rsi
-  %define par3     rdx
-  %define par4     rcx
-  %define par5     r8
-  %define par1d    edi
-  %define par2d    esi
-  %define par3d    edx
-  %define par4d    ecx
-  %define par5d    r8d
-%ENDIF 
diff --git a/contrib/libs/asmlib/rdtsc64.asm b/contrib/libs/asmlib/rdtsc64.asm
deleted file mode 100644
index 42a0e23203..0000000000
--- a/contrib/libs/asmlib/rdtsc64.asm
+++ /dev/null
@@ -1,53 +0,0 @@
-%include "defs.asm"
-
-;          RDTSC64.ASM
-;
-; Author:           Agner Fog
-; Date created:     2003
-; Last modified:    2008-10-16
-; Description:
-;
-; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-global  ReadTSC: function
-
-SECTION .text  align=16
-
-; ********** ReadTSC function **********
-; C++ prototype:
-; extern "C" __int64 ReadTSC (void);
-
-; This function returns the value of the time stamp counter, which counts 
-; clock cycles. To count how many clock cycles a piece of code takes, call
-; ReadTSC before and after the code to measure and calculate the difference.
-
-; The number of clock cycles taken by the ReadTSC function itself is approximately:
-; Core 2:   730
-; Pentium 4:  700
-; Pentium II and Pentium III: 225
-; AMD Athlon 64, Opteron: 126
-; Does not work on 80386 and 80486.
-
-; Note that clock counts may not be fully reproducible on Intel Core and
-; Core 2 processors because the clock frequency can change. More reliable
-; instruction timings are obtained with the performance monitor counter
-; for "core clock cycles". This requires a kernel mode driver as the one
-; included with www.agner.org/optimize/testp.zip.
-
-ReadTSC:
-        push    rbx                    ; ebx is modified by cpuid
-        sub     eax, eax               ; 0
-        cpuid                          ; serialize
-        rdtsc                          ; read time stamp counter into edx:eax
-        shl     rdx, 32                ; move the high half into bits 32-63
-        or      rax, rdx               ; combine into 64 bit register        
-        push    rax                    ; save the result: the next cpuid overwrites eax and edx
-        sub     eax, eax               ; 0
-        cpuid                          ; serialize
-        pop     rax                    ; return value
-        pop     rbx
-        ret
-;ReadTSC ENDP
diff --git a/contrib/libs/asmlib/round64.asm b/contrib/libs/asmlib/round64.asm
deleted file mode 100644
index 5ed55c53c6..0000000000
--- a/contrib/libs/asmlib/round64.asm
+++ /dev/null
@@ -1,40 +0,0 @@
-%include "defs.asm"
-
-;          ROUND64.ASM 
-
-; Author:           Agner Fog
-; Date created:     2007-06-15
-; Last modified:    2008-10-16
-; Description:
-; Round function
-
-; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-default rel
-
-global RoundD: function
-global RoundF: function
-
-
-SECTION .text  align=16
-
-; ********** round function **********
-; C++ prototype:
-; extern "C" int RoundD (double x);
-; extern "C" int RoundF (float  x);
-
-; This function converts a single or double precision floating point number 
-; to an integer, rounding to nearest or even. Does not check for overflow.
-; This function is much faster than the default conversion method in C++
-; which uses truncation.
-
-RoundD:
-        cvtsd2si eax, xmm0             ; Convert the double in xmm0 to a 32-bit integer in eax, rounding per the current MXCSR mode
-        ret
-;RoundD  ENDP
-
-RoundF:
-        cvtss2si eax, xmm0             ; Convert the float in xmm0 to a 32-bit integer in eax, rounding per the current MXCSR mode
-        ret
-;RoundF ENDP
diff --git a/contrib/libs/asmlib/sfmt64.asm b/contrib/libs/asmlib/sfmt64.asm
deleted file mode 100644
index 3ca3cedca0..0000000000
--- a/contrib/libs/asmlib/sfmt64.asm
+++ /dev/null
@@ -1,889 +0,0 @@
-%include "defs.asm"
-
-; ----------------------------- SFMT64.ASM ---------------------------
-; Author:        Agner Fog
-; Date created:  2008-11-01
-; Last modified: 2013-09-13
-; Project:       randoma library of random number generators
-; Source URL:    www.agner.org/random
-; Description:
-; Random number generator of type SIMD-oriented Fast Mersenne Twister (SFMT)
-; (Mutsuo Saito and Makoto Matsumoto: "SIMD-oriented Fast Mersenne Twister:
-; a 128-bit Pseudorandom Number Generator", Monte Carlo and Quasi-Monte 
-; Carlo Methods 2006, Springer, 2008, pp. 607-622).
-;
-; 64-bit mode version for x86-64 compatible microprocessors.
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-; ----------------------------------------------------------------------
-
-default rel
-
-global SFMTRandomInit, SFMTRandomInitByArray, SFMTBRandom, SFMTRandom
-global SFMTRandomL, SFMTIRandom, SFMTIRandomX, SFMTgenRandomInit
-global SFMTgenRandomInitByArray, SFMTgenRandom, SFMTgenRandomL, SFMTgenIRandom
-global SFMTgenIRandomX, SFMTgenBRandom
-
-extern InstructionSet
-
-; structure definition and constants:
-%INCLUDE "randomah.asi"
-
-
-section .data
-align 16
-; Data for single instance of random number generator
-SFMTInstance: ISTRUC CRandomSFMTA
-; Size of structure
-IEND
-SFMTSize equ $-SFMTInstance
-
-
-align 16
-; Initialization constants for Mother-Of-All:
-InitMother DD 2111111111, 0, 1492, 0, 1776, 0, 5115, 0
-; Initialization Mask for SFMT:
-InitMask   DD SFMT_MASK
-; Period certification vector for SFMT:
-InitParity DD SFMT_PARITY
-
-
-SECTION .CODE align=16   ; code segment
-
-
-; ---------------------------------------------------------------
-;  Thread-safe static link versions for SFMT
-; ---------------------------------------------------------------
-
-; extern "C" void SFMTRandomInit(void * Pthis, int ThisSize, int seed, int IncludeMother = 0);
-; Parameters:
-; par1  = Pthis
-; par2d = ThisSize
-; par3d = seed
-; par4d = IncludeMother
-
-SFMTRandomInit:
-        cmp     par2d, SFMTSize
-        jb      Error                                      ; Error exit if buffer too small
-        push    rbx                                        ; rbx serves as the loop counter i below
-
-        ; Align by 16. Will overlap part of Fill if Pthis unaligned        
-        and     par1, -16
-
-        xor     eax, eax
-        test    par4d, par4d                               ; IncludeMother
-        setnz   al                                         ; convert any nonzero value to 1
-        ; Store USEMOTHER
-        mov     [par1+CRandomSFMTA.USEMOTHER], eax
-        
-        mov     eax, par3d                                 ; seed
-        xor     ebx, ebx                                   ; loop counter i
-        jmp     L002                                       ; go into seeding loop
-
-L001:   ; seeding loop for SFMT
-        ; y = factor * (y ^ (y >> 30)) + (++i);
-        call    InitSubf0                                  ; randomization subfunction
-L002:   mov     [par1+rbx*4+CRandomSFMTA.STATE],eax        ; initialize state
-        cmp     ebx, SFMT_N*4 - 1                          ; last index of STATE
-        jb      L001                                       ; repeat until STATE[SFMT_N*4-1] has been written
-
-        ; Put 5 more values into Mother-Of-All generator
-        call    InitSubf0
-        mov     [par1+CRandomSFMTA.M0], eax
-        call    InitSubf0
-        mov     [par1+CRandomSFMTA.M1], eax
-        call    InitSubf0
-        mov     [par1+CRandomSFMTA.M2], eax
-        call    InitSubf0
-        mov     [par1+CRandomSFMTA.M3], eax
-        call    InitSubf0
-        mov     [par1+CRandomSFMTA.MC], eax
-        
-        ; more initialization and period certification
-        call    InitAndPeriod
-        
-        pop     rbx
-        ret
-;SFMTRandomInit ENDP
-        
-Error:                                                     ; Error exit: deliberately triggers a divide error
-        xor     eax, eax
-        div     eax                                        ; Divide by 0
-        ret                                                ; NOTE(review): SFMTRandomInitByArray jumps here with rbx/rbp still pushed, so this ret is only valid for SFMTRandomInit
-        
-; Subfunction used by SFMTRandomInit
-InitSubf0: ; private
-; y = 1812433253 * (y ^ (y >> 30)) + (++i);
-; input parameters:
-; eax = y
-; ebx = i
-; output:
-; eax = new y
-; ebx = i+1
-; edx modified
-        mov     edx, eax                                   ; edx = y
-        shr     eax, 30                                    ; eax = y >> 30
-        xor     eax, edx                                   ; eax = y ^ (y >> 30)
-        imul    eax, 1812433253                            ; multiply, keeping the low 32 bits
-        inc     ebx                                        ; ++i
-        add     eax, ebx                                   ; + i
-        ret
-;InitSubf0 endp 
-       
-; Subfunction used by SFMTRandomInitByArray
-InitSubf1: ; private
-; r = 1664525U * (r ^ (r >> 27));
-; input parameters:
-; eax = r
-; output:
-; eax = new r
-; r10 modified
-        mov     r10d, eax                                  ; r10d = r
-        shr     eax,  27                                   ; eax = r >> 27
-        xor     eax,  r10d                                 ; eax = r ^ (r >> 27)
-        imul    eax,  1664525                              ; multiply, keeping the low 32 bits
-        ret
-;InitSubf1 endp
-
-; Subfunction used by SFMTRandomInitByArray
-InitSubf2: ; private
-; r = 1566083941U * (r ^ (r >> 27));
-; input parameters:
-; eax = r
-; output:
-; eax = new r
-; r10 modified
-        mov     r10d, eax                                  ; r10d = r
-        shr     eax,  27                                   ; eax = r >> 27
-        xor     eax,  r10d                                 ; eax = r ^ (r >> 27)
-        imul    eax,  1566083941                           ; multiply, keeping the low 32 bits
-        ret
-;InitSubf2 endp
-
-
-; Subfunction for initialization and period certification, except seeding
-; par1 = aligned pointer to CRandomSFMTA
-InitAndPeriod: ; private
-        push    rbx
-        
-        ; initialize constants for Mother-Of-All
-        movaps  xmm0, oword [InitMother]
-        movaps  oword [par1+CRandomSFMTA.MF3], xmm0        ; 16-byte store starting at MF3
-        movaps  xmm0, oword [InitMother+16]
-        movaps  oword [par1+CRandomSFMTA.MF1], xmm0        ; 16-byte store starting at MF1
-        
-        ; initialize constants for SFMT
-        movaps  xmm0, oword [InitMask]
-        movaps  oword [par1+CRandomSFMTA.AMASK], xmm0
-
-        ; initialize various variables
-        xor     eax, eax
-        mov     dword [par1+CRandomSFMTA.one], eax         ; low dword of the double 1.0
-        mov     dword [par1+4+CRandomSFMTA.one], 3FF00000H ; high dword of the double 1.0
-        mov     dword [par1+CRandomSFMTA.LASTINTERVAL], eax        
-        
-        ; get instruction set
-        push    par1                                       ; keep the structure pointer across the call
-        call    InstructionSet
-        pop     par1
-        mov     [par1+CRandomSFMTA.Instset], eax
-        
-        ; Period certification
-        ; Compute parity of STATE[0-3] & InitParity
-        movaps  xmm1, oword [par1+CRandomSFMTA.STATE]
-        andps   xmm1, oword [InitParity]
-        movhlps xmm2, xmm1                                 ; high qword
-        xorps   xmm1, xmm2                                 ; xor two qwords
-        pshufd  xmm2, xmm1, 1                              ; high dword
-        xorps   xmm1, xmm2                                 ; xor two dwords
-        movd    eax,  xmm1                                 ; do rest of xor in eax
-        mov     edx,  eax
-        shr     eax,  16
-        xor     eax,  edx                                  ; xor two words
-        xor     al,   ah                                   ; xor two bytes
-        jpo     L008                                       ; parity odd: period OK
-        
-        ; parity even: period not OK
-        ; Find a nonzero dword in period certification vector
-        xor     ebx, ebx                                   ; loop counter
-        lea     rdx, [InitParity]
-L005:   mov     eax, [rdx+rbx*4]                           ; InitParity[i]
-        test    eax, eax
-        jnz     L006
-        inc     ebx
-        ; assume that there is a nonzero dword in InitParity
-        jmp     L005                                       ; loop until nonzero found
-        
-L006:   ; find first nonzero bit in eax
-        bsf     edx, eax
-        ; flip the corresponding bit in STATE
-        btc     [par1+rbx*4+CRandomSFMTA.STATE], edx
-
-L008:   cmp     dword [par1+CRandomSFMTA.USEMOTHER], 0
-        je      L009
-        call    Mother_Next                                ; Make first random number ready
-
-L009:   ; Generate first random numbers and set IX = 0
-        call    SFMT_Generate
-        pop     rbx
-        ret
-;InitAndPeriod   endp
-
-
-;  extern "C" void SFMTRandomInitByArray
-; (void * Pthis, int ThisSize, int const seeds[], int NumSeeds, int IncludeMother = 0);
-; // Seed by more than 32 bits
-SFMTRandomInitByArray:
-; Parameters
-; par1  = Pthis
-; par2d = ThisSize
-; par3  = seeds
-; par4d = NumSeeds
-; par5d = IncludeMother
-
-; define constants:
-SFMT_SIZE equ SFMT_N*4                                     ; number of 32-bit integers in state
-
-%IF SFMT_SIZE >= 623
-   SFMT_LAG equ 11
-%ELIF SFMT_SIZE >= 68
-   SFMT_LAG equ  7
-%ELIF SFMT_SIZE >= 39
-   SFMT_LAG equ  5
-%ELSE
-   SFMT_LAG equ  3
-%ENDIF
-
-SFMT_MID equ ((SFMT_SIZE - SFMT_LAG) / 2)
-
-        xor     eax, eax
-        cmp     par5d, eax                                 ; IncludeMother (parameter is on stack if windows)
-        setnz   al                                         ; convert any nonzero value to 1
-
-        push    rbx
-        push    rbp
-        
-        cmp     par2d, SFMTSize                            ; ThisSize
-        jb      Error                                      ; Error exit if buffer too small
-
-        ; Align by 16. Will overlap part of Fill if Pthis unaligned        
-        and     par1, -16        
-
-        ; Store USEMOTHER
-        mov     [par1+CRandomSFMTA.USEMOTHER], eax 
-
-; 1. loop: Fill state vector with random numbers from NumSeeds
-; r = NumSeeds;
-; for (i = 0; i < SFMT_N*4; i++) {
-;    r = factor * (r ^ (r >> 30)) + i;
-;    sta[i] = r;}
-
-        mov     eax, par4d                                 ; r = NumSeeds
-        xor     ebx, ebx                                   ; i
-L100:   mov     par2d, eax
-        shr     eax, 30
-        xor     eax, par2d
-        imul    eax, 1812433253
-        add     eax, ebx
-        mov     [par1+rbx*4+CRandomSFMTA.STATE], eax
-        inc     ebx
-        cmp     ebx, SFMT_SIZE
-        jb      L100        
-
-        ; count = max(NumSeeds,size-1)
-        mov     eax,  SFMT_SIZE - 1
-        mov     r11d, par4d                                 ; NumSeeds
-        cmp     r11d, eax
-        cmovb   r11d, eax
-        
-; 2. loop: Fill state vector with random numbers from seeds[]
-; for (i = 1, j = 0; j < count; j++) {
-;    r = func1(sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size]);
-;    sta[(i + mid) % size] += r;
-;    if (j < NumSeeds) r += seeds[j]
-;    r += i;
-;    sta[(i + mid + lag) % size] += r;
-;    sta[i] = r;
-;    i = (i + 1) % size;
-; }
-        ; register use:
-        ; par1  = Pthis
-        ; par2  = j
-        ; par3  = seeds
-        ; par4  = NumSeeds
-        ; eax   = r
-        ; ebx   = i
-        ; ebp   = (i + mid) % size, (i + mid + lag) % size
-        ; r10   = (i + size - 1) % size
-        ; r11   = count
-
-        xor     par2d, par2d           ; j = 0
-        lea     ebx, [par2+1]          ; i = 1
-
-L101:   ; r = sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size];
-        mov     eax,  [par1+rbx*4+CRandomSFMTA.STATE]      ; sta[i]
-        lea     ebp,  [rbx+SFMT_MID]
-        cmp     ebp,  SFMT_SIZE
-        jb      L102
-        sub     ebp,  SFMT_SIZE
-L102:   xor     eax,  [par1+rbp*4+CRandomSFMTA.STATE]      ; sta[(i + mid) % size]
-        lea     r10d, [rbx+SFMT_SIZE-1]
-        cmp     r10d, SFMT_SIZE
-        jb      L103
-        sub     r10d, SFMT_SIZE
-L103:   xor     eax,  [par1+r10*4+CRandomSFMTA.STATE]      ; sta[(i + size - 1) % size]
-
-        ; r = func1(r) = (r ^ (r >> 27)) * 1664525U;
-        call    InitSubf1
-        
-        ; sta[(i + mid) % size] += r;
-        add     [par1+rbp*4+CRandomSFMTA.STATE], eax
-        
-        ; if (j < NumSeeds) r += seeds[j]
-        cmp     par2d, par4d
-        jnb     L104
-        add     eax, [par3+par2*4]        
-L104:
-        ; r += i;
-        add     eax, ebx
-        
-        ; sta[(i + mid + lag) % size] += r;
-        lea     ebp, [rbx+SFMT_MID+SFMT_LAG]
-        cmp     ebp, SFMT_SIZE
-        jb      L105
-        sub     ebp, SFMT_SIZE
-L105:   add     [par1+rbp*4+CRandomSFMTA.STATE], eax
-        
-        ;sta[i] = r;
-        mov     [par1+rbx*4+CRandomSFMTA.STATE], eax
-        
-        ; i = (i + 1) % size;
-        inc     ebx
-        cmp     ebx, SFMT_SIZE
-        jb      L106
-        sub     ebx, SFMT_SIZE
-L106:
-        ; j++, loop while j < count
-        inc     par2d
-        cmp     par2d, r11d
-        jb      L101
-        
-; 3. loop: Randomize some more
-; for (j = 0; j < size; j++) {
-;   r = func2(sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]);
-;   sta[(i + mid) % size] ^= r;
-;   r -= i;
-;   sta[(i + mid + lag) % size] ^= r;
-;   sta[i] = r;
-;   i = (i + 1) % size;
-; }
-        ; j = 0
-        xor     par2d, par2d
-
-L110:    ; r = sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]
-        mov     eax,  [par1+rbx*4+CRandomSFMTA.STATE]      ; sta[i]
-        lea     ebp,  [rbx+SFMT_MID]
-        cmp     ebp,  SFMT_SIZE
-        jb      L111
-        sub     ebp,  SFMT_SIZE
-L111:   add     eax,  [par1+rbp*4+CRandomSFMTA.STATE]      ; sta[(i + mid) % size]
-        lea     r10d, [rbx+SFMT_SIZE-1]
-        cmp     r10d, SFMT_SIZE
-        jb      L112
-        sub     r10d, SFMT_SIZE
-L112:   add     eax,  [par1+r10*4+CRandomSFMTA.STATE]      ; sta[(i + size - 1) % size]
-
-        ; r = func2(r) = (x ^ (x >> 27)) * 1566083941U;
-        call    InitSubf2
-        
-        ; sta[(i + mid) % size] ^= r;
-        xor     [par1+rbp*4+CRandomSFMTA.STATE], eax
-        
-        ; r -= i;
-        sub     eax, ebx
-        
-        ; sta[(i + mid + lag) % size] ^= r;
-        lea     ebp, [rbx+SFMT_MID+SFMT_LAG]
-        cmp     ebp, SFMT_SIZE
-        jb      L113
-        sub     ebp, SFMT_SIZE
-L113:   xor     [par1+rbp*4+CRandomSFMTA.STATE], eax
-
-        ; sta[i] = r;
-        mov     [par1+rbx*4+CRandomSFMTA.STATE], eax
-        
-        ; i = (i + 1) % size;
-        inc     ebx
-        cmp     ebx, SFMT_SIZE
-        jb      L114
-        sub     ebx, SFMT_SIZE
-L114:
-        ; j++, loop while j < size
-        inc     par2d
-        cmp     par2d, SFMT_SIZE
-        jb      L110
-    
-        ; if (UseMother) {
-        cmp     dword [par1+CRandomSFMTA.USEMOTHER], 0
-        jz      L120
-        
-; 4. loop: Initialize MotherState
-; for (j = 0; j < 5; j++) {
-;    r = func2(r) + j;
-;    MotherState[j] = r + sta[2*j];
-; }
-        call    InitSubf2
-        mov     par2d, [par1+CRandomSFMTA.STATE]
-        add     par2d, eax
-        mov     [par1+CRandomSFMTA.M0], par2d
-        call    InitSubf2
-        inc     eax
-        mov     par2d, [par1+8+CRandomSFMTA.STATE]
-        add     par2d, eax
-        mov     [par1+CRandomSFMTA.M1], par2d
-        call    InitSubf2
-        add     eax, 2
-        mov     par2d, [par1+16+CRandomSFMTA.STATE]
-        add     par2d, eax        
-        mov     [par1+CRandomSFMTA.M2], par2d
-        call    InitSubf2
-        add     eax, 3
-        mov     par2d, [par1+24+CRandomSFMTA.STATE]
-        add     par2d, eax        
-        mov     [par1+CRandomSFMTA.M3], par2d
-        call    InitSubf2
-        add     eax, 4
-        mov     par2d, [par1+32+CRandomSFMTA.STATE]
-        add     par2d, eax        
-        mov     [par1+CRandomSFMTA.MC], par2d
-        
-L120:    ; More initialization and period certification
-        call    InitAndPeriod
-        
-        pop     rbp
-        pop     rbx
-        ret
-;SFMTRandomInitByArray ENDP
-
-
-Mother_Next: ; private
-; Internal procedure: advance Mother-Of-All generator
-; The random value is in M0
-; par1 = aligned pointer to structure CRandomSFMTA
-; eax, par1, xmm0 unchanged
-
-        movdqa  xmm1, oword [par1+CRandomSFMTA.M3]         ; load M3,M2
-        movdqa  xmm2, oword [par1+CRandomSFMTA.M1]         ; load M1,M0
-        movhps  qword [par1+CRandomSFMTA.M3], xmm1         ; M3=M2
-        movq    qword [par1+CRandomSFMTA.M2], xmm2         ; M2=M1
-        movhps  qword [par1+CRandomSFMTA.M1], xmm2         ; M1=M0
-        pmuludq xmm1, oword [par1+CRandomSFMTA.MF3]        ; M3*MF3, M2*MF2
-        pmuludq xmm2, oword [par1+CRandomSFMTA.MF1]        ; M1*MF1, M0*MF0
-        paddq   xmm1, xmm2                                 ; P3+P1, P2+P0
-        movhlps xmm2, xmm1                                 ; Get high qword
-        movq    xmm3, qword [par1+CRandomSFMTA.MC]         ; +carry
-        paddq   xmm1, xmm3
-        paddq   xmm1, xmm2                                 ; P0+P1+P2+P3
-        movq    qword [par1+CRandomSFMTA.M0], xmm1         ; Store new M0 and carry
-        ret
-;Mother_Next endp
-
-
-align 16
-SFMT_Generate: ; private
-; void CRandomSFMT::Generate() {
-; Fill state array with new random numbers
-
-        push    rbx
-        
-        ; register use
-        ; par1 = Pthis (rcx or rdi)
-        ; edx  = i*16 + offset state
-        ; eax, ebx = loop end
-        ; xmm1 = r1
-        ; xmm2 = r2 = r
-        ; xmm0, xmm3 = scratch
-        
-        ; r1 = state[SFMT_N*16 - 2];
-        ; r2 = state[SFMT_N*16 - 1];
-        movdqa  xmm1, oword [par1+(SFMT_N-2)*16+CRandomSFMTA.STATE]
-        movdqa  xmm2, oword [par1+(SFMT_N-1)*16+CRandomSFMTA.STATE]
-        mov     edx, CRandomSFMTA.STATE
-        
-;static inline __m128i sfmt_recursion(__m128i const &a, __m128i const &b, 
-;__m128i const &c, __m128i const &d, __m128i const &mask) {
-;    __m128i a1, b1, c1, d1, z1, z2;
-;    b1 = _mm_srli_epi32(b, SFMT_SR1);
-;    a1 = _mm_slli_si128(a, SFMT_SL2);
-;    c1 = _mm_srli_si128(c, SFMT_SR2);
-;    d1 = _mm_slli_epi32(d, SFMT_SL1);
-;    b1 = _mm_and_si128(b1, mask);
-;    z1 = _mm_xor_si128(a, a1);
-;    z2 = _mm_xor_si128(b1, d1);
-;    z1 = _mm_xor_si128(z1, c1);
-;    z2 = _mm_xor_si128(z1, z2);
-;    return z2;}
-
-; for (i = 0; i < SFMT_N - SFMT_M; i++) {
-;    r = sfmt_recursion(state[i], state[i + SFMT_M], r1, r2, mask);
-;    state[i] = r;
-;    r1 = r2;
-;    r2 = r;
-; }
-
-        mov      eax, (SFMT_N-SFMT_M)*16 + CRandomSFMTA.STATE ; first loop end
-        mov      ebx, SFMT_N*16 + CRandomSFMTA.STATE          ; second loop end
-
-; first i loop from 0 to SFMT_N - SFMT_M
-align 8
-L201:   movdqa   xmm0, oword [par1+rdx+SFMT_M*16]          ; b
-        psrld    xmm0, SFMT_SR1                            ; b1
-        pand     xmm0, oword [par1+CRandomSFMTA.AMASK]     ; b1
-        movdqa   xmm3, oword [par1+rdx]                    ; a
-        pxor     xmm0, xmm3
-        pslldq   xmm3, SFMT_SL2                            ; a1
-        psrldq   xmm1, SFMT_SR2                            ; c1, c = r1
-        pxor     xmm0, xmm3
-        pxor     xmm0, xmm1
-        movdqa   xmm1, xmm2                                ; r1 = r2
-        pslld    xmm2, SFMT_SL1                            ; d1, d = r2
-        pxor     xmm2, xmm0                                ; r2 = r
-        ; state[i] = r;
-        movdqa   oword [par1+rdx], xmm2
-        
-        ; i++ while i < SFMT_N - SFMT_M
-        add      edx, 16
-        cmp      edx, eax
-        jb       L201
-        
-;align 16
-L202:   ; second i loop from SFMT_N - SFMT_M + 1 to SFMT_N
-        movdqa   xmm0, oword [par1+rdx+(SFMT_M-SFMT_N)*16]; b
-        psrld    xmm0, SFMT_SR1                            ; b1
-        pand     xmm0, oword [par1+CRandomSFMTA.AMASK]     ; b1
-        movdqa   xmm3, oword [par1+rdx]                    ; a
-        pxor     xmm0, xmm3
-        pslldq   xmm3, SFMT_SL2                            ; a1
-        psrldq   xmm1, SFMT_SR2                            ; c1, c = r1
-        pxor     xmm0, xmm3
-        pxor     xmm0, xmm1
-        movdqa   xmm1, xmm2                                ; r1 = r2
-        pslld    xmm2, SFMT_SL1                            ; d1, d = r2
-        pxor     xmm2, xmm0                                ; r2 = r
-        ; state[i] = r;
-        movdqa   oword [par1+rdx], xmm2
-        
-        ; i++ while i < SFMT_N
-        add      edx, 16
-        cmp      edx, ebx
-        jb       L202
-        
-        ; Check if initialized
-L208:   cmp     dword [par1+CRandomSFMTA.AMASK], SFMT_MASK1
-        jne     Error                                      ; Make error if not initialized
-
-        ; ix = 0;
-        mov      dword [par1+CRandomSFMTA.IX], 0 ; point to start of STATE buffer
-        pop      rbx
-        ret
-;SFMT_Generate endp
-
-
-;  extern "C" unsigned int SFMTBRandom(void * Pthis);                     // Output random bits
-
-SFMTBRandom:                                               ; generate random bits
-        ; Align Pthis by 16. Will overlap part of Fill1 if Pthis unaligned        
-        and     par1, -16        
-
-SFMTBRandom_reg:                                           ; Entry for register parameters, used internally
-
-; if (ix >= SFMT_N*4) Generate();
-        mov     edx, [par1+CRandomSFMTA.IX]
-        cmp     edx, SFMT_N*16
-        jnb     NeedGenerate
-        
-; y = ((uint32_t*)state)[ix++];
-        mov     eax, dword [par1+rdx+CRandomSFMTA.STATE]
-        add     edx, 4
-        mov     [par1+CRandomSFMTA.IX], edx
-
-AfterGenerate:
-; if (UseMother) y += MotherBits();
-        cmp     dword [par1+CRandomSFMTA.USEMOTHER], 0
-        jz      NoMother
-        
-        ; add mother bits
-        add     eax,  [par1+CRandomSFMTA.M0]               ; Add Mother random number        
-        call    Mother_Next                                ; Make next Mother random number ready
-        
-NoMother: ; return y;
-        ret
-        
-NeedGenerate: 
-        call    SFMT_Generate                              ; generate SFMT_N*4 random dwords
-        mov     eax, [par1+CRandomSFMTA.STATE]
-        mov     dword [par1+CRandomSFMTA.IX], 4
-        jmp     AfterGenerate
-        
-;SFMTBRandom ENDP
-
-
-;  extern "C" double SFMTRandom  (void * Pthis); // Output random float
-SFMTRandom:                                                ; generate random float with 52 bits resolution
-        ; Align Pthis by 16. Will overlap part of Fill1 if Pthis unaligned        
-        and     par1, -16
-        
-SFMTRandom_reg:                                            ; internal entry point        
-
-; check if there are at least 64 random bits in state buffer
-; if (ix >= SFMT_N*4-1) Generate();
-        mov     edx, [par1+CRandomSFMTA.IX]
-        cmp     edx, SFMT_N*16-4
-        jnb     L303
-
-L301:   ; read 64 random bits
-        movq    xmm0, qword [par1+rdx+CRandomSFMTA.STATE]
-        add     edx, 8
-        mov     [par1+CRandomSFMTA.IX], edx
-
-        ; combine with Mother-Of-All generator?
-        cmp     dword [par1+CRandomSFMTA.USEMOTHER], 0
-        jz      L302 ; ConvertToFloat
-        
-        ; add mother bits
-        movq    xmm1, qword [par1+CRandomSFMTA.M0]         ; Mother random number MC and M0
-        pshuflw xmm1, xmm1, 01001011B                      ; Put M0 before MC, and swap the words in MC
-        paddq   xmm0, xmm1                                 ; Add SFMT and Mother outputs
-        call    Mother_Next                                ; Make next Mother random number ready
-        
-L302:   ; ConvertToFloat
-        psrlq	xmm0, 12			                       ; align with mantissa field of double precision float
-        movsd   xmm1, [par1+CRandomSFMTA.one]              ; 1.0 double precision
-        por     xmm0, xmm1                                 ; insert exponent to get 1.0 <= x < 2.0
-        subsd   xmm0, xmm1                                 ; subtract 1.0 to get 0.0 <= x < 1.0
-        ret                                                ; return value        
-        
-L303:   ; NeedGenerateR
-        call    SFMT_Generate                              ; generate SFMT_N*4 random dwords
-        xor     edx, edx
-        jmp     L301
-
-;SFMTRandom ENDP
-
-
-; extern "C" long double SFMTRandomL (void * Pthis);
-SFMTRandomL:                                               ; generate random float with 63 bits resolution
-        ; Align Pthis by 16.
-        and     par1, -16
-        
-SFMTRandomL_reg:                                           ; internal entry point        
-
-; check if there are at least 64 random bits in state buffer
-; if (ix >= SFMT_N*4-1) Generate();
-        mov     edx, [par1+CRandomSFMTA.IX]
-        cmp     edx, SFMT_N*16-4
-        jnb     L403
-
-L401:   ; read 64 random bits
-        movq    xmm0, qword [par1+rdx+CRandomSFMTA.STATE]
-        add     edx, 8
-        mov     [par1+CRandomSFMTA.IX], edx
-
-        ; combine with Mother-Of-All generator?
-        cmp     dword [par1+CRandomSFMTA.USEMOTHER], 0
-        jz      L402
-                
-        ; add mother bits
-        movq    xmm1, qword [par1+CRandomSFMTA.M0]        ; Mother random number MC and M0
-        pshuflw xmm1, xmm1, 01001011B                     ; Put M0 before MC, and swap the words in MC
-        paddq   xmm0, xmm1                                ; Add SFMT and Mother outputs
-        call    Mother_Next                               ; Make next Mother random number ready
-        
-L402:   ;ConvertToFloat
-        sub     rsp, 16                                   ; make space for long double
-        psrlq	xmm0, 1                                   ; align with mantissa field of long double
-        pcmpeqw xmm1, xmm1                                ; all 1's
-        psllq   xmm1, 63                                  ; create a 1 in bit 63
-        por     xmm0, xmm1                                ; bit 63 is always 1 in long double
-        movq    qword [rsp], xmm0                         ; store mantissa
-        mov     dword [rsp+8], 3FFFH                      ; exponent
-        fld     tword [rsp]                               ; load long double
-        fsub    qword [par1+CRandomSFMTA.one]             ; subtract 1.0 to get 0.0 <= x < 1.0
-        pcmpeqw xmm0, xmm0                                ; make a NAN for compilers that don't support long double
-        add     rsp, 16
-        ret                                               ; return value in st(0)
-        
-L403:   ;NeedGenerateR
-        call    SFMT_Generate                             ; generate SFMT_N*4 random dwords
-        xor     edx, edx
-        jmp     L401        
-;SFMTRandomL ENDP
-
-
-;  extern "C" int SFMTIRandom (void * Pthis, int min, int max);  // Output random integer
-
-SFMTIRandom:
-; par1  = Pthis
-; par2d = min
-; par3d = max
-
-        ; Align Pthis by 16.
-        and     par1, -16        
-        push    par2                                       ; save min, max
-        push    par3
-        call    SFMTBRandom_reg                            ; random bits
-        pop     rdx                                        ; max
-        pop     rcx                                        ; min        
-        sub     edx, ecx
-        jl      short WrongInterval                        ; max < min
-        inc     edx                                        ; max - min + 1
-        mul     edx                                        ; multiply random number by interval and truncate
-        lea     eax, [rdx+rcx]                             ; add min to high dword of product
-        ret
-WrongInterval:
-        mov     eax, 80000000H                             ; error exit
-        ret
-;SFMTIRandom ENDP
-
-
-;  extern "C" int SFMTIRandomX (void * Pthis, int min, int max); // Output random integer
-
-SFMTIRandomX:
-; par1  = Pthis
-; par2d = min
-; par3d = max
-
-        push    rbx
-        ; Align Pthis by 16.
-        and     par1, -16        
-
-        mov     ebx, par3d 
-        sub     ebx, par2d                                 ; max - min
-        jle     short M30                                  ; max <= min (signed)
-        inc     ebx                                        ; interval = max - min + 1
-        
-        ; if (interval != LastInterval) {
-        cmp     ebx, [par1+CRandomSFMTA.LASTINTERVAL]
-        je      M10
-        ; need to calculate new rejection limit
-        ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
-        xor     eax, eax                                   ; 0
-        lea     edx, [eax+1]                               ; 1
-        div     ebx                                        ; (would give overflow if interval = 1)
-        mul     ebx
-        dec     eax
-        mov     [par1+CRandomSFMTA.RLIMIT], eax       
-        mov     [par1+CRandomSFMTA.LASTINTERVAL], ebx
-M10:    mov     ebx, par2d                                 ; save min
-
-M20:    ; do { // Rejection loop
-        call    SFMTBRandom_reg                            ; random bits (par1 is preserved)
-        ; longran  = (uint64)BRandom() * interval;
-        mul     dword [par1+CRandomSFMTA.LASTINTERVAL]
-        ; } while (remainder > RLimit);
-        cmp     eax, [par1+CRandomSFMTA.RLIMIT]
-        ja      M20
-        
-        ; return (int32)iran + min
-        lea     eax, [rbx+rdx]
-        pop     rbx
-        ret
-        
-M30:    jl      M40
-        ; max = min. Return min
-        mov     eax, par2d
-        pop     rbx
-        ret                                                ; max = min exit
-        
-M40:    ; max < min: error
-        mov     eax, 80000000H                             ; error exit
-        pop     rbx
-        ret
-;SFMTIRandomX ENDP
-
-
-
-; -------------------------------------------------------------------------
-;  Single-threaded static link versions for SFMT generator
-; -------------------------------------------------------------------------
-
-;  extern "C" void SFMTgenRandomInit(int seed, int IncludeMother = 0); 
-SFMTgenRandomInit:
-; par1d = seed
-; par2d = IncludeMother
-
-        ; set up parameters for call SFMTRandomInit
-        mov     par4d, par2d                               ; IncludeMother
-        mov     par3d, par1d                               ; seed
-        mov     par2d, SFMTSize                            ; ThisSize
-        lea     par1,  [SFMTInstance]                      ; Get address of SFMTInstance into par1
-        jmp     SFMTRandomInit
-;SFMTgenRandomInit ENDP
-
-
-;  extern "C" void SFMTgenRandomInitByArray(int const seeds[], int NumSeeds, int IncludeMother = 0);
-SFMTgenRandomInitByArray:
-; par1  = seeds
-; par2d = NumSeeds
-; par3d = IncludeMother
-
-        ; set up parameters for call SFMTRandomInitByArray
-%IFDEF   WINDOWS
-        push    par3                                       ; IncludeMother on stack
-        sub     rsp, 32                                    ; empty shadow space
-        mov     par4d, par2d                               ; NumSeeds
-        mov     par3,  par1                                ; seeds
-        mov     par2d, SFMTSize                            ; ThisSize
-        lea     par1,  [SFMTInstance]                      ; Get address of SFMTInstance into par1
-        call	SFMTRandomInitByArray
-        add     rsp, 40
-        ret
-%ELSE    ; UNIX
-        mov     par5d, par3d                               ; IncludeMother in register
-        mov     par4d, par2d                               ; NumSeeds
-        mov     par3,  par1                                ; seeds
-        mov     par2d, SFMTSize                            ; ThisSize
-        lea     par1,  [SFMTInstance]                      ; Get address of SFMTInstance into par1
-        jmp     SFMTRandomInitByArray
-%ENDIF        
-;SFMTgenRandomInitByArray ENDP  
-
-
-;  extern "C" double SFMTgenRandom();
-SFMTgenRandom:                                             ; generate random float with 52 bits resolution
-        lea     par1, [SFMTInstance]                       ; Get address of SFMTInstance into par1
-        jmp     SFMTRandom_reg                             ; random bits
-;SFMTgenRandom ENDP
-
-
-;  extern "C" double SFMTgenRandom();
-SFMTgenRandomL:                                            ; generate random float with 63 bits resolution
-        lea     par1, [SFMTInstance]                       ; Get address of SFMTInstance into par1
-        jmp     SFMTRandomL_reg                            ; random bits
-;SFMTgenRandomL ENDP
-
-
-;  extern "C" int SFMTgenIRandom (int min, int max);
-SFMTgenIRandom:   
-        mov     par3d, par2d
-        mov     par2d, par1d
-        lea     par1, [SFMTInstance]                       ; Get address of SFMTInstance into par1
-        jmp     SFMTIRandom				                   ; continue in _SFMTIRandom
-;SFMTgenIRandom ENDP
-
-
-;  extern "C" int SFMTgenIRandomX (int min, int max);
-SFMTgenIRandomX:
-        mov     par3d, par2d
-        mov     par2d, par1d
-        lea     par1, [SFMTInstance]                       ; Get address of SFMTInstance into par1
-        jmp	    SFMTIRandomX                               ; continue in _SFMTIRandomX
-;SFMTgenIRandomX ENDP
-
-
-;  extern "C" uint32_t SFMTgenBRandom();
-SFMTgenBRandom:                                            ; generate random float with 32 bits resolution
-        lea     par1, [SFMTInstance]                       ; Get address of SFMTInstance into par1
-        jmp     SFMTBRandom_reg                            ; random bits
-;SFMTgenBRandom ENDP
-
-;END
diff --git a/contrib/libs/asmlib/strcat64.asm b/contrib/libs/asmlib/strcat64.asm
deleted file mode 100644
index 3c8a247e3e..0000000000
--- a/contrib/libs/asmlib/strcat64.asm
+++ /dev/null
@@ -1,70 +0,0 @@
-%include "defs.asm"
-
-;*************************  strcat64.asm  ************************************
-; Author:           Agner Fog
-; Date created:     2008-07-19
-; Last modified:    2008-10-16
-; Description:
-; Faster version of the standard strcat function:
-; char * strcat(char *dest, const char * src);
-; Copies zero-terminated string from src to end of dest.
-;
-; Overriding standard function strcat:
-; The alias ?OVR_strcat is changed to _strcat in the object file if
-; it is desired to override the standard library function strcat.
-;
-; Optimization:
-; Uses optimized functions A_strlen and A_memcpy.
-;
-; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global A_strcat: function                  ; Function A_strcat
-global EXP(strcat): function               ; ?OVR removed if standard function strcat overridden
-
-; Imported from strlen64.asm
-extern A_strlen
-
-; Imported from memcpy64.asm
-extern A_memcpy
-
-
-SECTION .text  align=16
-
-; extern "C" char * A_strcat(char * dest, const char * src) {
-;    memcpy(dest+strlen(dest), src, strlen(src)+1);
-;    return dest
-; }
-
-; Function entry:
-A_strcat:
-EXP(strcat):
-
-%IFDEF  WINDOWS
-%define Rpar1   rcx                    ; function parameter 1
-%define Rpar2   rdx                    ; function parameter 2
-%define Rpar3   r8                     ; function parameter 3
-%ENDIF
-%IFDEF  UNIX
-%define Rpar1   rdi                    ; function parameter 1
-%define Rpar2   rsi                    ; function parameter 2
-%define Rpar3   rdx                    ; function parameter 3
-%ENDIF
-
-        push    Rpar1                  ; dest
-        push    Rpar2                  ; src
-        call    A_strlen               ; length of dest
-        push    rax                    ; strlen(dest)
-        mov     Rpar1, [rsp+8]         ; src
-        call    A_strlen               ; length of src
-        pop     Rpar1                  ; strlen(dest)
-        pop     Rpar2                  ; src
-        add     Rpar1, [rsp]           ; dest + strlen(dest)
-        lea     Rpar3, [rax+1]         ; strlen(src)+1
-        call    A_memcpy               ; copy
-        pop     rax                    ; return dest
-        ret
-
-;A_strcat ENDP
diff --git a/contrib/libs/asmlib/strcpy64.asm b/contrib/libs/asmlib/strcpy64.asm
deleted file mode 100644
index c505c48be7..0000000000
--- a/contrib/libs/asmlib/strcpy64.asm
+++ /dev/null
@@ -1,66 +0,0 @@
-%include "defs.asm"
-
-;*************************  strcpy64.asm  ************************************
-; Author:           Agner Fog
-; Date created:     2008-07-19
-; Last modified:    2011-07-01
-; Description:
-; Faster version of the standard strcpy function:
-; char * A_strcpy(char * dest, const char * src);
-; Copies zero-terminated string from src to dest, including terminating zero.
-;
-; Overriding standard function memcpy:
-; The alias ?OVR_strcpy is changed to _strcpy in the object file if
-; it is desired to override the standard library function strcpy.
-;
-; Optimization:
-; Uses optimized functions A_strlen and A_memcpy. These functions allow
-; calling without proper stack alignment.
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global A_strcpy: function                 ; Function A_strcpy
-global EXP(strcpy): function              ; ?OVR removed if standard function memcpy overridden
-
-; Imported from strlen64.asm
-extern A_strlen
-
-; Imported from memcpy64.asm
-extern A_memcpy
-
-
-SECTION .text  align=16
-
-; extern "C" char * A_strcpy(char * dest, const char * src) {
-;    return memcpy(dest, src, strlen(src)+1);
-; }
-
-; Function entry:
-A_strcpy:
-EXP(strcpy):
-
-%IFDEF  WINDOWS
-%define Rpar1   rcx                    ; function parameter 1
-%define Rpar2   rdx                    ; function parameter 2
-%define Rpar3   r8                     ; function parameter 3
-%ENDIF
-%IFDEF  UNIX
-%define Rpar1   rdi                    ; function parameter 1
-%define Rpar2   rsi                    ; function parameter 2
-%define Rpar3   rdx                    ; function parameter 3
-%ENDIF
-
-        push    Rpar1                  ; dest
-        push    Rpar2                  ; src
-        mov     Rpar1, Rpar2
-        ; (A_strlen does not require stack alignment)
-        call    A_strlen               ; length of src
-        lea     Rpar3,[rax+1]          ; include terminating zero in length
-        pop     Rpar2                  ; src
-        pop     Rpar1                  ; dest
-        jmp     A_memcpy               ; copy and return
-
-;A_strcpy ENDP
diff --git a/contrib/libs/asmlib/stricmp64.asm b/contrib/libs/asmlib/stricmp64.asm
deleted file mode 100644
index c568832b27..0000000000
--- a/contrib/libs/asmlib/stricmp64.asm
+++ /dev/null
@@ -1,86 +0,0 @@
-%include "defs.asm"
-
-;*************************  stricmpaz64.asm  **********************************
-; Author:           Agner Fog
-; Date created:     2008-12-05
-; Last modified:    2011-07-01
-; Source URL:       www.agner.org/optimize
-; Project:          asmlib.zip
-; Description:
-; Faster version of the standard stricmp or strcasecmp function:
-; int A_stricmp(const char *string1, const char *string2);
-; Compares two zero-terminated strings without case sensitivity.
-; Does not recognize locale-specific characters. A-Z are changed
-; to a-z before comparing, while other upper-case letters are not
-; converted but considered unique.
-;
-; Optimization:
-; SSE4.2 version not implemented because the gain is small.
-;
-; Copyright (c) 2008-2011 GNU General Public License www.gnu.org/licenses/gpl.html
-;******************************************************************************
-
-default rel
-
-global A_stricmp: function                     ; Function A_stricmp
-
-; ***************************************************************************
-; Define registers used for function parameters, used in 64-bit mode only
-; ***************************************************************************
- 
-%IFDEF WINDOWS
-  %define par1   rcx                   ; first parameter
-  %define par2   rdx                   ; second parameter
-%ENDIF
-  
-%IFDEF UNIX
-  %define par1   rdi                   ; first parameter
-  %define par2   rsi                   ; second parameter
-%ENDIF
-
-SECTION .text  align=16
-
-; extern "C" int A_stricmp(const char *string1, const char *string2);
-
-A_stricmp:
-        sub     par2, par1
-        
-L10:    mov     al,  [par1]            ; string1
-        cmp     al,  [par1+par2]       ; string2
-        jne     L20
-        inc     par1
-        test    al, al
-        jnz     L10                    ; continue with next byte
-        
-        ; terminating zero found. Strings are equal
-        xor     eax, eax
-        ret        
-        
-L20:    ; bytes are different. check case
-        xor     al, 20H                ; toggle case
-        cmp     al, [par1+par2]
-        jne     L30
-        ; possibly differing only by case. Check if a-z
-        or      al, 20H                ; upper case
-        sub     al, 'a'
-        cmp     al, 'z'-'a'
-        ja      L30                    ; not a-z
-        ; a-z and differing only by case
-        inc     par1
-        jmp     L10                    ; continue with next byte
-
-L30:    ; bytes are different, even after changing case
-        movzx   eax, byte [par1]       ; get original value again
-        sub     eax, 'A'
-        cmp     eax, 'Z' - 'A'
-        ja      L40
-        add     eax, 20H               ; A-Z, make lower case
-L40:    movzx   edx, byte [par1+par2]
-        sub     edx, 'A'
-        cmp     edx, 'Z' - 'A'
-        ja      L50
-        add     edx, 20H                ; A-Z, make lower case
-L50:    sub     eax, edx                ; subtract to get result
-        ret
-
-;A_stricmp END
diff --git a/contrib/libs/asmlib/strlen64.asm b/contrib/libs/asmlib/strlen64.asm
deleted file mode 100644
index ff65c10127..0000000000
--- a/contrib/libs/asmlib/strlen64.asm
+++ /dev/null
@@ -1,86 +0,0 @@
-%include "defs.asm"
-
-;**************************  strlen64.asm  **********************************
-; Author:           Agner Fog
-; Date created:     2008-07-19
-; Last modified:    2008-10-16
-; Description:
-; Faster version of the standard strlen function:
-; size_t strlen(const char * str);
-; Finds the length of a zero-terminated string of bytes, optimized for speed.
-;
-; Overriding standard function strlen:
-; The alias ?OVR_strlen is changed to _strlen in the object file if
-; it is desired to override the standard library function strlen.
-;
-; Calling conventions: 
-; Stack alignment is not required. No shadow space or red zone used.
-; Called internally from strcpy and strcat without stack aligned.
-;
-; Optimization:
-; Uses XMM registers to read 16 bytes at a time, aligned.
-; Misaligned parts of the string are read from the nearest 16-bytes boundary
-; and the irrelevant part masked out. It may read both before the beginning of
-; the string and after the end, but will never load any unnecessary cache 
-; line and never trigger a page fault for reading from non-existing memory 
-; pages because it never reads past the nearest following 16-bytes boundary.
-; It may, though, trigger any debug watch within the same 16-bytes boundary.
-;
-; The latest version of this file is available at:
-; www.agner.org/optimize/asmexamples.zip
-; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
-default rel
-
-global A_strlen: function              ; Function A_strlen
-global EXP(strlen): function           ; ?OVR removed if standard function strlen overridden
-
-
-SECTION .text  align=16
-
-; extern "C" size_t strlen (const char * s);
-
-; 64-bit version (Windows and Unix calling conventions selected below):
-A_strlen:
-EXP(strlen):
-
-%IFDEF  WINDOWS
-        mov      rax,  rcx             ; get pointer to string from rcx
-        mov      r8,   rcx             ; copy pointer
-%define Rscopy   r8                    ; Copy of s
-
-%ELSE   ; Unix
-        mov      rax,  rdi             ; get pointer to string from rdi
-        mov      ecx,  edi             ; copy pointer (lower 32 bits)
-%define Rscopy   rdi                   ; Copy of s
-%ENDIF
-        
-        ; rax = s, ecx = 32 bits of s
-        pxor     xmm0, xmm0            ; set to zero
-        and      ecx,  0FH             ; lower 4 bits indicate misalignment
-        and      rax,  -10H            ; align pointer by 16
-        movdqa   xmm1, [rax]           ; read from nearest preceding boundary
-        pcmpeqb  xmm1, xmm0            ; compare 16 bytes with zero
-        pmovmskb edx,  xmm1            ; get one bit for each byte result
-        shr      edx,  cl              ; shift out false bits
-        shl      edx,  cl              ; shift back again
-        bsf      edx,  edx             ; find first 1-bit
-        jnz      L2                    ; found
-        
-        ; Main loop, search 16 bytes at a time
-L1:     add      rax,  10H             ; increment pointer by 16
-        movdqa   xmm1, [rax]           ; read 16 bytes aligned
-        pcmpeqb  xmm1, xmm0            ; compare 16 bytes with zero
-        pmovmskb edx,  xmm1            ; get one bit for each byte result
-        bsf      edx,  edx             ; find first 1-bit
-        ; (moving the bsf out of the loop and using test here would be faster for long strings on old processors,
-        ;  but we are assuming that most strings are short, and newer processors have higher priority)
-        jz       L1                    ; loop if not found
-        
-L2:     ; Zero-byte found. Compute string length        
-        sub      rax,  Rscopy          ; subtract start address
-        add      rax,  rdx             ; add byte index
-        ret
-        
-;A_strlen ENDP
diff --git a/contrib/libs/asmlib/substring64.asm b/contrib/libs/asmlib/substring64.asm
deleted file mode 100644
index 235b19a5f5..0000000000
--- a/contrib/libs/asmlib/substring64.asm
+++ /dev/null
@@ -1,75 +0,0 @@
-%include "defs.asm"
-
-;*************************  substring64.asm  **********************************
-; Author:           Agner Fog
-; Date created:     2011-07-18
-; Last modified:    2011-07-18
-; Source URL:       www.agner.org/optimize
-; Project:          asmlib.zip
-; Description:
-; Makes a substring of a zero-terminated ASCII string
-;
-; C++ prototype:
-; extern "C"
-; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
-; Makes a substring from source, starting at position pos (zero-based) and length
-; len and stores it in the array dest. It is the responsibility of the programmer
-; that the size of the dest array is at least len + 1.
-; The return value is the actual length of the substring. This may be less than 
-; len if the length of source is less than pos + len.
-;
-; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses/gpl.html
-;******************************************************************************
-
-global A_substring: function                      ; Function _A_substring
-
-extern A_strlen
-extern A_memcpy
-
-SECTION .text
-
-; extern "C"
-; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
-
-%ifdef WINDOWS
-%define par1    rcx                    ; dest
-%define par2    rdx                    ; source
-%define par3    r8                     ; pos
-%define par4    r9                     ; len
-%else   ; UNIX
-%define par1    rdi
-%define par2    rsi
-%define par3    rdx
-%define par4    rcx
-%endif
-
-A_substring:
-        push    par1
-        push    par2
-        push    par3
-        push    par4
-        mov     par1, par2
-        call    A_strlen               ; rax = strlen(source)
-        pop     par4
-        pop     par3
-        pop     par2
-        pop     par1        
-        sub     rax, par3              ; max length = strlen(source) - pos
-        jbe     empty                  ; strlen(source) <= pos. Return empty string
-        cmp     rax, par4
-        cmova   rax, par4              ; min(len, maxlen)
-        add     par2, par3             ; source + pos = source for memcpy
-        mov     par3, rax              ; length for memcpy
-        push    rax                    ; new length
-        call    A_memcpy
-        pop     rcx                    ; new length = return value, rax = dest
-        mov     byte [rcx+rax], 0      ; terminating zero
-        mov     rax, rcx               ; return new length
-        ret
-        
-empty:  ; return empty string
-        xor     eax, eax               ; return 0
-        mov     byte [par1], al
-        ret
-        
-;A_substring END
diff --git a/contrib/libs/asmlib/unalignedisfaster64.asm b/contrib/libs/asmlib/unalignedisfaster64.asm
deleted file mode 100644
index eed68a1398..0000000000
--- a/contrib/libs/asmlib/unalignedisfaster64.asm
+++ /dev/null
@@ -1,188 +0,0 @@
-%include "defs.asm"
-
-;*************************  unalignedisfaster64.asm  ******************************
-; Author:           Agner Fog
-; Date created:     2011-07-09
-; Last modified:    2013-08-30
-; Source URL:       www.agner.org/optimize
-; Project:          asmlib.zip
-; Language:         assembly, NASM/YASM syntax, 64 bit
-;
-; C++ prototype:
-; extern "C" int UnalignedIsFaster(void);
-;
-; Description:
-; This function finds out if unaligned 16-bytes memory read is
-; faster than aligned read followed by an alignment shift (PALIGNR) on the
-; current CPU.
-;
-; Return value:
-; 0:   Unaligned read is probably slower than alignment shift
-; 1:   Unknown
-; 2:   Unaligned read is probably faster than alignment shift
-;
-;
-; C++ prototype:
-; extern "C" int Store256BitIsFaster(void);
-;
-; Description:
-; This function finds out if a 32-bytes memory write is
-; faster than two 16-bytes writes on the current CPU.
-;
-; Return value:
-; 0:   32-bytes memory write is slower or AVX not supported
-; 1:   Unknown
-; 2:   32-bytes memory write is faster
-;
-; Copyright (c) 2011 - 2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-;
-; C++ prototype:
-; extern "C" int UnalignedIsFaster(void);
-
-global UnalignedIsFaster: function
-global Store256BitIsFaster: function
-extern CpuType
-extern InstructionSet
-
-
-SECTION .text
-
-UnalignedIsFaster:
-
-%ifdef  UNIX
-        push    0                      ; vendor
-        mov     rdi, rsp
-        push    0                      ; family 
-        mov     rsi, rsp
-        push    0                      ; model
-        mov     rdx, rsp 
-%else   ; WINDOWS
-        push    0                      ; vendor
-        mov     rcx, rsp
-        push    0                      ; family 
-        mov     rdx, rsp
-        push    0                      ; model
-        mov     r8,  rsp 
-%endif
-        call    CpuType                ; get vendor, family, model
-        pop     rdx                    ; model
-        pop     rcx                    ; family
-        pop     r8                     ; vendor
-        xor     eax, eax               ; return value
-        dec     r8d
-        jz      Intel
-        dec     r8d
-        jz      AMD
-        dec     r8d
-        jz      VIA
-        ; unknown vendor
-        inc     eax
-        jmp     Uend
-        
-Intel:  ; Unaligned read is faster on Intel Nehalem and later, but not Atom
-        ; Nehalem  = family 6, model 1AH
-        ; Atom     = family 6, model 1CH
-        ; Netburst = family 0FH
-        ; Future models are likely to be family 6, maybe > 6, model > 1C
-        cmp     ecx, 6
-        jb      Uend                   ; old Pentium 1, etc
-        cmp     ecx, 0FH
-        je      Uend                   ; old Netburst architecture
-        cmp     edx, 1AH
-        jb      Uend                   ; earlier than Nehalem
-        cmp     edx, 1CH
-        je      Uend                   ; Intel Atom
-        or      eax, 2                 ; Intel Nehalem and later, except Atom
-        jmp     Uend
-        
-AMD:    ; AMD processors:
-        ; The PALIGNR instruction is slow on AMD Bobcat but fast on Jaguar
-        ; K10/Opteron = family 10H     ; Use unaligned
-        ; Bobcat = family 14H          ; PALIGNR is very slow. Use unaligned
-        ; Piledriver = family 15H      ; Use unaligned
-        ; Jaguar = family 16H          ; PALIGNR is fast. Use aligned (aligned is faster in most cases, but not all)
-        cmp     ecx, 10H               ; AMD K8 or earlier: use aligned
-        jb      Uend    
-        cmp     ecx, 16H               ; Jaguar: use aligned
-        je      Uend
-        or      eax, 2                 ; AMD K10 or later: use unaligned
-        jmp     Uend
-        
-VIA:    ; Unaligned read is not faster than PALIGNR on VIA Nano 2000 and 3000                
-        cmp     ecx, 0FH
-        jna     Uend                   ; VIA Nano
-        inc     eax                    ; Future versions: unknown
-       ;jmp     Uend
-        
-Uend:   ret
-
-;UnalignedIsFaster ENDP
-
-
-Store256BitIsFaster:
-        call    InstructionSet
-        cmp     eax, 11                ; AVX supported
-        jb      S90
-%ifdef  UNIX
-        push    0                      ; vendor
-        mov     rdi, rsp
-        push    0                      ; family 
-        mov     rsi, rsp
-        push    0                      ; model
-        mov     rdx, rsp 
-%else   ; WINDOWS
-        push    0                      ; vendor
-        mov     rcx, rsp
-        push    0                      ; family 
-        mov     rdx, rsp
-        push    0                      ; model
-        mov     r8,  rsp 
-%endif
-        call    CpuType                ; get vendor, family, model
-        pop     rdx                    ; model
-        pop     rcx                    ; family
-        pop     rax                    ; vendor
-
-        cmp     eax, 1                 ; Intel
-        je      S_Intel
-        cmp     eax, 2                 ; AMD
-        je      S_AMD
-        cmp     eax, 3
-        je      S_VIA        
-        jmp     S91                    ; other vendor, not known
-        
-S_Intel:cmp     ecx, 6
-        jne     S92                    ; unknown family. possibly future model
-        ; model 2AH Sandy Bridge
-        ; model 3AH Ivy Bridge
-        ; model 3CH Haswell
-        ; Sandy Bridge and Ivy Bridge are slightly faster with 128 than with 256 bit moves on large data blocks
-        ; Haswell is much faster with 256 bit moves
-        cmp     edx, 3AH
-        jbe     S90
-        jmp     S92        
-
-S_AMD:  ; AMD
-        cmp     ecx, 15H               ; family 15h = Bulldozer, Piledriver
-        ja      S92                    ; assume future AMD families are faster
-                                       ; family 16H = Jaguar. 256 bit write is slightly faster
-        ; model 1 = Bulldozer is a little slower on 256 bit write
-        ; model 2 = Piledriver is terribly slow on 256 bit write
-        ; assume future models 3-4 are like Bulldozer
-        cmp     edx, 4
-        jbe     S90
-        jmp     S91                    ; later models: don't know
-        
-S_VIA:  jmp     S91                    ; don't know
-        
-S90:    xor     eax, eax               ; return 0
-        ret
-        
-S91:    mov     eax, 1                 ; return 1
-        ret        
-        
-S92:    mov     eax, 2                 ; return 2
-        ret        
-        
-; Store256BitIsFaster ENDP
diff --git a/contrib/libs/asmlib/ya.make b/contrib/libs/asmlib/ya.make
deleted file mode 100644
index 35baa5a7a2..0000000000
--- a/contrib/libs/asmlib/ya.make
+++ /dev/null
@@ -1,110 +0,0 @@
-LIBRARY()
-
-LICENSE(
-    GPL-1.0-or-later AND
-    GPL-2.0-only AND
-    GPL-3.0-or-later AND
-    LGPL-2.0-or-later AND
-    LGPL-3.0-only
-)
-
-VERSION(2016-11-16)
-
-LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-
-ORIGINAL_SOURCE(https://www.agner.org/optimize/)
-
-NO_PLATFORM()
-
-SET(_YASM_PREDEFINED_FLAGS_VALUE "")
-
-IF (ARCH_X86_64)
-    IF (OS_DARWIN)
-        PEERDIR(
-            contrib/libs/asmglibc
-        )
-    ENDIF()
-    IF (NOT OS_DARWIN)
-        SRCS(
-            sfmt64.asm
-            mother64.asm
-            mersenne64.asm
-        )
-    ENDIF()
-    SRCS(
-        debugbreak64.asm
-        cachesize64.asm
-        divfixedi64.asm
-        rdtsc64.asm
-        strcat64.asm
-        unalignedisfaster64.asm
-        strcpy64.asm
-        substring64.asm
-        strlen64.asm
-        cputype64.asm
-        memcmp64.asm
-        memmove64.asm
-        stricmp64.asm
-        divfixedv64.asm
-        physseed64.asm
-        cpuid64.asm
-        round64.asm
-        memcpy64.asm
-        popcount64.asm
-        dispatchpatch64.asm
-        #instrset64.asm
-        procname64.asm
-        memset64.asm
-        #disabled because of protection violation
-        #strcountutf864.asm
-        #strcountset64.asm
-        #strtouplow64.asm
-        #strcmp64.asm
-        #strspn64.asm
-        #strstr64.asm
-    )
-ENDIF()
-
-IF (ARCH_I386)
-    SRCS(
-        debugbreak32.asm
-        cachesize32.asm
-        divfixedi32.asm
-        rdtsc32.asm
-        strcat32.asm
-        unalignedisfaster32.asm
-        strcpy32.asm
-        substring32.asm
-        strlen32.asm
-        cputype32.asm
-        memcmp32.asm
-        memmove32.asm
-        sfmt32.asm
-        stricmp32.asm
-        divfixedv32.asm
-        physseed32.asm
-        cpuid32.asm
-        mother32.asm
-        round32.asm
-        mersenne32.asm
-        memcpy32.asm
-        popcount32.asm
-        dispatchpatch32.asm
-        #instrset32.asm
-        procname32.asm
-        memset32.asm
-        #disabled because of protection violation
-        #strcountutf832.asm
-        #strcountset32.asm
-        #strtouplow32.asm
-        #strcmp32.asm
-        #strspn32.asm
-        #strstr32.asm
-    )
-ENDIF()
-
-SRCS(
-    dummy.c
-)
-
-END()
diff --git a/ydb/apps/ydb/CMakeLists.darwin-x86_64.txt b/ydb/apps/ydb/CMakeLists.darwin-x86_64.txt
index 832d38005c..87e7b9d72a 100644
--- a/ydb/apps/ydb/CMakeLists.darwin-x86_64.txt
+++ b/ydb/apps/ydb/CMakeLists.darwin-x86_64.txt
@@ -19,7 +19,6 @@ target_link_libraries(ydb PUBLIC
   contrib-libs-cxxsupp
   yutil
   library-cpp-cpuid_check
-  contrib-libs-asmlib
   commands
   library-cpp-resource
 )
diff --git a/ydb/apps/ydb/CMakeLists.linux-aarch64.txt b/ydb/apps/ydb/CMakeLists.linux-aarch64.txt
index b4b252b29b..619d67ad35 100644
--- a/ydb/apps/ydb/CMakeLists.linux-aarch64.txt
+++ b/ydb/apps/ydb/CMakeLists.linux-aarch64.txt
@@ -20,7 +20,6 @@ target_link_libraries(ydb PUBLIC
   contrib-libs-linux-headers
   contrib-libs-cxxsupp
   yutil
-  contrib-libs-asmlib
   commands
   library-cpp-resource
 )
diff --git a/ydb/apps/ydb/CMakeLists.linux-x86_64.txt b/ydb/apps/ydb/CMakeLists.linux-x86_64.txt
index c6722e7f54..ab7446e042 100644
--- a/ydb/apps/ydb/CMakeLists.linux-x86_64.txt
+++ b/ydb/apps/ydb/CMakeLists.linux-x86_64.txt
@@ -21,7 +21,6 @@ target_link_libraries(ydb PUBLIC
   contrib-libs-cxxsupp
   yutil
   library-cpp-cpuid_check
-  contrib-libs-asmlib
   commands
   library-cpp-resource
 )
diff --git a/ydb/apps/ydb/CMakeLists.windows-x86_64.txt b/ydb/apps/ydb/CMakeLists.windows-x86_64.txt
index 1919d40b1d..9bba9d0f4b 100644
--- a/ydb/apps/ydb/CMakeLists.windows-x86_64.txt
+++ b/ydb/apps/ydb/CMakeLists.windows-x86_64.txt
@@ -19,7 +19,6 @@ target_link_libraries(ydb PUBLIC
   contrib-libs-cxxsupp
   yutil
   library-cpp-cpuid_check
-  contrib-libs-asmlib
   commands
   library-cpp-resource
 )
diff --git a/ydb/apps/ydb/ya.make b/ydb/apps/ydb/ya.make
index b971d714c3..dc11518ba5 100644
--- a/ydb/apps/ydb/ya.make
+++ b/ydb/apps/ydb/ya.make
@@ -6,10 +6,7 @@ SRCS(
     main.cpp
 )
 
-DISABLE(USE_ASMLIB)
-
 PEERDIR(
-    contrib/libs/asmlib
     ydb/apps/ydb/commands
 )
 
@@ -17,6 +14,15 @@ RESOURCE(
     ydb/apps/ydb/version.txt version.txt
 )
 
+IF (NOT USE_SSE4 AND NOT OPENSOURCE)
+    # contrib/libs/asmglibc cannot be built without SSE4
+    # Replace it with contrib/libs/asmlib which can be built this way.
+    DISABLE(USE_ASMLIB)
+    PEERDIR(
+        contrib/libs/asmlib
+    )
+ENDIF()
+
 #
 # DON'T ALLOW NEW DEPENDENCIES WITHOUT EXPLICIT APPROVE FROM  kikimr-dev@ or fomichev@
 #
author	thegeorg <thegeorg@yandex-team.com>	2023-08-22 18:56:30 +0300
committer	thegeorg <thegeorg@yandex-team.com>	2023-08-22 19:13:38 +0300
commit	769d14120ef8e30363c7dd6870ce1b82552587c3 (patch)
tree	c407d1d3f152b9f6eb13f50abc3f5b06db82f9b3
parent	494eee7cbbaf3e7d71a133c80c96aec26e518c2a (diff)
download	ydb-769d14120ef8e30363c7dd6870ce1b82552587c3.tar.gz