path: root/contrib/libs/asmlib/memcpy64.asm
author    pnv1 <[email protected]>    2023-04-27 19:15:07 +0300
committer pnv1 <[email protected]>    2023-04-27 19:15:07 +0300
commit  a66c59109292f9e0fb44ede41adfdebe569e4df3 (patch)
tree    906b3d10274afd16e8e70c61ff416bff9075422e /contrib/libs/asmlib/memcpy64.asm
parent  9ca91b40d6f45546e20a646d15590c0cc6cc9778 (diff)
Switch to old asmlib to be able to build ydb cli without sse4
Diffstat (limited to 'contrib/libs/asmlib/memcpy64.asm')
-rw-r--r--    contrib/libs/asmlib/memcpy64.asm    1332
1 file changed, 1332 insertions, 0 deletions
diff --git a/contrib/libs/asmlib/memcpy64.asm b/contrib/libs/asmlib/memcpy64.asm
new file mode 100644
index 00000000000..d590990b99d
--- /dev/null
+++ b/contrib/libs/asmlib/memcpy64.asm
@@ -0,0 +1,1332 @@
+%include "defs.asm"
+
+;************************* memcpy64.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2016-11-12 (patched version with AVX512 support removed)
+;
+; Description:
+; Faster version of the standard memcpy function:
+; void * A_memcpy(void *dest, const void *src, size_t count);
+; Copies 'count' bytes from 'src' to 'dest'
+;
+; Overriding standard function memcpy:
+; The alias ?OVR_memcpy is changed to _memcpy in the object file if
+; it is desired to override the standard library function memcpy.
+;
+; The function uses non-temporal writes to bypass the cache when the size is
+; bigger than half the size of the largest-level cache. This limit can be
+; read with GetMemcpyCacheLimit and changed with SetMemcpyCacheLimit.
+; C++ prototypes:
+; extern "C" size_t GetMemcpyCacheLimit(); // in memcpy64.asm
+; extern "C" void SetMemcpyCacheLimit(); // in memmove64.asm
+; extern "C" void SetMemcpyCacheLimit1(); // used internally
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching includes SSE2, Suppl-SSE3 and AVX instruction sets.
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
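+
+; Illustrative C++ usage (a minimal sketch based on the prototypes above;
+; not part of the original file):
+;
+;   extern "C" void * A_memcpy(void *dest, const void *src, size_t count);
+;   extern "C" size_t GetMemcpyCacheLimit();
+;
+;   char buf1[256], buf2[256];
+;   A_memcpy(buf2, buf1, sizeof buf2);      // returns buf2
+;   size_t limit = GetMemcpyCacheLimit();   // current non-temporal threshold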
+
+default rel
+
+global A_memcpy: function ; Function A_memcpy
+global EXP(memcpy): function ; ?OVR removed if standard function memcpy overridden
+global memcpySSE2: function ; Version for processors with only SSE2
+global memcpySSSE3: function ; Version for processors with SSSE3
+global memcpyU: function ; Version for processors with fast unaligned read
+global memcpyU256: function ; Version for processors with fast 256-bit read/write
+
+global GetMemcpyCacheLimit: function ; Get the size limit for bypassing cache when copying with memcpy and memmove
+global SetMemcpyCacheLimit1: function ; Set the size limit for bypassing cache when copying with memcpy
+
+
+; Imported from instrset64.asm
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster64.asm:
+extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
+extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Imported from cachesize64.asm:
+extern DataCacheSize ; Gets size of data cache
+
+
+; Define prolog for this function
+%MACRO PROLOGM 0
+%IFDEF WINDOWS
+ push rsi
+ push rdi
+ mov rdi, rcx ; dest
+ mov r9, rcx ; dest
+ mov rsi, rdx ; src
+ mov rcx, r8 ; count
+%ELSE ; Unix
+ mov rcx, rdx ; count
+ mov r9, rdi ; dest
+%ENDIF
+%ENDM
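+
+; Note: Windows passes dest, src, count in rcx, rdx, r8; System V (Unix)
+; passes them in rdi, rsi, rdx. PROLOGM normalizes both conventions to
+; rsi = src, rdi = dest, rcx = count and keeps dest in r9, which RETURNM
+; returns in rax.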
+
+; Define return from this function
+%MACRO RETURNM 0
+%IFDEF WINDOWS
+ pop rdi
+ pop rsi
+%ENDIF
+ mov rax, r9 ; Return value = dest
+ ret
+%ENDM
+
+
+SECTION .text align=16
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
+; Function entry:
+A_memcpy:
+EXP(memcpy):
+ jmp qword [memcpyDispatch] ; Go to appropriate version, depending on instruction set
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memcpyU256: ; global label
+memcpyU256@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 1FH
+ jz B3100 ; Skip if dest aligned by 32
+
+ ; edx = size of first partial block, 1 - 31 bytes
+ test dl, 3
+ jz B3030
+ test dl, 1
+ jz B3020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B3020: test dl, 2
+ jz B3030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B3030: test dl, 4
+ jz B3040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B3040: test dl, 8
+ jz B3050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B3050: test dl, 16
+ jz B3060
+ ; move 16 bytes
+ movups xmm0, [rsi]
+ movaps [rdi], xmm0
+ add rsi, 16
+ add rdi, 16
+B3060: sub rcx, rdx
+
+B3100: ; Now dest is aligned by 32. Any partial block has been moved
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov rdx, rcx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub rdx, rcx ; Remaining data after loop
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja I3100 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
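+ ; Example: if 200 bytes remain once dest is 32-byte aligned, rcx is
+ ; rounded down to 192, rdx = 8 bytes are left for the tail, rsi/rdi are
+ ; advanced past the 192-byte region, and rcx counts up from -192 to 0 so
+ ; that [rsi+rcx] walks forward through the data.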
+
+H3100: ; copy -rcx bytes in blocks of 32 bytes.
+
+ ; Check for false memory dependence: The CPU may falsely assume
+ ; a partial overlap between the written destination and the following
+ ; read source if source is unaligned and
+ ; (src-dest) modulo 4096 is close to 4096
+ test sil, 1FH
+ jz H3110 ; aligned
+ mov eax, esi
+ sub eax, edi
+ and eax, 0FFFH ; modulo 4096
+ cmp eax, 1000H - 200H
+ ja J3100
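+ ; Example: with src = dest + 0F80H and src unaligned, each load from src
+ ; has the same low 12 address bits as a store to dest issued a few
+ ; iterations earlier, so the CPU may falsely predict a dependence and
+ ; stall; J3100 copies backwards (when src and dest do not truly overlap)
+ ; to avoid this.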
+
+align 16
+H3110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ vmovups ymm0, [rsi+rcx]
+ vmovaps [rdi+rcx], ymm0
+ add rcx, 20H
+ jnz H3110
+ sfence
+ vzeroupper ; end of AVX mode
+
+H3120: ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz H3500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg H3200
+ ; move 16 bytes
+ movups xmm0, [rsi+rdx]
+ movaps [rdi+rdx], xmm0
+ add rdx, 10H
+H3200: cmp edx, -8
+ jg H3210
+ ; move 8 bytes
+ movq xmm0, qword [rsi+rdx]
+ movq qword [rdi+rdx], xmm0
+ add rdx, 8
+ jz H500 ; Early skip if count divisible by 8
+H3210: cmp edx, -4
+ jg H3220
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+H3220: cmp edx, -2
+ jg H3230
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+H3230: cmp edx, -1
+ jg H3500
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+H3500: ; finished
+ RETURNM
+
+I3100: ; non-temporal move
+ neg rcx ; Negative index from the end
+
+align 16
+I3110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ vmovups ymm0, [rsi+rcx]
+ vmovntps [rdi+rcx], ymm0
+ add rcx, 20H
+ jnz I3110
+ sfence
+ vzeroupper ; end of AVX mode
+ jmp H3120 ; Move the remaining edx bytes (0 - 31)
+
+
+align 16
+J3100: ; There is a false memory dependence.
+ ; check if src and dest overlap, if not then it is safe
+ ; to copy backwards to avoid false memory dependence
+%if 1
+ ; Use this version if you want consistent behavior in the case
+ ; where dest > src and overlap. However, this case is undefined
+ ; anyway because part of src is overwritten before copying
+ push rdx
+ mov rax, rsi
+ sub rax, rdi
+ cqo
+ xor rax, rdx
+ sub rax, rdx ; abs(src-dest)
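+ ; (cqo copies the sign of rax into rdx; the xor/sub pair is a branchless
+ ; absolute value, so rax now holds |src - dest|)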
+ neg rcx ; size
+ pop rdx ; restore rdx
+ cmp rax, rcx
+ jnb J3110
+ neg rcx ; restore rcx
+ jmp H3110 ; overlap between src and dest. Can't copy backwards
+%else
+ ; save time by not checking the case that is undefined anyway
+ mov rax, rsi
+ sub rax, rdi
+ neg rcx ; size
+ cmp rax, rcx
+ jnb J3110 ; OK to copy backwards
+ ; must copy forwards
+ neg rcx ; restore ecx
+ jmp H3110 ; copy forwards
+
+%endif
+
+J3110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
+ push rsi
+ push rdi
+ sub rsi, rcx
+ sub rdi, rcx
+J3120: ; loop backwards
+ vmovups ymm0, [rsi+rcx-20H]
+ vmovaps [rdi+rcx-20H], ymm0
+ sub rcx, 20H
+ jnz J3120
+ sfence
+ vzeroupper
+ pop rdi
+ pop rsi
+ jmp H3120
+
+align 16
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ ; multiple CPU versions (SSSE3 and above)
+A1000: add rsi, rcx ; end of src
+ add rdi, rcx ; end of dest
+ neg rcx ; negative index from the end
+ cmp ecx, -20H
+ jg A1100
+ ; move 32 bytes
+ ; movdqu is faster than 64-bit moves on processors with SSSE3
+ movups xmm0, [rsi+rcx]
+ movups xmm1, [rsi+rcx+10H]
+ movups [rdi+rcx], xmm0
+ movups [rdi+rcx+10H], xmm1
+ add rcx, 20H
+A1100: cmp ecx, -10H
+ jg A1200
+ ; move 16 bytes
+ movups xmm0, [rsi+rcx]
+ movups [rdi+rcx], xmm0
+ add rcx, 10H
+A1200: cmp ecx, -8
+ jg A1300
+ ; move 8 bytes
+ mov rax, qword [rsi+rcx]
+ mov qword [rdi+rcx], rax
+ add rcx, 8
+A1300: cmp ecx, -4
+ jg A1400
+ ; move 4 bytes
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ add rcx, 4
+ jz A1900 ; early out if count divisible by 4
+A1400: cmp ecx, -2
+ jg A1500
+ ; move 2 bytes
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+ add rcx, 2
+A1500: cmp ecx, -1
+ jg A1900
+ ; move 1 byte
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+A1900: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with fast unaligned read and fast 16 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memcpyU: ; global label
+memcpyU@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B2100 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ test dl, 3
+ jz B2030
+ test dl, 1
+ jz B2020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B2020: test dl, 2
+ jz B2030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B2030: test dl, 4
+ jz B2040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B2040: test dl, 8
+ jz B2050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B2050: sub rcx, rdx
+B2100: ; Now dest is aligned by 16. Any partial block has been moved
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov rdx, rcx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub rdx, rcx ; Remaining data after loop
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja I100 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+H100: ; copy -rcx bytes in blocks of 32 bytes.
+
+ ; Check for false memory dependence: The CPU may falsely assume
+ ; a partial overlap between the written destination and the following
+ ; read source if source is unaligned and
+ ; (src-dest) modulo 4096 is close to 4096
+ test sil, 0FH
+ jz H110 ; aligned
+ mov eax, esi
+ sub eax, edi
+ and eax, 0FFFH ; modulo 4096
+ cmp eax, 1000H - 200H
+ ja J100
+
+H110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ movups xmm0, [rsi+rcx]
+ movups xmm1, [rsi+rcx+10H]
+ movaps [rdi+rcx], xmm0
+ movaps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz H110
+
+H120: ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz H500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg H200
+ ; move 16 bytes
+ movups xmm0, [rsi+rdx]
+ movaps [rdi+rdx], xmm0
+ add rdx, 10H
+H200: cmp edx, -8
+ jg H210
+ ; move 8 bytes
+ movq xmm0, qword [rsi+rdx]
+ movq qword [rdi+rdx], xmm0
+ add rdx, 8
+ jz H500 ; Early skip if count divisible by 8
+H210: cmp edx, -4
+ jg H220
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+H220: cmp edx, -2
+ jg H230
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+H230: cmp edx, -1
+ jg H500
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+H500: ; finished
+ RETURNM
+
+I100: ; non-temporal move
+ neg rcx ; Negative index from the end
+
+align 16
+I110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ movups xmm0, [rsi+rcx]
+ movups xmm1, [rsi+rcx+10H]
+ movntps [rdi+rcx], xmm0
+ movntps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz I110
+ sfence
+ jmp H120 ; Move the remaining edx bytes (0 - 31):
+
+
+align 16
+J100: ; There is a false memory dependence.
+ ; check if src and dest overlap, if not then it is safe
+ ; to copy backwards to avoid false memory dependence
+%if 1
+ ; Use this version if you want consistent behavior in the case
+ ; where dest > src and overlap. However, this case is undefined
+ ; anyway because part of src is overwritten before copying
+ push rdx
+ mov rax, rsi
+ sub rax, rdi
+ cqo
+ xor rax, rdx
+ sub rax, rdx ; abs(src-dest)
+ neg rcx ; size
+ pop rdx ; restore rdx
+ cmp rax, rcx
+ jnb J110
+ neg rcx ; restore rcx
+ jmp H110 ; overlap between src and dest. Can't copy backwards
+%else
+ ; save time by not checking the case that is undefined anyway
+ mov rax, rsi
+ sub rax, rdi
+ neg rcx ; size
+ cmp rax, rcx
+ jnb J110 ; OK to copy backwards
+ ; must copy forwards
+ neg rcx ; restore ecx
+ jmp H110 ; copy forwards
+
+%endif
+
+J110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
+ push rsi
+ push rdi
+ sub rsi, rcx
+ sub rdi, rcx
+J120: ; loop backwards
+ movups xmm1, [rsi+rcx-20H]
+ movups xmm0, [rsi+rcx-10H]
+ movaps [rdi+rcx-20H], xmm1
+ movaps [rdi+rcx-10H], xmm0
+ sub rcx, 20H
+ jnz J120
+ pop rdi
+ pop rsi
+ jmp H120
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memcpySSSE3: ; global label
+memcpySSSE3@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B1200 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ test dl, 3
+ jz B1030
+ test dl, 1
+ jz B1020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B1020: test dl, 2
+ jz B1030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B1030: test dl, 4
+ jz B1040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B1040: test dl, 8
+ jz B1050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B1050: sub rcx, rdx
+B1200: ; Now dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of src modulo 16 at this point:
+ mov eax, esi
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count (lower 32 bits)
+ and rcx, -20H ; Round down count to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub edx, ecx ; Remaining data after loop (0-31)
+ sub rsi, rax ; Nearest preceding aligned block of src
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B1400 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchSSSE3]
+ jmp near [r8+rax*8]
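+ ; (rax = src modulo 16, computed at B1200, indexes the table: entry 0 is
+ ; the aligned loop C100, entries 1-15 select the E1xx block expanded with
+ ; that palignr count)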
+
+B1400: neg rcx
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchNT]
+ jmp near [r8+rax*8]
+
+
+align 16
+C100: ; Code for aligned src. SSE2 and SSSE3 versions
+ ; The nice case, src and dest have same alignment.
+
+ ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm0, [rsi+rcx]
+ movaps xmm1, [rsi+rcx+10H]
+ movaps [rdi+rcx], xmm0
+ movaps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz C100
+
+ ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz C500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg C200
+ ; move 16 bytes
+ movaps xmm0, [rsi+rdx]
+ movaps [rdi+rdx], xmm0
+ add rdx, 10H
+C200: cmp edx, -8
+ jg C210
+ ; move 8 bytes
+ mov rax, [rsi+rdx]
+ mov [rdi+rdx], rax
+ add rdx, 8
+ jz C500 ; Early skip if count divisible by 8
+C210: cmp edx, -4
+ jg C220
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+C220: cmp edx, -2
+ jg C230
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+C230: cmp edx, -1
+ jg C500
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+C500: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+memcpySSE2: ; global label
+memcpySSE2@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jae B0100 ; Jump if count >= 64; otherwise fall through to simpler code
+
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ add rsi, rcx ; end of src
+ add rdi, rcx ; end of dest
+ neg rcx ; negative index from the end
+ cmp ecx, -20H
+ jg A100
+ ; move 32 bytes
+ ; mov r64 is faster than movdqu on Intel Pentium M and Core 1
+ ; movdqu is fast on Nehalem and later
+ mov rax, [rsi+rcx]
+ mov rdx, [rsi+rcx+8]
+ mov [rdi+rcx], rax
+ mov [rdi+rcx+8], rdx
+ mov rax, qword [rsi+rcx+10H]
+ mov rdx, qword [rsi+rcx+18H]
+ mov qword [rdi+rcx+10H], rax
+ mov qword [rdi+rcx+18H], rdx
+ add rcx, 20H
+A100: cmp ecx, -10H
+ jg A200
+ ; move 16 bytes
+ mov rax, [rsi+rcx]
+ mov rdx, [rsi+rcx+8]
+ mov [rdi+rcx], rax
+ mov [rdi+rcx+8], rdx
+ add rcx, 10H
+A200: cmp ecx, -8
+ jg A300
+ ; move 8 bytes
+ mov rax, qword [rsi+rcx]
+ mov qword [rdi+rcx], rax
+ add rcx, 8
+A300: cmp ecx, -4
+ jg A400
+ ; move 4 bytes
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ add rcx, 4
+ jz A900 ; early out if count divisible by 4
+A400: cmp ecx, -2
+ jg A500
+ ; move 2 bytes
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+ add rcx, 2
+A500: cmp ecx, -1
+ jg A900
+ ; move 1 byte
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+A900: ; finished
+ RETURNM
+
+B0100: ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B0200 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ test dl, 3
+ jz B0030
+ test dl, 1
+ jz B0020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B0020: test dl, 2
+ jz B0030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B0030: test dl, 4
+ jz B0040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B0040: test dl, 8
+ jz B0050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B0050: sub rcx, rdx
+B0200: ; Now dest is aligned by 16. Any partial block has been moved
+
+ ; This part will not always work if count < 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B300 ; Skip if dest aligned by 16
+
+ ; rdx = size of first partial block, 1 - 15 bytes
+ add rsi, rdx
+ add rdi, rdx
+ sub rcx, rdx
+ neg rdx
+ cmp edx, -8
+ jg B200
+ ; move 8 bytes
+ mov rax, [rsi+rdx]
+ mov [rdi+rdx], rax
+ add rdx, 8
+B200: cmp edx, -4
+ jg B210
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+ jz B300 ; early out if aligned by 4
+B210: cmp edx, -2
+ jg B220
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+B220: cmp edx, -1
+ jg B300
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+
+B300: ; Now dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of src modulo 16 at this point:
+ mov eax, esi
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count (lower 32 bits)
+ and rcx, -20H ; Round down count to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub edx, ecx ; Remaining data after loop (0-31)
+ sub rsi, rax ; Nearest preceding aligned block of src
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B400 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchSSE2]
+ jmp near [r8+rax*8]
+
+B400: neg rcx
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchNT]
+ jmp near [r8+rax*8]
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
+
+%MACRO MOVE_UNALIGNED_SSE2 2 ; u, nt
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
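+; Example for u = 5: xmm0 holds src bytes rcx-5 .. rcx+10; psrldq xmm0, 5
+; drops the five stale low bytes, pslldq xmm1, 11 moves the next block's
+; first five bytes into the high positions, and por assembles the 16 bytes
+; that belong at [rdi+rcx].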
+ movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movdqa xmm2, [rsi+rcx+20H]
+ movdqa xmm3, xmm1 ; Copy because used twice
+ psrldq xmm0, %1 ; shift right
+ pslldq xmm1, 16-%1 ; shift left
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx], xmm0 ; non-temporal save
+ %ENDIF
+ movdqa xmm0, xmm2 ; Save for next iteration
+ psrldq xmm3, %1 ; shift right
+ pslldq xmm2, 16-%1 ; shift left
+ por xmm3, xmm2 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx+10H], xmm3 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx+10H], xmm3 ; non-temporal save
+ %ENDIF
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movdqa xmm1, [rsi+rdx+10H]
+ psrldq xmm0, %1 ; shift right
+ pslldq xmm1, 16-%1 ; shift left
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rdx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rdx], xmm0 ; non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %2 == 1
+ sfence
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_UNALIGNED_SSE2_4 1 ; nt
+; Special case for u = 4
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movss xmm0, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
+ shufps xmm0, xmm0, 00111001B ; Rotate
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx+20H]
+ movss xmm1, xmm0
+ shufps xmm1, xmm1, 00111001B
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
+ %ENDIF
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
+ movss xmm0, xmm1
+ shufps xmm0, xmm0, 00111001B
+ %IF %1 == 0
+ movaps [rdi+rdx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rdx], xmm0 ; Non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_UNALIGNED_SSE2_8 1 ; nt
+; Special case for u = 8
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movsd xmm0, xmm1 ; Moves 8 bytes, leaves remaining bytes unchanged
+ shufps xmm0, xmm0, 01001110B ; Rotate
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx+20H]
+ movsd xmm1, xmm0
+ shufps xmm1, xmm1, 01001110B
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
+ %ENDIF
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
+ movsd xmm0, xmm1
+ shufps xmm0, xmm0, 01001110B
+ %IF %1 == 0
+ movaps [rdi+rdx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rdx], xmm0 ; Non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_UNALIGNED_SSE2_12 1 ; nt
+; Special case for u = 12
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+ shufps xmm0, xmm0, 10010011B
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movaps xmm2, [rsi+rcx+20H]
+ shufps xmm1, xmm1, 10010011B
+ shufps xmm2, xmm2, 10010011B
+ movaps xmm3, xmm2
+ movss xmm2, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
+ movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ movaps [rdi+rcx+10H], xmm2 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ movntps [rdi+rcx+10H], xmm2 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, xmm3 ; Save for next iteration
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
+ shufps xmm1, xmm1, 10010011B
+ movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
+ %IF %1 == 0
+ movaps [rdi+rdx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rdx], xmm1 ; Non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ %IF %1 == 1
+ sfence
+ %ENDIF
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO MOVE_UNALIGNED_SSSE3 1 ; u
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
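+; palignr xmm2, xmm1, u concatenates xmm1 (low half) and xmm2 (high half)
+; and takes 16 bytes starting at byte u, replacing the psrldq/pslldq/por
+; sequence of the SSE2 macro with one instruction per block.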
+ movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movdqa xmm2, [rsi+rcx+10H] ; Read next two blocks
+ movdqa xmm3, [rsi+rcx+20H]
+ movdqa xmm1, xmm0 ; Save xmm0
+ movdqa xmm0, xmm3 ; Save for next iteration
+ palignr xmm3, xmm2, %1 ; Combine parts into aligned block
+ palignr xmm2, xmm1, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx], xmm2 ; Save aligned
+ movdqa [rdi+rcx+10H], xmm3 ; Save aligned
+ add rcx, 20H
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movdqa xmm2, [rsi+rdx+10H]
+ palignr xmm2, xmm0, %1
+ movdqa [rdi+rdx], xmm2
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes
+ jmp C200
+%ENDMACRO
+
+
+; Make 15 instances of the SSE2 macro, one for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSE2 below
+; (alignments and fillers are inserted manually to minimize the number
+; of 16-byte boundaries inside loops)
+
+align 16
+D104: MOVE_UNALIGNED_SSE2_4 0
+times 4 nop
+D108: MOVE_UNALIGNED_SSE2_8 0
+times 4 nop
+D10C: MOVE_UNALIGNED_SSE2_12 0
+times 1 nop
+D101: MOVE_UNALIGNED_SSE2 1, 0
+D102: MOVE_UNALIGNED_SSE2 2, 0
+D103: MOVE_UNALIGNED_SSE2 3, 0
+D105: MOVE_UNALIGNED_SSE2 5, 0
+D106: MOVE_UNALIGNED_SSE2 6, 0
+D107: MOVE_UNALIGNED_SSE2 7, 0
+D109: MOVE_UNALIGNED_SSE2 9, 0
+times 1 nop
+D10A: MOVE_UNALIGNED_SSE2 0AH, 0
+D10B: MOVE_UNALIGNED_SSE2 0BH, 0
+D10D: MOVE_UNALIGNED_SSE2 0DH, 0
+D10E: MOVE_UNALIGNED_SSE2 0EH, 0
+D10F: MOVE_UNALIGNED_SSE2 0FH, 0
+
+; Make 15 instances of the Suppl-SSE3 macro, one for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSSE3 below
+
+align 16
+E104: MOVE_UNALIGNED_SSSE3 4
+E108: MOVE_UNALIGNED_SSSE3 8
+E10C: MOVE_UNALIGNED_SSSE3 0CH
+E101: MOVE_UNALIGNED_SSSE3 1
+E102: MOVE_UNALIGNED_SSSE3 2
+E103: MOVE_UNALIGNED_SSSE3 3
+E105: MOVE_UNALIGNED_SSSE3 5
+E106: MOVE_UNALIGNED_SSSE3 6
+E107: MOVE_UNALIGNED_SSSE3 7
+E109: MOVE_UNALIGNED_SSSE3 9
+times 1 nop
+E10A: MOVE_UNALIGNED_SSSE3 0AH
+E10B: MOVE_UNALIGNED_SSSE3 0BH
+E10D: MOVE_UNALIGNED_SSSE3 0DH
+E10E: MOVE_UNALIGNED_SSSE3 0EH
+E10F: MOVE_UNALIGNED_SSSE3 0FH
+
+; Codes for non-temporal move. Aligned case first
+
+align 16
+F100: ; Non-temporal move, src and dest have same alignment.
+ ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm0, [rsi+rcx] ; Read
+ movaps xmm1, [rsi+rcx+10H]
+ movntps [rdi+rcx], xmm0 ; Write non-temporal (bypass cache)
+ movntps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz F100 ; Loop through negative rcx up to zero
+
+ ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz C500 ; Skip if no more data
+ ; Check if we can move one more 16-byte block
+ cmp edx, -10H
+ jg C200
+ ; move 16 bytes, aligned
+ movaps xmm0, [rsi+rdx]
+ movntps [rdi+rdx], xmm0
+ add rdx, 10H
+ sfence
+ ; move the remaining 0 - 15 bytes
+ jmp C200
+
+; Make 15 instances of the unaligned SSE2 macros with non-temporal stores,
+; one for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchNT below
+
+;align 16
+F104: MOVE_UNALIGNED_SSE2_4 1
+F108: MOVE_UNALIGNED_SSE2_8 1
+F10C: MOVE_UNALIGNED_SSE2_12 1
+F101: MOVE_UNALIGNED_SSE2 1, 1
+F102: MOVE_UNALIGNED_SSE2 2, 1
+F103: MOVE_UNALIGNED_SSE2 3, 1
+F105: MOVE_UNALIGNED_SSE2 5, 1
+F106: MOVE_UNALIGNED_SSE2 6, 1
+F107: MOVE_UNALIGNED_SSE2 7, 1
+F109: MOVE_UNALIGNED_SSE2 9, 1
+F10A: MOVE_UNALIGNED_SSE2 0AH, 1
+F10B: MOVE_UNALIGNED_SSE2 0BH, 1
+F10D: MOVE_UNALIGNED_SSE2 0DH, 1
+F10E: MOVE_UNALIGNED_SSE2 0EH, 1
+F10F: MOVE_UNALIGNED_SSE2 0FH, 1
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+memcpyCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
+ ; This part is executed only once
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ push r8
+ ; set CacheBypassLimit to half the size of the largest level cache
+ call GetMemcpyCacheLimit@
+ mov eax, 1
+ cpuid ; Get feature flags
+ lea rbx, [memcpySSE2@]
+ bt ecx, 9 ; Test bit for SupplSSE3
+ jnc Q100
+ lea rbx, [memcpySSSE3@]
+ call UnalignedIsFaster ; Test if unaligned read is faster than aligned read and shift
+ test eax, eax
+ jz Q100
+ lea rbx, [memcpyU@]
+ call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
+ test eax, eax
+ jz Q100
+ lea rbx, [memcpyU256@]
+Q100:
+ ; Insert appropriate pointer
+ mov [memcpyDispatch], rbx
+ mov rax, rbx
+ pop r8
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ ; Jump according to the replaced function pointer
+ jmp rax
+
+; extern "C" size_t GetMemcpyCacheLimit();
+GetMemcpyCacheLimit:
+GetMemcpyCacheLimit@: ; local limit
+ mov rax, [CacheBypassLimit]
+ test rax, rax
+ jnz U200
+ ; Get half the size of the largest level cache
+%ifdef WINDOWS
+ xor ecx, ecx ; 0 means largest level cache
+%else
+ xor edi, edi ; 0 means largest level cache
+%endif
+ call DataCacheSize ; get cache size
+ shr rax, 1 ; half the size
+ jnz U100
+ mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
+U100: mov [CacheBypassLimit], rax
+U200: ret
+
+; Note: SetMemcpyCacheLimit is defined in memmove64.asm, calling SetMemcpyCacheLimit1
+SetMemcpyCacheLimit1:
+%ifdef WINDOWS
+ mov rax, rcx
+%else
+ mov rax, rdi
+%endif
+ test rax, rax
+ jnz U400
+ ; zero, means default
+ mov [CacheBypassLimit], rax
+ call GetMemcpyCacheLimit@
+U400: mov [CacheBypassLimit], rax
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; getDispatch, for testing only
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+getDispatch:
+ mov rax, [memcpyDispatch]
+ ret
+
+global getDispatch
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; The copy routines dispatch through AlignmentDispatchSSE2 or
+; AlignmentDispatchSSSE3, depending on whether Suppl-SSE3 is supported,
+; and through AlignmentDispatchNT for non-temporal stores.
+
+; Code pointer for each alignment for SSE2 instruction set
+AlignmentDispatchSSE2:
+DQ C100, D101, D102, D103, D104, D105, D106, D107
+DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
+
+; Code pointer for each alignment for Suppl-SSE3 instruction set
+AlignmentDispatchSSSE3:
+DQ C100, E101, E102, E103, E104, E105, E106, E107
+DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
+
+; Code pointer for each alignment for non-temporal store
+AlignmentDispatchNT:
+DQ F100, F101, F102, F103, F104, F105, F106, F107
+DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
+
+; Pointer to appropriate version.
+; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
+; change this to the appropriate version of memcpy, so that
+; memcpyCPUDispatch is only executed once:
+memcpyDispatch DQ memcpyCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > CacheBypassLimit
+; The optimal value of CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache:
+CacheBypassLimit: DQ 0