| author | thegeorg <[email protected]> | 2023-08-22 18:56:30 +0300 |
|---|---|---|
| committer | thegeorg <[email protected]> | 2023-08-22 19:13:38 +0300 |
| commit | 769d14120ef8e30363c7dd6870ce1b82552587c3 (patch) | |
| tree | c407d1d3f152b9f6eb13f50abc3f5b06db82f9b3 /contrib/libs/asmlib/memcpy64.asm | |
| parent | 494eee7cbbaf3e7d71a133c80c96aec26e518c2a (diff) | |
Extract asmlib manipulations into separate block
Diffstat (limited to 'contrib/libs/asmlib/memcpy64.asm')
-rw-r--r-- | contrib/libs/asmlib/memcpy64.asm | 1332 |
1 file changed, 0 insertions(+), 1332 deletions(-)
diff --git a/contrib/libs/asmlib/memcpy64.asm b/contrib/libs/asmlib/memcpy64.asm
deleted file mode 100644
index d590990b99d..00000000000
--- a/contrib/libs/asmlib/memcpy64.asm
+++ /dev/null
@@ -1,1332 +0,0 @@
-%include "defs.asm"
-
-;************************* memcpy64.asm ************************************
-; Author: Agner Fog
-; Date created: 2008-07-19
-; Last modified: 2016-11-12 (patched version with AVX512 support removed)
-;
-; Description:
-; Faster version of the standard memcpy function:
-; void * A_memcpy(void *dest, const void *src, size_t count);
-; Copies 'count' bytes from 'src' to 'dest'
-;
-; Overriding standard function memcpy:
-; The alias ?OVR_memcpy is changed to _memcpy in the object file if
-; it is desired to override the standard library function memcpy.
-;
-; The function uses non-temporal writes to bypass the cache when the size is
-; bigger than half the size of the largest-level cache. This limit can be
-; read with GetMemcpyCacheLimit and changed with SetMemcpyCacheLimit.
-; C++ prototypes:
-; extern "C" size_t GetMemcpyCacheLimit(); // in memcpy64.asm
-; extern "C" void SetMemcpyCacheLimit(size_t); // in memmove64.asm
-; extern "C" void SetMemcpyCacheLimit1(size_t); // used internally
-;
-; Position-independent code is generated if POSITIONINDEPENDENT is defined.
-;
-; CPU dispatching includes the SSE2, Suppl-SSE3 and AVX instruction sets.
-;
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
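Editor's note: the header above documents the exported C interface. A minimal usage sketch, assuming the library is linked in; the size_t parameter of SetMemcpyCacheLimit is inferred from SetMemcpyCacheLimit1 below, which reads its argument from rcx/rdi:

```c
#include <stdio.h>
#include <string.h>

/* Prototypes as documented in the header comment above. */
extern void * A_memcpy(void *dest, const void *src, size_t count);
extern size_t GetMemcpyCacheLimit(void);
extern void   SetMemcpyCacheLimit(size_t limit);   /* in memmove64.asm */

int main(void) {
    char src[256], dst[256];
    memset(src, 'x', sizeof src);

    A_memcpy(dst, src, sizeof src);    /* drop-in replacement for memcpy */
    printf("cache bypass limit: %zu bytes\n", GetMemcpyCacheLimit());

    SetMemcpyCacheLimit(1 << 20);      /* use non-temporal stores above 1 MB */
    return 0;
}
```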
-default rel
-
-global A_memcpy: function ; Function A_memcpy
-global EXP(memcpy): function ; ?OVR removed if standard function memcpy overridden
-global memcpySSE2: function ; Version for processors with only SSE2
-global memcpySSSE3: function ; Version for processors with SSSE3
-global memcpyU: function ; Version for processors with fast unaligned read
-global memcpyU256: function ; Version for processors with fast 256-bit read/write
-
-global GetMemcpyCacheLimit: function ; Get the size limit for bypassing cache when copying with memcpy and memmove
-global SetMemcpyCacheLimit1: function ; Set the size limit for bypassing cache when copying with memcpy
-
-
-; Imported from instrset64.asm
-extern InstructionSet ; Instruction set for CPU dispatcher
-
-; Imported from unalignedisfaster64.asm:
-extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
-extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
-
-; Imported from cachesize64.asm:
-extern DataCacheSize ; Gets size of data cache
-
-
-; Define prolog for this function
-%MACRO PROLOGM 0
-%IFDEF WINDOWS
- push rsi
- push rdi
- mov rdi, rcx ; dest
- mov r9, rcx ; dest
- mov rsi, rdx ; src
- mov rcx, r8 ; count
-%ELSE ; Unix
- mov rcx, rdx ; count
- mov r9, rdi ; dest
-%ENDIF
-%ENDM
-
-; Define return from this function
-%MACRO RETURNM 0
-%IFDEF WINDOWS
- pop rdi
- pop rsi
-%ENDIF
- mov rax, r9 ; Return value = dest
- ret
-%ENDM
-
-
-SECTION .text align=16
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Common entry for dispatch
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
-; Function entry:
-A_memcpy:
-EXP(memcpy):
- jmp qword [memcpyDispatch] ; Go to appropriate version, depending on instruction set
-
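Editor's note: the single `jmp qword [memcpyDispatch]` is a self-patching dispatch. `memcpyDispatch` (data section at the bottom of the file) initially points at memcpyCPUDispatch, which probes the CPU, overwrites the pointer, and tail-jumps, so detection runs only once. A C model of the pattern; `__builtin_cpu_supports` is a GCC/Clang builtin standing in for the InstructionSet/UnalignedIsFaster probes:

```c
#include <stddef.h>
#include <string.h>

typedef void *(*memcpy_fn)(void *, const void *, size_t);

static void *dispatch_once(void *d, const void *s, size_t n);

/* Like memcpyDispatch: starts out pointing at the one-time dispatcher. */
static memcpy_fn memcpy_impl = dispatch_once;

/* Stand-ins for memcpySSE2 / memcpyU256; plain memcpy for illustration. */
static void *copy_sse2(void *d, const void *s, size_t n) { return memcpy(d, s, n); }
static void *copy_avx (void *d, const void *s, size_t n) { return memcpy(d, s, n); }

static void *dispatch_once(void *d, const void *s, size_t n) {
    /* Probe once, patch the pointer, then call the chosen version,
       as memcpyCPUDispatch does with 'mov [memcpyDispatch], rbx'. */
    memcpy_impl = __builtin_cpu_supports("avx") ? copy_avx : copy_sse2;
    return memcpy_impl(d, s, n);
}

void *my_memcpy(void *d, const void *s, size_t n) {
    return memcpy_impl(d, s, n);       /* ~ jmp qword [memcpyDispatch] */
}
```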
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; AVX Version for processors with fast unaligned read and fast 32 bytes write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memcpyU256: ; global label
-memcpyU256@: ; local label
- PROLOGM
- cmp rcx, 40H
- jb A1000 ; Use simpler code if count < 64
-
- ; count >= 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 1FH
- jz B3100 ; Skip if dest aligned by 32
-
- ; edx = size of first partial block, 1 - 31 bytes
- test dl, 3
- jz B3030
- test dl, 1
- jz B3020
- ; move 1 byte
- movzx eax, byte [rsi]
- mov [rdi], al
- inc rsi
- inc rdi
-B3020: test dl, 2
- jz B3030
- ; move 2 bytes
- movzx eax, word [rsi]
- mov [rdi], ax
- add rsi, 2
- add rdi, 2
-B3030: test dl, 4
- jz B3040
- ; move 4 bytes
- mov eax, [rsi]
- mov [rdi], eax
- add rsi, 4
- add rdi, 4
-B3040: test dl, 8
- jz B3050
- ; move 8 bytes
- mov rax, [rsi]
- mov [rdi], rax
- add rsi, 8
- add rdi, 8
-B3050: test dl, 16
- jz B3060
- ; move 16 bytes
- movups xmm0, [rsi]
- movaps [rdi], xmm0
- add rsi, 16
- add rdi, 16
-B3060: sub rcx, rdx
-
-B3100: ; Now dest is aligned by 32. Any partial block has been moved
-
- ; Set up for loop moving 32 bytes per iteration:
- mov rdx, rcx ; Save count
- and rcx, -20H ; Round down to nearest multiple of 32
- add rsi, rcx ; Point to the end
- add rdi, rcx ; Point to the end
- sub rdx, rcx ; Remaining data after loop
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja I3100 ; Use non-temporal store if count > CacheBypassLimit
- neg rcx ; Negative index from the end
-
-H3100: ; copy -rcx bytes in blocks of 32 bytes.
-
- ; Check for false memory dependence: The CPU may falsely assume
- ; a partial overlap between the written destination and the following
- ; read source if source is unaligned and
- ; (src-dest) modulo 4096 is close to 4096
- test sil, 1FH
- jz H3110 ; aligned
- mov eax, esi
- sub eax, edi
- and eax, 0FFFH ; modulo 4096
- cmp eax, 1000H - 200H
- ja J3100
-
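Editor's note: the `and eax, 0FFFH` / `cmp eax, 1000H - 200H` pair above implements the 4K-aliasing test described in the comment: the store buffer compares only the low 12 address bits, so a source sitting up to 0x200 bytes below the destination within a page can look like an overlap. The same test as a C sketch:

```c
#include <stdint.h>
#include <stdbool.h>

/* True when (src - dest) mod 4096 lies in (0xE00, 0xFFF], i.e. the
   unaligned source appears to sit just below the destination within a
   4 KB page, risking false store-to-load forwarding conflicts. */
static bool has_false_dependence(const void *src, const void *dest) {
    uint32_t delta = (uint32_t)((uintptr_t)src - (uintptr_t)dest) & 0xFFF;
    return delta > 0x1000 - 0x200;     /* the 'ja J3100' branch above */
}
```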
-align 16
-H3110: ; main copy loop, 32 bytes at a time
- ; rcx has negative index from the end, counting up to zero
- vmovups ymm0, [rsi+rcx]
- vmovaps [rdi+rcx], ymm0
- add rcx, 20H
- jnz H3110
- sfence
- vzeroupper ; end of AVX mode
-
-H3120: ; Move the remaining edx bytes (0 - 31):
- add rsi, rdx
- add rdi, rdx
- neg rdx
- jz H3500 ; Skip if no more data
- ; move 16-8-4-2-1 bytes, aligned
- cmp edx, -10H
- jg H3200
- ; move 16 bytes
- movups xmm0, [rsi+rdx]
- movaps [rdi+rdx], xmm0
- add rdx, 10H
-H3200: cmp edx, -8
- jg H3210
- ; move 8 bytes
- movq xmm0, qword [rsi+rdx]
- movq qword [rdi+rdx], xmm0
- add rdx, 8
- jz H3500 ; Early skip if count divisible by 8
-H3210: cmp edx, -4
- jg H3220
- ; move 4 bytes
- mov eax, [rsi+rdx]
- mov [rdi+rdx], eax
- add rdx, 4
-H3220: cmp edx, -2
- jg H3230
- ; move 2 bytes
- movzx eax, word [rsi+rdx]
- mov [rdi+rdx], ax
- add rdx, 2
-H3230: cmp edx, -1
- jg H3500
- ; move 1 byte
- movzx eax, byte [rsi+rdx]
- mov [rdi+rdx], al
-H3500: ; finished
- RETURNM
-
-I3100: ; non-temporal move
- neg rcx ; Negative index from the end
-
-align 16
-I3110: ; main copy loop, 32 bytes at a time
- ; rcx has negative index from the end, counting up to zero
- vmovups ymm0, [rsi+rcx]
- vmovntps [rdi+rcx], ymm0
- add rcx, 20H
- jnz I3110
- sfence
- vzeroupper ; end of AVX mode
- jmp H3120 ; Move the remaining edx bytes (0 - 31)
-
-
-align 16
-J3100: ; There is a false memory dependence.
- ; check if src and dest overlap, if not then it is safe
- ; to copy backwards to avoid false memory dependence
-%if 1
- ; Use this version if you want consistent behavior in the case
- ; where dest > src and overlap. However, this case is undefined
- ; anyway because part of src is overwritten before copying
- push rdx
- mov rax, rsi
- sub rax, rdi
- cqo
- xor rax, rdx
- sub rax, rdx ; abs(src-dest)
- neg rcx ; size
- pop rdx ; restore rdx
- cmp rax, rcx
- jnb J3110
- neg rcx ; restore rcx
- jmp H3110 ; overlap between src and dest. Can't copy backwards
-%else
- ; save time by not checking the case that is undefined anyway
- mov rax, rsi
- sub rax, rdi
- neg rcx ; size
- cmp rax, rcx
- jnb J3110 ; OK to copy backwards
- ; must copy forwards
- neg rcx ; restore ecx
- jmp H3110 ; copy forwards
-
-%endif
-
-J3110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
- push rsi
- push rdi
- sub rsi, rcx
- sub rdi, rcx
-J3120: ; loop backwards
- vmovups ymm0, [rsi+rcx-20H]
- vmovaps [rdi+rcx-20H], ymm0
- sub rcx, 20H
- jnz J3120
- sfence
- vzeroupper
- pop rdi
- pop rsi
- jmp H3120
-
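Editor's note: the `cqo` / `xor rax, rdx` / `sub rax, rdx` sequence in J3100 is the classic branch-free absolute value; copying backwards is chosen only when abs(src - dest) >= size, i.e. the buffers cannot overlap. The same decision as a C sketch:

```c
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

/* Branch-free abs, mirroring cqo (sign mask) + xor + sub. */
static int64_t abs64(int64_t x) {
    int64_t sign = x >> 63;            /* all zeros or all ones */
    return (x ^ sign) - sign;
}

/* Backwards copy is safe only if the regions are disjoint. */
static bool safe_to_copy_backwards(const void *src, const void *dest,
                                   size_t size) {
    int64_t diff = (int64_t)((intptr_t)src - (intptr_t)dest);
    return (uint64_t)abs64(diff) >= size;   /* 'cmp rax, rcx / jnb J3110' */
}
```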
-align 16
- ; count < 64. Move 32-16-8-4-2-1 bytes
- ; shared by multiple CPU versions (SSSE3 and above)
-A1000: add rsi, rcx ; end of src
- add rdi, rcx ; end of dest
- neg rcx ; negative index from the end
- cmp ecx, -20H
- jg A1100
- ; move 32 bytes
- ; movdqu is faster than 64-bit moves on processors with SSSE3
- movups xmm0, [rsi+rcx]
- movups xmm1, [rsi+rcx+10H]
- movups [rdi+rcx], xmm0
- movups [rdi+rcx+10H], xmm1
- add rcx, 20H
-A1100: cmp ecx, -10H
- jg A1200
- ; move 16 bytes
- movups xmm0, [rsi+rcx]
- movups [rdi+rcx], xmm0
- add rcx, 10H
-A1200: cmp ecx, -8
- jg A1300
- ; move 8 bytes
- mov rax, qword [rsi+rcx]
- mov qword [rdi+rcx], rax
- add rcx, 8
-A1300: cmp ecx, -4
- jg A1400
- ; move 4 bytes
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
- add rcx, 4
- jz A1900 ; early out if count divisible by 4
-A1400: cmp ecx, -2
- jg A1500
- ; move 2 bytes
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
- add rcx, 2
-A1500: cmp ecx, -1
- jg A1900
- ; move 1 byte
- movzx eax, byte [rsi+rcx]
- mov [rdi+rcx], al
-A1900: ; finished
- RETURNM
-
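Editor's note: the A1000 tail above decomposes a small count into at most one move each of 32, 16, 8, 4, 2 and 1 bytes, using a negative index that counts up toward zero so the loop-end test is just the flags from `add`. A C rendering of the same decomposition; memcpy stands in for the individual SSE and integer moves:

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy count < 64 bytes as 32-16-8-4-2-1 byte moves, walking a
   negative index from the end up to zero like the asm above. */
static void small_copy(uint8_t *dest, const uint8_t *src, size_t count) {
    const uint8_t *s = src + count;    /* end of src  */
    uint8_t *d = dest + count;         /* end of dest */
    intptr_t i = -(intptr_t)count;     /* negative index from the end */

    for (size_t step = 32; step != 0; step >>= 1) {
        if (i <= -(intptr_t)step) {    /* 'cmp ecx, -step / jg next' */
            memcpy(d + i, s + i, step);
            i += (intptr_t)step;
        }
    }
}
```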
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Version for processors with fast unaligned read and fast 16 bytes write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memcpyU: ; global label
-memcpyU@: ; local label
- PROLOGM
- cmp rcx, 40H
- jb A1000 ; Use simpler code if count < 64
-
- ; count >= 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 0FH
- jz B2100 ; Skip if dest aligned by 16
-
- ; edx = size of first partial block, 1 - 15 bytes
- test dl, 3
- jz B2030
- test dl, 1
- jz B2020
- ; move 1 byte
- movzx eax, byte [rsi]
- mov [rdi], al
- inc rsi
- inc rdi
-B2020: test dl, 2
- jz B2030
- ; move 2 bytes
- movzx eax, word [rsi]
- mov [rdi], ax
- add rsi, 2
- add rdi, 2
-B2030: test dl, 4
- jz B2040
- ; move 4 bytes
- mov eax, [rsi]
- mov [rdi], eax
- add rsi, 4
- add rdi, 4
-B2040: test dl, 8
- jz B2050
- ; move 8 bytes
- mov rax, [rsi]
- mov [rdi], rax
- add rsi, 8
- add rdi, 8
-B2050: sub rcx, rdx
-B2100: ; Now dest is aligned by 16. Any partial block has been moved
-
- ; Set up for loop moving 32 bytes per iteration:
- mov rdx, rcx ; Save count
- and rcx, -20H ; Round down to nearest multiple of 32
- add rsi, rcx ; Point to the end
- add rdi, rcx ; Point to the end
- sub rdx, rcx ; Remaining data after loop
-
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja I100 ; Use non-temporal store if count > CacheBypassLimit
- neg rcx ; Negative index from the end
-
-H100: ; copy -rcx bytes in blocks of 32 bytes.
-
- ; Check for false memory dependence: The CPU may falsely assume
- ; a partial overlap between the written destination and the following
- ; read source if source is unaligned and
- ; (src-dest) modulo 4096 is close to 4096
- test sil, 0FH
- jz H110 ; aligned
- mov eax, esi
- sub eax, edi
- and eax, 0FFFH ; modulo 4096
- cmp eax, 1000H - 200H
- ja J100
-
-H110: ; main copy loop, 32 bytes at a time
- ; rcx has negative index from the end, counting up to zero
- movups xmm0, [rsi+rcx]
- movups xmm1, [rsi+rcx+10H]
- movaps [rdi+rcx], xmm0
- movaps [rdi+rcx+10H], xmm1
- add rcx, 20H
- jnz H110
-
-H120: ; Move the remaining edx bytes (0 - 31):
- add rsi, rdx
- add rdi, rdx
- neg rdx
- jz H500 ; Skip if no more data
- ; move 16-8-4-2-1 bytes, aligned
- cmp edx, -10H
- jg H200
- ; move 16 bytes
- movups xmm0, [rsi+rdx]
- movaps [rdi+rdx], xmm0
- add rdx, 10H
-H200: cmp edx, -8
- jg H210
- ; move 8 bytes
- movq xmm0, qword [rsi+rdx]
- movq qword [rdi+rdx], xmm0
- add rdx, 8
- jz H500 ; Early skip if count divisible by 8
-H210: cmp edx, -4
- jg H220
- ; move 4 bytes
- mov eax, [rsi+rdx]
- mov [rdi+rdx], eax
- add rdx, 4
-H220: cmp edx, -2
- jg H230
- ; move 2 bytes
- movzx eax, word [rsi+rdx]
- mov [rdi+rdx], ax
- add rdx, 2
-H230: cmp edx, -1
- jg H500
- ; move 1 byte
- movzx eax, byte [rsi+rdx]
- mov [rdi+rdx], al
-H500: ; finished
- RETURNM
-
-I100: ; non-temporal move
- neg rcx ; Negative index from the end
-
-align 16
-I110: ; main copy loop, 32 bytes at a time
- ; rcx has negative index from the end, counting up to zero
- movups xmm0, [rsi+rcx]
- movups xmm1, [rsi+rcx+10H]
- movntps [rdi+rcx], xmm0
- movntps [rdi+rcx+10H], xmm1
- add rcx, 20H
- jnz I110
- sfence
- jmp H120 ; Move the remaining edx bytes (0 - 31):
-
-
-align 16
-J100: ; There is a false memory dependence.
- ; check if src and dest overlap, if not then it is safe
- ; to copy backwards to avoid false memory dependence
-%if 1
- ; Use this version if you want consistent behavior in the case
- ; where dest > src and overlap. However, this case is undefined
- ; anyway because part of src is overwritten before copying
- push rdx
- mov rax, rsi
- sub rax, rdi
- cqo
- xor rax, rdx
- sub rax, rdx ; abs(src-dest)
- neg rcx ; size
- pop rdx ; restore rdx
- cmp rax, rcx
- jnb J110
- neg rcx ; restore rcx
- jmp H110 ; overlap between src and dest. Can't copy backwards
-%else
- ; save time by not checking the case that is undefined anyway
- mov rax, rsi
- sub rax, rdi
- neg rcx ; size
- cmp rax, rcx
- jnb J110 ; OK to copy backwards
- ; must copy forwards
- neg rcx ; restore ecx
- jmp H110 ; copy forwards
-
-%endif
-
-J110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
- push rsi
- push rdi
- sub rsi, rcx
- sub rdi, rcx
-J120: ; loop backwards
- movups xmm1, [rsi+rcx-20H]
- movups xmm0, [rsi+rcx-10H]
- movaps [rdi+rcx-20H], xmm1
- movaps [rdi+rcx-10H], xmm0
- sub rcx, 20H
- jnz J120
- pop rdi
- pop rsi
- jmp H120
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Version for processors with SSSE3. Aligned read + shift + aligned write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memcpySSSE3: ; global label
-memcpySSSE3@: ; local label
- PROLOGM
- cmp rcx, 40H
- jb A1000 ; Use simpler code if count < 64
-
- ; count >= 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 0FH
- jz B1200 ; Skip if dest aligned by 16
-
- ; edx = size of first partial block, 1 - 15 bytes
- test dl, 3
- jz B1030
- test dl, 1
- jz B1020
- ; move 1 byte
- movzx eax, byte [rsi]
- mov [rdi], al
- inc rsi
- inc rdi
-B1020: test dl, 2
- jz B1030
- ; move 2 bytes
- movzx eax, word [rsi]
- mov [rdi], ax
- add rsi, 2
- add rdi, 2
-B1030: test dl, 4
- jz B1040
- ; move 4 bytes
- mov eax, [rsi]
- mov [rdi], eax
- add rsi, 4
- add rdi, 4
-B1040: test dl, 8
- jz B1050
- ; move 8 bytes
- mov rax, [rsi]
- mov [rdi], rax
- add rsi, 8
- add rdi, 8
-B1050: sub rcx, rdx
-B1200: ; Now dest is aligned by 16. Any partial block has been moved
- ; Find alignment of src modulo 16 at this point:
- mov eax, esi
- and eax, 0FH
-
- ; Set up for loop moving 32 bytes per iteration:
- mov edx, ecx ; Save count (lower 32 bits)
- and rcx, -20H ; Round down count to nearest multiple of 32
- add rsi, rcx ; Point to the end
- add rdi, rcx ; Point to the end
- sub edx, ecx ; Remaining data after loop (0-31)
- sub rsi, rax ; Nearest preceding aligned block of src
-
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja B1400 ; Use non-temporal store if count > CacheBypassLimit
- neg rcx ; Negative index from the end
-
- ; Dispatch to different codes depending on src alignment
- lea r8, [AlignmentDispatchSSSE3]
- jmp near [r8+rax*8]
-
-B1400: neg rcx
- ; Dispatch to different codes depending on src alignment
- lea r8, [AlignmentDispatchNT]
- jmp near [r8+rax*8]
-
-
-align 16
-C100: ; Code for aligned src. SSE2 and SSSE3 versions
- ; The nice case, src and dest have same alignment.
-
- ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm0, [rsi+rcx]
- movaps xmm1, [rsi+rcx+10H]
- movaps [rdi+rcx], xmm0
- movaps [rdi+rcx+10H], xmm1
- add rcx, 20H
- jnz C100
-
- ; Move the remaining edx bytes (0 - 31):
- add rsi, rdx
- add rdi, rdx
- neg rdx
- jz C500 ; Skip if no more data
- ; move 16-8-4-2-1 bytes, aligned
- cmp edx, -10H
- jg C200
- ; move 16 bytes
- movaps xmm0, [rsi+rdx]
- movaps [rdi+rdx], xmm0
- add rdx, 10H
-C200: cmp edx, -8
- jg C210
- ; move 8 bytes
- mov rax, [rsi+rdx]
- mov [rdi+rdx], rax
- add rdx, 8
- jz C500 ; Early skip if count divisible by 8
-C210: cmp edx, -4
- jg C220
- ; move 4 bytes
- mov eax, [rsi+rdx]
- mov [rdi+rdx], eax
- add rdx, 4
-C220: cmp edx, -2
- jg C230
- ; move 2 bytes
- movzx eax, word [rsi+rdx]
- mov [rdi+rdx], ax
- add rdx, 2
-C230: cmp edx, -1
- jg C500
- ; move 1 byte
- movzx eax, byte [rsi+rdx]
- mov [rdi+rdx], al
-C500: ; finished
- RETURNM
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Version for processors with SSE2. Aligned read + shift + aligned write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-memcpySSE2: ; global label
-memcpySSE2@: ; local label
- PROLOGM
- cmp rcx, 40H
- jae B0100 ; Jump if count >= 64; simpler code below handles count < 64
-
- ; count < 64. Move 32-16-8-4-2-1 bytes
- add rsi, rcx ; end of src
- add rdi, rcx ; end of dest
- neg rcx ; negative index from the end
- cmp ecx, -20H
- jg A100
- ; move 32 bytes
- ; mov r64 is faster than movdqu on Intel Pentium M and Core 1
- ; movdqu is fast on Nehalem and later
- mov rax, [rsi+rcx]
- mov rdx, [rsi+rcx+8]
- mov [rdi+rcx], rax
- mov [rdi+rcx+8], rdx
- mov rax, qword [rsi+rcx+10H]
- mov rdx, qword [rsi+rcx+18H]
- mov qword [rdi+rcx+10H], rax
- mov qword [rdi+rcx+18H], rdx
- add rcx, 20H
-A100: cmp ecx, -10H
- jg A200
- ; move 16 bytes
- mov rax, [rsi+rcx]
- mov rdx, [rsi+rcx+8]
- mov [rdi+rcx], rax
- mov [rdi+rcx+8], rdx
- add rcx, 10H
-A200: cmp ecx, -8
- jg A300
- ; move 8 bytes
- mov rax, qword [rsi+rcx]
- mov qword [rdi+rcx], rax
- add rcx, 8
-A300: cmp ecx, -4
- jg A400
- ; move 4 bytes
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
- add rcx, 4
- jz A900 ; early out if count divisible by 4
-A400: cmp ecx, -2
- jg A500
- ; move 2 bytes
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
- add rcx, 2
-A500: cmp ecx, -1
- jg A900
- ; move 1 byte
- movzx eax, byte [rsi+rcx]
- mov [rdi+rcx], al
-A900: ; finished
- RETURNM
-
-B0100: ; count >= 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 0FH
- jz B0200 ; Skip if dest aligned by 16
-
- ; edx = size of first partial block, 1 - 15 bytes
- test dl, 3
- jz B0030
- test dl, 1
- jz B0020
- ; move 1 byte
- movzx eax, byte [rsi]
- mov [rdi], al
- inc rsi
- inc rdi
-B0020: test dl, 2
- jz B0030
- ; move 2 bytes
- movzx eax, word [rsi]
- mov [rdi], ax
- add rsi, 2
- add rdi, 2
-B0030: test dl, 4
- jz B0040
- ; move 4 bytes
- mov eax, [rsi]
- mov [rdi], eax
- add rsi, 4
- add rdi, 4
-B0040: test dl, 8
- jz B0050
- ; move 8 bytes
- mov rax, [rsi]
- mov [rdi], rax
- add rsi, 8
- add rdi, 8
-B0050: sub rcx, rdx
-B0200: ; Now dest is aligned by 16. Any partial block has been moved
-
- ; This part will not always work if count < 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 0FH
- jz B300 ; Skip if dest aligned by 16
-
- ; rdx = size of first partial block, 1 - 15 bytes
- add rsi, rdx
- add rdi, rdx
- sub rcx, rdx
- neg rdx
- cmp edx, -8
- jg B200
- ; move 8 bytes
- mov rax, [rsi+rdx]
- mov [rdi+rdx], rax
- add rdx, 8
-B200: cmp edx, -4
- jg B210
- ; move 4 bytes
- mov eax, [rsi+rdx]
- mov [rdi+rdx], eax
- add rdx, 4
- jz B300 ; early out if aligned by 4
-B210: cmp edx, -2
- jg B220
- ; move 2 bytes
- movzx eax, word [rsi+rdx]
- mov [rdi+rdx], ax
- add rdx, 2
-B220: cmp edx, -1
- jg B300
- ; move 1 byte
- movzx eax, byte [rsi+rdx]
- mov [rdi+rdx], al
-
-B300: ; Now dest is aligned by 16. Any partial block has been moved
- ; Find alignment of src modulo 16 at this point:
- mov eax, esi
- and eax, 0FH
-
- ; Set up for loop moving 32 bytes per iteration:
- mov edx, ecx ; Save count (lower 32 bits)
- and rcx, -20H ; Round down count to nearest multiple of 32
- add rsi, rcx ; Point to the end
- add rdi, rcx ; Point to the end
- sub edx, ecx ; Remaining data after loop (0-31)
- sub rsi, rax ; Nearest preceding aligned block of src
-
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja B400 ; Use non-temporal store if count > CacheBypassLimit
- neg rcx ; Negative index from the end
-
- ; Dispatch to different codes depending on src alignment
- lea r8, [AlignmentDispatchSSE2]
- jmp near [r8+rax*8]
-
-B400: neg rcx
- ; Dispatch to different codes depending on src alignment
- lea r8, [AlignmentDispatchNT]
- jmp near [r8+rax*8]
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Macros and alignment jump tables
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Macros for each src alignment, SSE2 instruction set:
-; Make separate code for each alignment u because the shift instructions
-; have the shift count as a constant:
-
-%MACRO MOVE_UNALIGNED_SSE2 2 ; u, nt
-; Move rcx + rdx bytes of data
-; Source is misaligned. (src-dest) modulo 16 = %1
-; %2 = 1 if non-temporal store desired
-; eax = %1
-; rsi = src - %1 = nearest preceding 16-byte boundary
-; rdi = dest (aligned)
-; rcx = - (count rounded down to a multiple of 32)
-; edx = remaining bytes to move after loop
- movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- movdqa xmm2, [rsi+rcx+20H]
- movdqa xmm3, xmm1 ; Copy because used twice
- psrldq xmm0, %1 ; shift right
- pslldq xmm1, 16-%1 ; shift left
- por xmm0, xmm1 ; combine blocks
- %IF %2 == 0
- movdqa [rdi+rcx], xmm0 ; Save aligned
- %ELSE
- movntdq [rdi+rcx], xmm0 ; non-temporal save
- %ENDIF
- movdqa xmm0, xmm2 ; Save for next iteration
- psrldq xmm3, %1 ; shift right
- pslldq xmm2, 16-%1 ; shift left
- por xmm3, xmm2 ; combine blocks
- %IF %2 == 0
- movdqa [rdi+rcx+10H], xmm3 ; Save aligned
- %ELSE
- movntdq [rdi+rcx+10H], xmm3 ; non-temporal save
- %ENDIF
- add rcx, 20H ; Loop through negative values up to zero
- jnz %%L1
-
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-byte block to move
- movdqa xmm1, [rsi+rdx+10H]
- psrldq xmm0, %1 ; shift right
- pslldq xmm1, 16-%1 ; shift left
- por xmm0, xmm1 ; combine blocks
- %IF %2 == 0
- movdqa [rdi+rdx], xmm0 ; Save aligned
- %ELSE
- movntdq [rdi+rdx], xmm0 ; non-temporal save
- %ENDIF
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %2 == 1
- sfence
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
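Editor's note: since `psrldq`/`pslldq` take only immediate shift counts, the macro above is instantiated 15 times further down, once per misalignment u. The combining step, shown here with SSE2 intrinsics for one fixed u (a sketch; u = 5 chosen arbitrarily):

```c
#include <emmintrin.h>   /* SSE2 intrinsics */

#define U 5   /* source misalignment; must be a compile-time constant */

/* Combine two aligned 16-byte loads into one aligned store, as the
   psrldq / pslldq / por triple does in the macro above. 'src16' is
   the nearest preceding 16-byte boundary of the misaligned source. */
static void copy_block(const __m128i *src16, __m128i *dest) {
    __m128i lo = _mm_load_si128(src16);        /* movdqa [rsi+rcx]     */
    __m128i hi = _mm_load_si128(src16 + 1);    /* movdqa [rsi+rcx+10H] */
    __m128i v  = _mm_or_si128(_mm_srli_si128(lo, U),       /* psrldq  */
                              _mm_slli_si128(hi, 16 - U)); /* pslldq  */
    _mm_store_si128(dest, v);                  /* movdqa [rdi+rcx]     */
}
```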
-
-%MACRO MOVE_UNALIGNED_SSE2_4 1 ; nt
-; Special case for u = 4
-; %1 = 1 if non-temporal store desired
- movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- movss xmm0, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
- shufps xmm0, xmm0, 00111001B ; Rotate
- %IF %1 == 0
- movaps [rdi+rcx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm0 ; Non-temporal save
- %ENDIF
- movaps xmm0, [rsi+rcx+20H]
- movss xmm1, xmm0
- shufps xmm1, xmm1, 00111001B
- %IF %1 == 0
- movaps [rdi+rcx+10H], xmm1 ; Save aligned
- %ELSE
- movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
- %ENDIF
- add rcx, 20H ; Loop through negative values up to zero
- jnz %%L1
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-byte block to move
- movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
- movss xmm0, xmm1
- shufps xmm0, xmm0, 00111001B
- %IF %1 == 0
- movaps [rdi+rdx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rdx], xmm0 ; Non-temporal save
- %ENDIF
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %1 == 1
- sfence
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-%MACRO MOVE_UNALIGNED_SSE2_8 1 ; nt
-; Special case for u = 8
-; %1 = 1 if non-temporal store desired
- movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- movsd xmm0, xmm1 ; Moves 8 bytes, leaves remaining bytes unchanged
- shufps xmm0, xmm0, 01001110B ; Rotate
- %IF %1 == 0
- movaps [rdi+rcx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm0 ; Non-temporal save
- %ENDIF
- movaps xmm0, [rsi+rcx+20H]
- movsd xmm1, xmm0
- shufps xmm1, xmm1, 01001110B
- %IF %1 == 0
- movaps [rdi+rcx+10H], xmm1 ; Save aligned
- %ELSE
- movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
- %ENDIF
- add rcx, 20H ; Loop through negative values up to zero
- jnz %%L1
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-byte block to move
- movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
- movsd xmm0, xmm1
- shufps xmm0, xmm0, 01001110B
- %IF %1 == 0
- movaps [rdi+rdx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rdx], xmm0 ; Non-temporal save
- %ENDIF
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %1 == 1
- sfence
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-%MACRO MOVE_UNALIGNED_SSE2_12 1 ; nt
-; Special case for u = 12
-; %1 = 1 if non-temporal store desired
- movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
- shufps xmm0, xmm0, 10010011B
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- movaps xmm2, [rsi+rcx+20H]
- shufps xmm1, xmm1, 10010011B
- shufps xmm2, xmm2, 10010011B
- movaps xmm3, xmm2
- movss xmm2, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
- movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
- %IF %1 == 0
- movaps [rdi+rcx], xmm1 ; Save aligned
- movaps [rdi+rcx+10H], xmm2 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm1 ; Non-temporal save
- movntps [rdi+rcx+10H], xmm2 ; Non-temporal save
- %ENDIF
- movaps xmm0, xmm3 ; Save for next iteration
- add rcx, 20H ; Loop through negative values up to zero
- jnz %%L1
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-byte block to move
- movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
- shufps xmm1, xmm1, 10010011B
- movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
- %IF %1 == 0
- movaps [rdi+rdx], xmm1 ; Save aligned
- %ELSE
- movntps [rdi+rdx], xmm1 ; Non-temporal save
- %ENDIF
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %1 == 1
- sfence
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-; Macros for each src alignment, Suppl.SSE3 instruction set:
-; Make separate code for each alignment u because the palignr instruction
-; has the shift count as a constant:
-
-%MACRO MOVE_UNALIGNED_SSSE3 1 ; u
-; Move rcx + rdx bytes of data
-; Source is misaligned. (src-dest) modulo 16 = %1
-; eax = %1
-; rsi = src - %1 = nearest preceding 16-byte boundary
-; rdi = dest (aligned)
-; rcx = - (count rounded down to a multiple of 32)
-; edx = remaining bytes to move after loop
- movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
-
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movdqa xmm2, [rsi+rcx+10H] ; Read next two blocks
- movdqa xmm3, [rsi+rcx+20H]
- movdqa xmm1, xmm0 ; Save xmm0
- movdqa xmm0, xmm3 ; Save for next iteration
- palignr xmm3, xmm2, %1 ; Combine parts into aligned block
- palignr xmm2, xmm1, %1 ; Combine parts into aligned block
- movdqa [rdi+rcx], xmm2 ; Save aligned
- movdqa [rdi+rcx+10H], xmm3 ; Save aligned
- add rcx, 20H
- jnz %%L1
-
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-byte block to move
- movdqa xmm2, [rsi+rdx+10H]
- palignr xmm2, xmm0, %1
- movdqa [rdi+rdx], xmm2
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- ; Move remaining 0 - 15 bytes
- jmp C200
-%ENDMACRO
-
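Editor's note: `palignr` collapses the psrldq/pslldq/por triple into one instruction, but its shift count is likewise an immediate, hence again one instance per alignment. The equivalent with the SSSE3 intrinsic (a sketch, fixed u = 5):

```c
#include <tmmintrin.h>   /* SSSE3 intrinsics */

#define U 5   /* source misalignment; palignr needs a constant count */

/* _mm_alignr_epi8 concatenates hi:lo and extracts 16 bytes starting
   at byte U, like 'palignr xmm3, xmm2, %1' in the macro above. */
static void copy_block_ssse3(const __m128i *src16, __m128i *dest) {
    __m128i lo = _mm_load_si128(src16);
    __m128i hi = _mm_load_si128(src16 + 1);
    _mm_store_si128(dest, _mm_alignr_epi8(hi, lo, U));
}
```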
-
-; Make 15 instances of the SSE2 macro, one for each value of the alignment u.
-; These are pointed to by the jump table AlignmentDispatchSSE2 below.
-; (Alignments and fillers are inserted manually to minimize the number
-; of 16-byte boundaries inside loops.)
-
-align 16
-D104: MOVE_UNALIGNED_SSE2_4 0
-times 4 nop
-D108: MOVE_UNALIGNED_SSE2_8 0
-times 4 nop
-D10C: MOVE_UNALIGNED_SSE2_12 0
-times 1 nop
-D101: MOVE_UNALIGNED_SSE2 1, 0
-D102: MOVE_UNALIGNED_SSE2 2, 0
-D103: MOVE_UNALIGNED_SSE2 3, 0
-D105: MOVE_UNALIGNED_SSE2 5, 0
-D106: MOVE_UNALIGNED_SSE2 6, 0
-D107: MOVE_UNALIGNED_SSE2 7, 0
-D109: MOVE_UNALIGNED_SSE2 9, 0
-times 1 nop
-D10A: MOVE_UNALIGNED_SSE2 0AH, 0
-D10B: MOVE_UNALIGNED_SSE2 0BH, 0
-D10D: MOVE_UNALIGNED_SSE2 0DH, 0
-D10E: MOVE_UNALIGNED_SSE2 0EH, 0
-D10F: MOVE_UNALIGNED_SSE2 0FH, 0
-
-; Make 15 instances of the Suppl-SSE3 macro, one for each value of the alignment u.
-; These are pointed to by the jump table AlignmentDispatchSSSE3 below
-
-align 16
-E104: MOVE_UNALIGNED_SSSE3 4
-E108: MOVE_UNALIGNED_SSSE3 8
-E10C: MOVE_UNALIGNED_SSSE3 0CH
-E101: MOVE_UNALIGNED_SSSE3 1
-E102: MOVE_UNALIGNED_SSSE3 2
-E103: MOVE_UNALIGNED_SSSE3 3
-E105: MOVE_UNALIGNED_SSSE3 5
-E106: MOVE_UNALIGNED_SSSE3 6
-E107: MOVE_UNALIGNED_SSSE3 7
-E109: MOVE_UNALIGNED_SSSE3 9
-times 1 nop
-E10A: MOVE_UNALIGNED_SSSE3 0AH
-E10B: MOVE_UNALIGNED_SSSE3 0BH
-E10D: MOVE_UNALIGNED_SSSE3 0DH
-E10E: MOVE_UNALIGNED_SSSE3 0EH
-E10F: MOVE_UNALIGNED_SSSE3 0FH
-
-; Codes for non-temporal move. Aligned case first
-
-align 16
-F100: ; Non-temporal move, src and dest have same alignment.
- ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm0, [rsi+rcx] ; Read
- movaps xmm1, [rsi+rcx+10H]
- movntps [rdi+rcx], xmm0 ; Write non-temporal (bypass cache)
- movntps [rdi+rcx+10H], xmm1
- add rcx, 20H
- jnz F100 ; Loop through negative rcx up to zero
-
- ; Move the remaining edx bytes (0 - 31):
- add rsi, rdx
- add rdi, rdx
- neg rdx
- jz C500 ; Skip if no more data
- ; Check if we can move one more 16-byte block
- cmp edx, -10H
- jg C200
- ; move 16 bytes, aligned
- movaps xmm0, [rsi+rdx]
- movntps [rdi+rdx], xmm0
- add rdx, 10H
- sfence
- ; move the remaining 0 - 15 bytes
- jmp C200
-
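Editor's note: F100 above writes with `movntps`, which bypasses the cache hierarchy, and ends with `sfence` to make the weakly-ordered stores globally visible. The same pattern with intrinsics (a sketch; requires a 16-byte-aligned dest and a multiple-of-16 count):

```c
#include <emmintrin.h>   /* SSE2: _mm_stream_si128, _mm_sfence */
#include <stddef.h>

/* Stream 'count' bytes past the cache, as the F100 loop does. */
static void stream_copy(void *dest, const void *src, size_t count) {
    __m128i *d = (__m128i *)dest;                 /* must be 16-byte aligned */
    const __m128i *s = (const __m128i *)src;
    for (size_t i = 0; i < count / 16; i++)
        _mm_stream_si128(d + i, _mm_loadu_si128(s + i));  /* ~ movntps */
    _mm_sfence();    /* order non-temporal stores before returning */
}
```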
-; Make 15 instances of MOVE_UNALIGNED_SSE2 macro for each value of
-; the alignment u.
-; These are pointed to by the jump table AlignmentDispatchNT below
-
-;align 16
-F104: MOVE_UNALIGNED_SSE2_4 1
-F108: MOVE_UNALIGNED_SSE2_8 1
-F10C: MOVE_UNALIGNED_SSE2_12 1
-F101: MOVE_UNALIGNED_SSE2 1, 1
-F102: MOVE_UNALIGNED_SSE2 2, 1
-F103: MOVE_UNALIGNED_SSE2 3, 1
-F105: MOVE_UNALIGNED_SSE2 5, 1
-F106: MOVE_UNALIGNED_SSE2 6, 1
-F107: MOVE_UNALIGNED_SSE2 7, 1
-F109: MOVE_UNALIGNED_SSE2 9, 1
-F10A: MOVE_UNALIGNED_SSE2 0AH, 1
-F10B: MOVE_UNALIGNED_SSE2 0BH, 1
-F10D: MOVE_UNALIGNED_SSE2 0DH, 1
-F10E: MOVE_UNALIGNED_SSE2 0EH, 1
-F10F: MOVE_UNALIGNED_SSE2 0FH, 1
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; CPU dispatcher
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-memcpyCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
- ; This part is executed only once
- push rbx
- push rcx
- push rdx
- push rsi
- push rdi
- push r8
- ; set CacheBypassLimit to half the size of the largest level cache
- call GetMemcpyCacheLimit@
- mov eax, 1
- cpuid ; Get feature flags
- lea rbx, [memcpySSE2@]
- bt ecx, 9 ; Test bit for SupplSSE3
- jnc Q100
- lea rbx, [memcpySSSE3@]
- call UnalignedIsFaster ; Test if unaligned read is faster than aligned read and shift
- test eax, eax
- jz Q100
- lea rbx, [memcpyU@]
- call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
- test eax, eax
- jz Q100
- lea rbx, [memcpyU256@]
-Q100:
- ; Insert appropriate pointer
- mov [memcpyDispatch], rbx
- mov rax, rbx
- pop r8
- pop rdi
- pop rsi
- pop rdx
- pop rcx
- pop rbx
- ; Jump according to the replaced function pointer
- jmp rax
-
-; extern "C" size_t GetMemcpyCacheLimit();
-GetMemcpyCacheLimit:
-GetMemcpyCacheLimit@: ; local limit
- mov rax, [CacheBypassLimit]
- test rax, rax
- jnz U200
- ; Get half the size of the largest level cache
-%ifdef WINDOWS
- xor ecx, ecx ; 0 means largest level cache
-%else
- xor edi, edi ; 0 means largest level cache
-%endif
- call DataCacheSize ; get cache size
- shr rax, 1 ; half the size
- jnz U100
- mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
-U100: mov [CacheBypassLimit], rax
-U200: ret
-
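Editor's note: GetMemcpyCacheLimit computes the threshold lazily: half the size of the largest cache level, or a 4 MB default if DataCacheSize reports nothing. The same logic as a C sketch; DataCacheSize's level-0-means-largest convention is taken from the comments above:

```c
#include <stddef.h>

extern size_t DataCacheSize(int level);   /* 0 = largest level cache */

static size_t cache_bypass_limit;         /* 0 = not computed yet */

static size_t get_memcpy_cache_limit(void) {
    if (cache_bypass_limit == 0) {
        size_t limit = DataCacheSize(0) / 2;  /* half the largest cache */
        if (limit == 0)
            limit = 0x400000;                 /* unknown: default 4 MB */
        cache_bypass_limit = limit;
    }
    return cache_bypass_limit;
}
```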
-; Note: SetMemcpyCacheLimit is defined in memmove64.asm, calling SetMemcpyCacheLimit1
-SetMemcpyCacheLimit1:
-%ifdef WINDOWS
- mov rax, rcx
-%else
- mov rax, rdi
-%endif
- test rax, rax
- jnz U400
- ; zero, means default
- mov [CacheBypassLimit], rax
- call GetMemcpyCacheLimit@
-U400: mov [CacheBypassLimit], rax
- ret
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; getDispatch, for testing only
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-getDispatch:
- mov rax, [memcpyDispatch]
- ret
-
-global getDispatch
-
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; data section. jump tables, dispatch function pointer, cache size
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Data segment must be included in function namespace
-SECTION .data
-align 16
-
-; Jump tables for alignments 0 - 15:
-; The copy routines index into AlignmentDispatchSSE2 or
-; AlignmentDispatchSSSE3 (if Suppl-SSE3 is supported), or into
-; AlignmentDispatchNT when using non-temporal stores.
-
-; Code pointer for each alignment for SSE2 instruction set
-AlignmentDispatchSSE2:
-DQ C100, D101, D102, D103, D104, D105, D106, D107
-DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
-
-; Code pointer for each alignment for Suppl-SSE3 instruction set
-AlignmentDispatchSSSE3:
-DQ C100, E101, E102, E103, E104, E105, E106, E107
-DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
-
-; Code pointer for each alignment for non-temporal store
-AlignmentDispatchNT:
-DQ F100, F101, F102, F103, F104, F105, F106, F107
-DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
-
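Editor's note: each table holds 16 code pointers indexed by the source's residual alignment (`rax = src & 0FH`), consumed by `jmp near [r8+rax*8]` in the SSE2/SSSE3 paths. The pattern as a C sketch with dummy handlers:

```c
#include <stdint.h>
#include <stdio.h>

typedef void (*block_copy_fn)(void);

static void copy_same_alignment(void) { puts("C100: same alignment"); }
static void copy_fixed_shift(void)    { puts("D1xx: fixed-shift loop"); }

int main(void) {
    block_copy_fn table[16];            /* ~ AlignmentDispatchSSE2 */
    table[0] = copy_same_alignment;     /* C100 */
    for (int u = 1; u < 16; u++)
        table[u] = copy_fixed_shift;    /* D101 .. D10F */

    uint8_t buf[64];
    const uint8_t *src = buf + 3;       /* some misaligned source */
    table[(uintptr_t)src & 0xF]();      /* ~ jmp near [r8+rax*8] */
    return 0;
}
```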
-; Pointer to appropriate version.
-; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
-; change this to the appropriate version of memcpy, so that
-; memcpyCPUDispatch is only executed once:
-memcpyDispatch DQ memcpyCPUDispatch
-
-; Bypass cache by using non-temporal moves if count > CacheBypassLimit
-; The optimal value of CacheBypassLimit is difficult to estimate, but
-; a reasonable value is half the size of the largest cache:
-CacheBypassLimit: DQ 0
|