| author | thegeorg <[email protected]> | 2023-08-22 18:56:30 +0300 |
|---|---|---|
| committer | thegeorg <[email protected]> | 2023-08-22 19:13:38 +0300 |
| commit | 769d14120ef8e30363c7dd6870ce1b82552587c3 (patch) | |
| tree | c407d1d3f152b9f6eb13f50abc3f5b06db82f9b3 /contrib/libs/asmlib/memcpy64.asm | |
| parent | 494eee7cbbaf3e7d71a133c80c96aec26e518c2a (diff) | |
Extract asmlib manipulations into separate block
Diffstat (limited to 'contrib/libs/asmlib/memcpy64.asm')
-rw-r--r-- | contrib/libs/asmlib/memcpy64.asm | 1332 |
1 file changed, 0 insertions(+), 1332 deletions(-)
diff --git a/contrib/libs/asmlib/memcpy64.asm b/contrib/libs/asmlib/memcpy64.asm
deleted file mode 100644
index d590990b99d..00000000000
--- a/contrib/libs/asmlib/memcpy64.asm
+++ /dev/null
@@ -1,1332 +0,0 @@
-%include "defs.asm"
-
-;************************* memcpy64.asm ************************************
-; Author: Agner Fog
-; Date created: 2008-07-19
-; Last modified: 2016-11-12 (patched version with AVX512 support removed)
-;
-; Description:
-; Faster version of the standard memcpy function:
-; void * A_memcpy(void *dest, const void *src, size_t count);
-; Copies 'count' bytes from 'src' to 'dest'
-;
-; Overriding standard function memcpy:
-; The alias ?OVR_memcpy is changed to _memcpy in the object file if
-; it is desired to override the standard library function memcpy.
-;
-; The function uses non-temporal writes to bypass the cache when the size is
-; bigger than half the size of the largest-level cache. This limit can be
-; read with GetMemcpyCacheLimit and changed with SetMemcpyCacheLimit.
-; C++ prototypes:
-; extern "C" size_t GetMemcpyCacheLimit(); // in memcpy64.asm
-; extern "C" void SetMemcpyCacheLimit(size_t); // in memmove64.asm
-; extern "C" void SetMemcpyCacheLimit1(size_t); // used internally
-;
-; Position-independent code is generated if POSITIONINDEPENDENT is defined.
-;
-; CPU dispatching includes the SSE2, Suppl-SSE3 and AVX instruction sets.
-;
-; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
-;******************************************************************************
-
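Editor's note: the header above documents the exported C interface. A minimal usage sketch, assuming the library is linked in; the size_t parameter of SetMemcpyCacheLimit is inferred from SetMemcpyCacheLimit1 below, which reads its argument from rcx/rdi:

```c
#include <stdio.h>
#include <string.h>

/* Prototypes as documented in the header comment above. */
extern void * A_memcpy(void *dest, const void *src, size_t count);
extern size_t GetMemcpyCacheLimit(void);
extern void   SetMemcpyCacheLimit(size_t limit);   /* in memmove64.asm */

int main(void) {
    char src[256], dst[256];
    memset(src, 'x', sizeof src);

    A_memcpy(dst, src, sizeof src);    /* drop-in replacement for memcpy */
    printf("cache bypass limit: %zu bytes\n", GetMemcpyCacheLimit());

    SetMemcpyCacheLimit(1 << 20);      /* use non-temporal stores above 1 MB */
    return 0;
}
```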
-default rel
-
-global A_memcpy: function ; Function A_memcpy
-global EXP(memcpy): function ; ?OVR removed if standard function memcpy overridden
-global memcpySSE2: function ; Version for processors with only SSE2
-global memcpySSSE3: function ; Version for processors with SSSE3
-global memcpyU: function ; Version for processors with fast unaligned read
-global memcpyU256: function ; Version for processors with fast 256-bit read/write
-
-global GetMemcpyCacheLimit: function ; Get the size limit for bypassing cache when copying with memcpy and memmove
-global SetMemcpyCacheLimit1: function ; Set the size limit for bypassing cache when copying with memcpy
-
-
-; Imported from instrset64.asm
-extern InstructionSet ; Instruction set for CPU dispatcher
-
-; Imported from unalignedisfaster64.asm:
-extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
-extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
-
-; Imported from cachesize64.asm:
-extern DataCacheSize ; Gets size of data cache
-
-
-; Define prolog for this function
-%MACRO PROLOGM 0
-%IFDEF WINDOWS
- push rsi
- push rdi
- mov rdi, rcx ; dest
- mov r9, rcx ; dest
- mov rsi, rdx ; src
- mov rcx, r8 ; count
-%ELSE ; Unix
- mov rcx, rdx ; count
- mov r9, rdi ; dest
-%ENDIF
-%ENDM
-
-; Define return from this function
-%MACRO RETURNM 0
-%IFDEF WINDOWS
- pop rdi
- pop rsi
-%ENDIF
- mov rax, r9 ; Return value = dest
- ret
-%ENDM
-
-
-SECTION .text align=16
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Common entry for dispatch
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
-; Function entry:
-A_memcpy:
-EXP(memcpy):
- jmp qword [memcpyDispatch] ; Go to appropriate version, depending on instruction set
-
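Editor's note: the single `jmp qword [memcpyDispatch]` is a self-patching dispatch. `memcpyDispatch` (data section at the bottom of the file) initially points at memcpyCPUDispatch, which probes the CPU, overwrites the pointer, and tail-jumps, so detection runs only once. A C model of the pattern; `__builtin_cpu_supports` is a GCC/Clang builtin standing in for the InstructionSet/UnalignedIsFaster probes:

```c
#include <stddef.h>
#include <string.h>

typedef void *(*memcpy_fn)(void *, const void *, size_t);

static void *dispatch_once(void *d, const void *s, size_t n);

/* Like memcpyDispatch: starts out pointing at the one-time dispatcher. */
static memcpy_fn memcpy_impl = dispatch_once;

/* Stand-ins for memcpySSE2 / memcpyU256; plain memcpy for illustration. */
static void *copy_sse2(void *d, const void *s, size_t n) { return memcpy(d, s, n); }
static void *copy_avx (void *d, const void *s, size_t n) { return memcpy(d, s, n); }

static void *dispatch_once(void *d, const void *s, size_t n) {
    /* Probe once, patch the pointer, then call the chosen version,
       as memcpyCPUDispatch does with 'mov [memcpyDispatch], rbx'. */
    memcpy_impl = __builtin_cpu_supports("avx") ? copy_avx : copy_sse2;
    return memcpy_impl(d, s, n);
}

void *my_memcpy(void *d, const void *s, size_t n) {
    return memcpy_impl(d, s, n);       /* ~ jmp qword [memcpyDispatch] */
}
```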
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; AVX Version for processors with fast unaligned read and fast 32 bytes write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memcpyU256: ; global label
-memcpyU256@: ; local label
- PROLOGM
- cmp rcx, 40H
- jb A1000 ; Use simpler code if count < 64
-
- ; count >= 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 1FH
- jz B3100 ; Skip if dest aligned by 32
-
- ; edx = size of first partial block, 1 - 31 bytes
- test dl, 3
- jz B3030
- test dl, 1
- jz B3020
- ; move 1 byte
- movzx eax, byte [rsi]
- mov [rdi], al
- inc rsi
- inc rdi
-B3020: test dl, 2
- jz B3030
- ; move 2 bytes
- movzx eax, word [rsi]
- mov [rdi], ax
- add rsi, 2
- add rdi, 2
-B3030: test dl, 4
- jz B3040
- ; move 4 bytes
- mov eax, [rsi]
- mov [rdi], eax
- add rsi, 4
- add rdi, 4
-B3040: test dl, 8
- jz B3050
- ; move 8 bytes
- mov rax, [rsi]
- mov [rdi], rax
- add rsi, 8
- add rdi, 8
-B3050: test dl, 16
- jz B3060
- ; move 16 bytes
- movups xmm0, [rsi]
- movaps [rdi], xmm0
- add rsi, 16
- add rdi, 16
-B3060: sub rcx, rdx
-
-B3100: ; Now dest is aligned by 32. Any partial block has been moved
-
- ; Set up for loop moving 32 bytes per iteration:
- mov rdx, rcx ; Save count
- and rcx, -20H ; Round down to nearest multiple of 32
- add rsi, rcx ; Point to the end
- add rdi, rcx ; Point to the end
- sub rdx, rcx ; Remaining data after loop
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja I3100 ; Use non-temporal store if count > CacheBypassLimit
- neg rcx ; Negative index from the end
-
-H3100: ; copy -rcx bytes in blocks of 32 bytes.
-
- ; Check for false memory dependence: The CPU may falsely assume
- ; a partial overlap between the written destination and the following
- ; read source if source is unaligned and
- ; (src-dest) modulo 4096 is close to 4096
- test sil, 1FH
- jz H3110 ; aligned
- mov eax, esi
- sub eax, edi
- and eax, 0FFFH ; modulo 4096
- cmp eax, 1000H - 200H
- ja J3100
-
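Editor's note: the `and eax, 0FFFH` / `cmp eax, 1000H - 200H` pair above implements the 4K-aliasing test described in the comment: the store buffer compares only the low 12 address bits, so a source sitting up to 0x200 bytes below the destination within a page can look like an overlap. The same test as a C sketch:

```c
#include <stdint.h>
#include <stdbool.h>

/* True when (src - dest) mod 4096 lies in (0xE00, 0xFFF], i.e. the
   unaligned source appears to sit just below the destination within a
   4 KB page, risking false store-to-load forwarding conflicts. */
static bool has_false_dependence(const void *src, const void *dest) {
    uint32_t delta = (uint32_t)((uintptr_t)src - (uintptr_t)dest) & 0xFFF;
    return delta > 0x1000 - 0x200;     /* the 'ja J3100' branch above */
}
```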
-align 16
-H3110: ; main copy loop, 32 bytes at a time
- ; rcx has negative index from the end, counting up to zero
- vmovups ymm0, [rsi+rcx]
- vmovaps [rdi+rcx], ymm0
- add rcx, 20H
- jnz H3110
- sfence
- vzeroupper ; end of AVX mode
-
-H3120: ; Move the remaining edx bytes (0 - 31):
- add rsi, rdx
- add rdi, rdx
- neg rdx
- jz H3500 ; Skip if no more data
- ; move 16-8-4-2-1 bytes, aligned
- cmp edx, -10H
- jg H3200
- ; move 16 bytes
- movups xmm0, [rsi+rdx]
- movaps [rdi+rdx], xmm0
- add rdx, 10H
-H3200: cmp edx, -8
- jg H3210
- ; move 8 bytes
- movq xmm0, qword [rsi+rdx]
- movq qword [rdi+rdx], xmm0
- add rdx, 8
- jz H3500 ; Early skip if count divisible by 8
-H3210: cmp edx, -4
- jg H3220
- ; move 4 bytes
- mov eax, [rsi+rdx]
- mov [rdi+rdx], eax
- add rdx, 4
-H3220: cmp edx, -2
- jg H3230
- ; move 2 bytes
- movzx eax, word [rsi+rdx]
- mov [rdi+rdx], ax
- add rdx, 2
-H3230: cmp edx, -1
- jg H3500
- ; move 1 byte
- movzx eax, byte [rsi+rdx]
- mov [rdi+rdx], al
-H3500: ; finished
- RETURNM
-
-I3100: ; non-temporal move
- neg rcx ; Negative index from the end
-
-align 16
-I3110: ; main copy loop, 32 bytes at a time
- ; rcx has negative index from the end, counting up to zero
- vmovups ymm0, [rsi+rcx]
- vmovntps [rdi+rcx], ymm0
- add rcx, 20H
- jnz I3110
- sfence
- vzeroupper ; end of AVX mode
- jmp H3120 ; Move the remaining edx bytes (0 - 31)
-
-
-align 16
-J3100: ; There is a false memory dependence.
- ; check if src and dest overlap, if not then it is safe
- ; to copy backwards to avoid false memory dependence
-%if 1
- ; Use this version if you want consistent behavior in the case
- ; where dest > src and overlap. However, this case is undefined
- ; anyway because part of src is overwritten before copying
- push rdx
- mov rax, rsi
- sub rax, rdi
- cqo
- xor rax, rdx
- sub rax, rdx ; abs(src-dest)
- neg rcx ; size
- pop rdx ; restore rdx
- cmp rax, rcx
- jnb J3110
- neg rcx ; restore rcx
- jmp H3110 ; overlap between src and dest. Can't copy backwards
-%else
- ; save time by not checking the case that is undefined anyway
- mov rax, rsi
- sub rax, rdi
- neg rcx ; size
- cmp rax, rcx
- jnb J3110 ; OK to copy backwards
- ; must copy forwards
- neg rcx ; restore ecx
- jmp H3110 ; copy forwards
-
-%endif
-
-J3110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
- push rsi
- push rdi
- sub rsi, rcx
- sub rdi, rcx
-J3120: ; loop backwards
- vmovups ymm0, [rsi+rcx-20H]
- vmovaps [rdi+rcx-20H], ymm0
- sub rcx, 20H
- jnz J3120
- sfence
- vzeroupper
- pop rdi
- pop rsi
- jmp H3120
-
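Editor's note: the `cqo` / `xor rax, rdx` / `sub rax, rdx` sequence in J3100 is the classic branch-free absolute value; copying backwards is chosen only when abs(src - dest) >= size, i.e. the buffers cannot overlap. The same decision as a C sketch:

```c
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

/* Branch-free abs, mirroring cqo (sign mask) + xor + sub. */
static int64_t abs64(int64_t x) {
    int64_t sign = x >> 63;            /* all zeros or all ones */
    return (x ^ sign) - sign;
}

/* Backwards copy is safe only if the regions are disjoint. */
static bool safe_to_copy_backwards(const void *src, const void *dest,
                                   size_t size) {
    int64_t diff = (int64_t)((intptr_t)src - (intptr_t)dest);
    return (uint64_t)abs64(diff) >= size;   /* 'cmp rax, rcx / jnb J3110' */
}
```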
-align 16
- ; count < 64. Move 32-16-8-4-2-1 bytes
- ; shared by multiple CPU versions (SSSE3 and above)
-A1000: add rsi, rcx ; end of src
- add rdi, rcx ; end of dest
- neg rcx ; negative index from the end
- cmp ecx, -20H
- jg A1100
- ; move 32 bytes
- ; movdqu is faster than 64-bit moves on processors with SSSE3
- movups xmm0, [rsi+rcx]
- movups xmm1, [rsi+rcx+10H]
- movups [rdi+rcx], xmm0
- movups [rdi+rcx+10H], xmm1
- add rcx, 20H
-A1100: cmp ecx, -10H
- jg A1200
- ; move 16 bytes
- movups xmm0, [rsi+rcx]
- movups [rdi+rcx], xmm0
- add rcx, 10H
-A1200: cmp ecx, -8
- jg A1300
- ; move 8 bytes
- mov rax, qword [rsi+rcx]
- mov qword [rdi+rcx], rax
- add rcx, 8
-A1300: cmp ecx, -4
- jg A1400
- ; move 4 bytes
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
- add rcx, 4
- jz A1900 ; early out if count divisible by 4
-A1400: cmp ecx, -2
- jg A1500
- ; move 2 bytes
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
- add rcx, 2
-A1500: cmp ecx, -1
- jg A1900
- ; move 1 byte
- movzx eax, byte [rsi+rcx]
- mov [rdi+rcx], al
-A1900: ; finished
- RETURNM
-
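Editor's note: the A1000 tail above decomposes a small count into at most one move each of 32, 16, 8, 4, 2 and 1 bytes, using a negative index that counts up toward zero so the loop-end test is just the flags from `add`. A C rendering of the same decomposition; memcpy stands in for the individual SSE and integer moves:

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy count < 64 bytes as 32-16-8-4-2-1 byte moves, walking a
   negative index from the end up to zero like the asm above. */
static void small_copy(uint8_t *dest, const uint8_t *src, size_t count) {
    const uint8_t *s = src + count;    /* end of src  */
    uint8_t *d = dest + count;         /* end of dest */
    intptr_t i = -(intptr_t)count;     /* negative index from the end */

    for (size_t step = 32; step != 0; step >>= 1) {
        if (i <= -(intptr_t)step) {    /* 'cmp ecx, -step / jg next' */
            memcpy(d + i, s + i, step);
            i += (intptr_t)step;
        }
    }
}
```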
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Version for processors with fast unaligned read and fast 16 bytes write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memcpyU: ; global label
-memcpyU@: ; local label
- PROLOGM
- cmp rcx, 40H
- jb A1000 ; Use simpler code if count < 64
-
- ; count >= 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 0FH
- jz B2100 ; Skip if dest aligned by 16
-
- ; edx = size of first partial block, 1 - 15 bytes
- test dl, 3
- jz B2030
- test dl, 1
- jz B2020
- ; move 1 byte
- movzx eax, byte [rsi]
- mov [rdi], al
- inc rsi
- inc rdi
-B2020: test dl, 2
- jz B2030
- ; move 2 bytes
- movzx eax, word [rsi]
- mov [rdi], ax
- add rsi, 2
- add rdi, 2
-B2030: test dl, 4
- jz B2040
- ; move 4 bytes
- mov eax, [rsi]
- mov [rdi], eax
- add rsi, 4
- add rdi, 4
-B2040: test dl, 8
- jz B2050
- ; move 8 bytes
- mov rax, [rsi]
- mov [rdi], rax
- add rsi, 8
- add rdi, 8
-B2050: sub rcx, rdx
-B2100: ; Now dest is aligned by 16. Any partial block has been moved
-
- ; Set up for loop moving 32 bytes per iteration:
- mov rdx, rcx ; Save count
- and rcx, -20H ; Round down to nearest multiple of 32
- add rsi, rcx ; Point to the end
- add rdi, rcx ; Point to the end
- sub rdx, rcx ; Remaining data after loop
-
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja I100 ; Use non-temporal store if count > CacheBypassLimit
- neg rcx ; Negative index from the end
-
-H100: ; copy -rcx bytes in blocks of 32 bytes.
-
- ; Check for false memory dependence: The CPU may falsely assume
- ; a partial overlap between the written destination and the following
- ; read source if source is unaligned and
- ; (src-dest) modulo 4096 is close to 4096
- test sil, 0FH
- jz H110 ; aligned
- mov eax, esi
- sub eax, edi
- and eax, 0FFFH ; modulo 4096
- cmp eax, 1000H - 200H
- ja J100
-
-H110: ; main copy loop, 32 bytes at a time
- ; rcx has negative index from the end, counting up to zero
- movups xmm0, [rsi+rcx]
- movups xmm1, [rsi+rcx+10H]
- movaps [rdi+rcx], xmm0
- movaps [rdi+rcx+10H], xmm1
- add rcx, 20H
- jnz H110
-
-H120: ; Move the remaining edx bytes (0 - 31):
- add rsi, rdx
- add rdi, rdx
- neg rdx
- jz H500 ; Skip if no more data
- ; move 16-8-4-2-1 bytes, aligned
- cmp edx, -10H
- jg H200
- ; move 16 bytes
- movups xmm0, [rsi+rdx]
- movaps [rdi+rdx], xmm0
- add rdx, 10H
-H200: cmp edx, -8
- jg H210
- ; move 8 bytes
- movq xmm0, qword [rsi+rdx]
- movq qword [rdi+rdx], xmm0
- add rdx, 8
- jz H500 ; Early skip if count divisible by 8
-H210: cmp edx, -4
- jg H220
- ; move 4 bytes
- mov eax, [rsi+rdx]
- mov [rdi+rdx], eax
- add rdx, 4
-H220: cmp edx, -2
- jg H230
- ; move 2 bytes
- movzx eax, word [rsi+rdx]
- mov [rdi+rdx], ax
- add rdx, 2
-H230: cmp edx, -1
- jg H500
- ; move 1 byte
- movzx eax, byte [rsi+rdx]
- mov [rdi+rdx], al
-H500: ; finished
- RETURNM
-
-I100: ; non-temporal move
- neg rcx ; Negative index from the end
-
-align 16
-I110: ; main copy loop, 32 bytes at a time
- ; rcx has negative index from the end, counting up to zero
- movups xmm0, [rsi+rcx]
- movups xmm1, [rsi+rcx+10H]
- movntps [rdi+rcx], xmm0
- movntps [rdi+rcx+10H], xmm1
- add rcx, 20H
- jnz I110
- sfence
- jmp H120 ; Move the remaining edx bytes (0 - 31):
-
-
-align 16
-J100: ; There is a false memory dependence.
- ; check if src and dest overlap, if not then it is safe
- ; to copy backwards to avoid false memory dependence
-%if 1
- ; Use this version if you want consistent behavior in the case
- ; where dest > src and overlap. However, this case is undefined
- ; anyway because part of src is overwritten before copying
- push rdx
- mov rax, rsi
- sub rax, rdi
- cqo
- xor rax, rdx
- sub rax, rdx ; abs(src-dest)
- neg rcx ; size
- pop rdx ; restore rdx
- cmp rax, rcx
- jnb J110
- neg rcx ; restore rcx
- jmp H110 ; overlap between src and dest. Can't copy backwards
-%else
- ; save time by not checking the case that is undefined anyway
- mov rax, rsi
- sub rax, rdi
- neg rcx ; size
- cmp rax, rcx
- jnb J110 ; OK to copy backwards
- ; must copy forwards
- neg rcx ; restore ecx
- jmp H110 ; copy forwards
-
-%endif
-
-J110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
- push rsi
- push rdi
- sub rsi, rcx
- sub rdi, rcx
-J120: ; loop backwards
- movups xmm1, [rsi+rcx-20H]
- movups xmm0, [rsi+rcx-10H]
- movaps [rdi+rcx-20H], xmm1
- movaps [rdi+rcx-10H], xmm0
- sub rcx, 20H
- jnz J120
- pop rdi
- pop rsi
- jmp H120
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Version for processors with SSSE3. Aligned read + shift + aligned write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-memcpySSSE3: ; global label
-memcpySSSE3@: ; local label
- PROLOGM
- cmp rcx, 40H
- jb A1000 ; Use simpler code if count < 64
-
- ; count >= 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 0FH
- jz B1200 ; Skip if dest aligned by 16
-
- ; edx = size of first partial block, 1 - 15 bytes
- test dl, 3
- jz B1030
- test dl, 1
- jz B1020
- ; move 1 byte
- movzx eax, byte [rsi]
- mov [rdi], al
- inc rsi
- inc rdi
-B1020: test dl, 2
- jz B1030
- ; move 2 bytes
- movzx eax, word [rsi]
- mov [rdi], ax
- add rsi, 2
- add rdi, 2
-B1030: test dl, 4
- jz B1040
- ; move 4 bytes
- mov eax, [rsi]
- mov [rdi], eax
- add rsi, 4
- add rdi, 4
-B1040: test dl, 8
- jz B1050
- ; move 8 bytes
- mov rax, [rsi]
- mov [rdi], rax
- add rsi, 8
- add rdi, 8
-B1050: sub rcx, rdx
-B1200: ; Now dest is aligned by 16. Any partial block has been moved
- ; Find alignment of src modulo 16 at this point:
- mov eax, esi
- and eax, 0FH
-
- ; Set up for loop moving 32 bytes per iteration:
- mov edx, ecx ; Save count (lower 32 bits)
- and rcx, -20H ; Round down count to nearest multiple of 32
- add rsi, rcx ; Point to the end
- add rdi, rcx ; Point to the end
- sub edx, ecx ; Remaining data after loop (0-31)
- sub rsi, rax ; Nearest preceding aligned block of src
-
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja B1400 ; Use non-temporal store if count > CacheBypassLimit
- neg rcx ; Negative index from the end
-
- ; Dispatch to different codes depending on src alignment
- lea r8, [AlignmentDispatchSSSE3]
- jmp near [r8+rax*8]
-
-B1400: neg rcx
- ; Dispatch to different codes depending on src alignment
- lea r8, [AlignmentDispatchNT]
- jmp near [r8+rax*8]
-
-
-align 16
-C100: ; Code for aligned src. SSE2 and SSSE3 versions
- ; The nice case, src and dest have same alignment.
-
- ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm0, [rsi+rcx]
- movaps xmm1, [rsi+rcx+10H]
- movaps [rdi+rcx], xmm0
- movaps [rdi+rcx+10H], xmm1
- add rcx, 20H
- jnz C100
-
- ; Move the remaining edx bytes (0 - 31):
- add rsi, rdx
- add rdi, rdx
- neg rdx
- jz C500 ; Skip if no more data
- ; move 16-8-4-2-1 bytes, aligned
- cmp edx, -10H
- jg C200
- ; move 16 bytes
- movaps xmm0, [rsi+rdx]
- movaps [rdi+rdx], xmm0
- add rdx, 10H
-C200: cmp edx, -8
- jg C210
- ; move 8 bytes
- mov rax, [rsi+rdx]
- mov [rdi+rdx], rax
- add rdx, 8
- jz C500 ; Early skip if count divisible by 8
-C210: cmp edx, -4
- jg C220
- ; move 4 bytes
- mov eax, [rsi+rdx]
- mov [rdi+rdx], eax
- add rdx, 4
-C220: cmp edx, -2
- jg C230
- ; move 2 bytes
- movzx eax, word [rsi+rdx]
- mov [rdi+rdx], ax
- add rdx, 2
-C230: cmp edx, -1
- jg C500
- ; move 1 byte
- movzx eax, byte [rsi+rdx]
- mov [rdi+rdx], al
-C500: ; finished
- RETURNM
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Version for processors with SSE2. Aligned read + shift + aligned write
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-memcpySSE2: ; global label
-memcpySSE2@: ; local label
- PROLOGM
- cmp rcx, 40H
- jae B0100 ; Jump if count >= 64; simpler code below handles count < 64
-
- ; count < 64. Move 32-16-8-4-2-1 bytes
- add rsi, rcx ; end of src
- add rdi, rcx ; end of dest
- neg rcx ; negative index from the end
- cmp ecx, -20H
- jg A100
- ; move 32 bytes
- ; mov r64 is faster than movdqu on Intel Pentium M and Core 1
- ; movdqu is fast on Nehalem and later
- mov rax, [rsi+rcx]
- mov rdx, [rsi+rcx+8]
- mov [rdi+rcx], rax
- mov [rdi+rcx+8], rdx
- mov rax, qword [rsi+rcx+10H]
- mov rdx, qword [rsi+rcx+18H]
- mov qword [rdi+rcx+10H], rax
- mov qword [rdi+rcx+18H], rdx
- add rcx, 20H
-A100: cmp ecx, -10H
- jg A200
- ; move 16 bytes
- mov rax, [rsi+rcx]
- mov rdx, [rsi+rcx+8]
- mov [rdi+rcx], rax
- mov [rdi+rcx+8], rdx
- add rcx, 10H
-A200: cmp ecx, -8
- jg A300
- ; move 8 bytes
- mov rax, qword [rsi+rcx]
- mov qword [rdi+rcx], rax
- add rcx, 8
-A300: cmp ecx, -4
- jg A400
- ; move 4 bytes
- mov eax, [rsi+rcx]
- mov [rdi+rcx], eax
- add rcx, 4
- jz A900 ; early out if count divisible by 4
-A400: cmp ecx, -2
- jg A500
- ; move 2 bytes
- movzx eax, word [rsi+rcx]
- mov [rdi+rcx], ax
- add rcx, 2
-A500: cmp ecx, -1
- jg A900
- ; move 1 byte
- movzx eax, byte [rsi+rcx]
- mov [rdi+rcx], al
-A900: ; finished
- RETURNM
-
-B0100: ; count >= 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 0FH
- jz B0200 ; Skip if dest aligned by 16
-
- ; edx = size of first partial block, 1 - 15 bytes
- test dl, 3
- jz B0030
- test dl, 1
- jz B0020
- ; move 1 byte
- movzx eax, byte [rsi]
- mov [rdi], al
- inc rsi
- inc rdi
-B0020: test dl, 2
- jz B0030
- ; move 2 bytes
- movzx eax, word [rsi]
- mov [rdi], ax
- add rsi, 2
- add rdi, 2
-B0030: test dl, 4
- jz B0040
- ; move 4 bytes
- mov eax, [rsi]
- mov [rdi], eax
- add rsi, 4
- add rdi, 4
-B0040: test dl, 8
- jz B0050
- ; move 8 bytes
- mov rax, [rsi]
- mov [rdi], rax
- add rsi, 8
- add rdi, 8
-B0050: sub rcx, rdx
-B0200: ; Now dest is aligned by 16. Any partial block has been moved
-
- ; This part will not always work if count < 64
- ; Calculate size of first block up to first regular boundary of dest
- mov edx, edi
- neg edx
- and edx, 0FH
- jz B300 ; Skip if dest aligned by 16
-
- ; rdx = size of first partial block, 1 - 15 bytes
- add rsi, rdx
- add rdi, rdx
- sub rcx, rdx
- neg rdx
- cmp edx, -8
- jg B200
- ; move 8 bytes
- mov rax, [rsi+rdx]
- mov [rdi+rdx], rax
- add rdx, 8
-B200: cmp edx, -4
- jg B210
- ; move 4 bytes
- mov eax, [rsi+rdx]
- mov [rdi+rdx], eax
- add rdx, 4
- jz B300 ; early out if aligned by 4
-B210: cmp edx, -2
- jg B220
- ; move 2 bytes
- movzx eax, word [rsi+rdx]
- mov [rdi+rdx], ax
- add rdx, 2
-B220: cmp edx, -1
- jg B300
- ; move 1 byte
- movzx eax, byte [rsi+rdx]
- mov [rdi+rdx], al
-
-B300: ; Now dest is aligned by 16. Any partial block has been moved
- ; Find alignment of src modulo 16 at this point:
- mov eax, esi
- and eax, 0FH
-
- ; Set up for loop moving 32 bytes per iteration:
- mov edx, ecx ; Save count (lower 32 bits)
- and rcx, -20H ; Round down count to nearest multiple of 32
- add rsi, rcx ; Point to the end
- add rdi, rcx ; Point to the end
- sub edx, ecx ; Remaining data after loop (0-31)
- sub rsi, rax ; Nearest preceding aligned block of src
-
- ; Check if count very big
- cmp rcx, [CacheBypassLimit]
- ja B400 ; Use non-temporal store if count > CacheBypassLimit
- neg rcx ; Negative index from the end
-
- ; Dispatch to different codes depending on src alignment
- lea r8, [AlignmentDispatchSSE2]
- jmp near [r8+rax*8]
-
-B400: neg rcx
- ; Dispatch to different codes depending on src alignment
- lea r8, [AlignmentDispatchNT]
- jmp near [r8+rax*8]
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Macros and alignment jump tables
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Macros for each src alignment, SSE2 instruction set:
-; Make separate code for each alignment u because the shift instructions
-; have the shift count as a constant:
-
-%MACRO MOVE_UNALIGNED_SSE2 2 ; u, nt
-; Move rcx + rdx bytes of data
-; Source is misaligned. (src-dest) modulo 16 = %1
-; %2 = 1 if non-temporal store desired
-; eax = %1
-; rsi = src - %1 = nearest preceding 16-byte boundary
-; rdi = dest (aligned)
-; rcx = - (count rounded down to a multiple of 32)
-; edx = remaining bytes to move after loop
- movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- movdqa xmm2, [rsi+rcx+20H]
- movdqa xmm3, xmm1 ; Copy because used twice
- psrldq xmm0, %1 ; shift right
- pslldq xmm1, 16-%1 ; shift left
- por xmm0, xmm1 ; combine blocks
- %IF %2 == 0
- movdqa [rdi+rcx], xmm0 ; Save aligned
- %ELSE
- movntdq [rdi+rcx], xmm0 ; non-temporal save
- %ENDIF
- movdqa xmm0, xmm2 ; Save for next iteration
- psrldq xmm3, %1 ; shift right
- pslldq xmm2, 16-%1 ; shift left
- por xmm3, xmm2 ; combine blocks
- %IF %2 == 0
- movdqa [rdi+rcx+10H], xmm3 ; Save aligned
- %ELSE
- movntdq [rdi+rcx+10H], xmm3 ; non-temporal save
- %ENDIF
- add rcx, 20H ; Loop through negative values up to zero
- jnz %%L1
-
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-byte block to move
- movdqa xmm1, [rsi+rdx+10H]
- psrldq xmm0, %1 ; shift right
- pslldq xmm1, 16-%1 ; shift left
- por xmm0, xmm1 ; combine blocks
- %IF %2 == 0
- movdqa [rdi+rdx], xmm0 ; Save aligned
- %ELSE
- movntdq [rdi+rdx], xmm0 ; non-temporal save
- %ENDIF
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %2 == 1
- sfence
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
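Editor's note: since `psrldq`/`pslldq` take only immediate shift counts, the macro above is instantiated 15 times further down, once per misalignment u. The combining step, shown here with SSE2 intrinsics for one fixed u (a sketch; u = 5 chosen arbitrarily):

```c
#include <emmintrin.h>   /* SSE2 intrinsics */

#define U 5   /* source misalignment; must be a compile-time constant */

/* Combine two aligned 16-byte loads into one aligned store, as the
   psrldq / pslldq / por triple does in the macro above. 'src16' is
   the nearest preceding 16-byte boundary of the misaligned source. */
static void copy_block(const __m128i *src16, __m128i *dest) {
    __m128i lo = _mm_load_si128(src16);        /* movdqa [rsi+rcx]     */
    __m128i hi = _mm_load_si128(src16 + 1);    /* movdqa [rsi+rcx+10H] */
    __m128i v  = _mm_or_si128(_mm_srli_si128(lo, U),       /* psrldq  */
                              _mm_slli_si128(hi, 16 - U)); /* pslldq  */
    _mm_store_si128(dest, v);                  /* movdqa [rdi+rcx]     */
}
```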
-
-%MACRO MOVE_UNALIGNED_SSE2_4 1 ; nt
-; Special case for u = 4
-; %1 = 1 if non-temporal store desired
- movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- movss xmm0, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
- shufps xmm0, xmm0, 00111001B ; Rotate
- %IF %1 == 0
- movaps [rdi+rcx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm0 ; Non-temporal save
- %ENDIF
- movaps xmm0, [rsi+rcx+20H]
- movss xmm1, xmm0
- shufps xmm1, xmm1, 00111001B
- %IF %1 == 0
- movaps [rdi+rcx+10H], xmm1 ; Save aligned
- %ELSE
- movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
- %ENDIF
- add rcx, 20H ; Loop through negative values up to zero
- jnz %%L1
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-byte block to move
- movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
- movss xmm0, xmm1
- shufps xmm0, xmm0, 00111001B
- %IF %1 == 0
- movaps [rdi+rdx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rdx], xmm0 ; Non-temporal save
- %ENDIF
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %1 == 1
- sfence
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-%MACRO MOVE_UNALIGNED_SSE2_8 1 ; nt
-; Special case for u = 8
-; %1 = 1 if non-temporal store desired
- movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- movsd xmm0, xmm1 ; Moves 8 bytes, leaves remaining bytes unchanged
- shufps xmm0, xmm0, 01001110B ; Rotate
- %IF %1 == 0
- movaps [rdi+rcx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm0 ; Non-temporal save
- %ENDIF
- movaps xmm0, [rsi+rcx+20H]
- movsd xmm1, xmm0
- shufps xmm1, xmm1, 01001110B
- %IF %1 == 0
- movaps [rdi+rcx+10H], xmm1 ; Save aligned
- %ELSE
- movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
- %ENDIF
- add rcx, 20H ; Loop through negative values up to zero
- jnz %%L1
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-byte block to move
- movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
- movsd xmm0, xmm1
- shufps xmm0, xmm0, 01001110B
- %IF %1 == 0
- movaps [rdi+rdx], xmm0 ; Save aligned
- %ELSE
- movntps [rdi+rdx], xmm0 ; Non-temporal save
- %ENDIF
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %1 == 1
- sfence
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-%MACRO MOVE_UNALIGNED_SSE2_12 1 ; nt
-; Special case for u = 12
-; %1 = 1 if non-temporal store desired
- movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
- shufps xmm0, xmm0, 10010011B
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
- movaps xmm2, [rsi+rcx+20H]
- shufps xmm1, xmm1, 10010011B
- shufps xmm2, xmm2, 10010011B
- movaps xmm3, xmm2
- movss xmm2, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
- movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
- %IF %1 == 0
- movaps [rdi+rcx], xmm1 ; Save aligned
- movaps [rdi+rcx+10H], xmm2 ; Save aligned
- %ELSE
- movntps [rdi+rcx], xmm1 ; Non-temporal save
- movntps [rdi+rcx+10H], xmm2 ; Non-temporal save
- %ENDIF
- movaps xmm0, xmm3 ; Save for next iteration
- add rcx, 20H ; Loop through negative values up to zero
- jnz %%L1
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-byte block to move
- movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
- shufps xmm1, xmm1, 10010011B
- movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
- %IF %1 == 0
- movaps [rdi+rdx], xmm1 ; Save aligned
- %ELSE
- movntps [rdi+rdx], xmm1 ; Non-temporal save
- %ENDIF
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- %IF %1 == 1
- sfence
- %ENDIF
- ; Move remaining 0 - 15 bytes, unaligned
- jmp C200
-%ENDMACRO
-
-
-; Macros for each src alignment, Suppl.SSE3 instruction set:
-; Make separate code for each alignment u because the palignr instruction
-; has the shift count as a constant:
-
-%MACRO MOVE_UNALIGNED_SSSE3 1 ; u
-; Move rcx + rdx bytes of data
-; Source is misaligned. (src-dest) modulo 16 = %1
-; eax = %1
-; rsi = src - %1 = nearest preceding 16-byte boundary
-; rdi = dest (aligned)
-; rcx = - (count rounded down to a multiple of 32)
-; edx = remaining bytes to move after loop
- movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
-
-%%L1: ; Loop. rcx has negative index from the end, counting up to zero
- movdqa xmm2, [rsi+rcx+10H] ; Read next two blocks
- movdqa xmm3, [rsi+rcx+20H]
- movdqa xmm1, xmm0 ; Save xmm0
- movdqa xmm0, xmm3 ; Save for next iteration
- palignr xmm3, xmm2, %1 ; Combine parts into aligned block
- palignr xmm2, xmm1, %1 ; Combine parts into aligned block
- movdqa [rdi+rcx], xmm2 ; Save aligned
- movdqa [rdi+rcx+10H], xmm3 ; Save aligned
- add rcx, 20H
- jnz %%L1
-
- ; Set up for edx remaining bytes
- add rsi, rdx
- add rdi, rdx
- neg rdx
- cmp edx, -10H
- jg %%L2
- ; One more 16-byte block to move
- movdqa xmm2, [rsi+rdx+10H]
- palignr xmm2, xmm0, %1
- movdqa [rdi+rdx], xmm2
- add rdx, 10H
-%%L2: ; Get src pointer back to misaligned state
- add rsi, rax
- ; Move remaining 0 - 15 bytes
- jmp C200
-%ENDMACRO
-
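Editor's note: `palignr` collapses the psrldq/pslldq/por triple into one instruction, but its shift count is likewise an immediate, hence again one instance per alignment. The equivalent with the SSSE3 intrinsic (a sketch, fixed u = 5):

```c
#include <tmmintrin.h>   /* SSSE3 intrinsics */

#define U 5   /* source misalignment; palignr needs a constant count */

/* _mm_alignr_epi8 concatenates hi:lo and extracts 16 bytes starting
   at byte U, like 'palignr xmm3, xmm2, %1' in the macro above. */
static void copy_block_ssse3(const __m128i *src16, __m128i *dest) {
    __m128i lo = _mm_load_si128(src16);
    __m128i hi = _mm_load_si128(src16 + 1);
    _mm_store_si128(dest, _mm_alignr_epi8(hi, lo, U));
}
```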
-
-; Make 15 instances of the SSE2 macro, one for each value of the alignment u.
-; These are pointed to by the jump table AlignmentDispatchSSE2 below.
-; (Alignments and fillers are inserted manually to minimize the number
-; of 16-byte boundaries inside loops.)
-
-align 16
-D104: MOVE_UNALIGNED_SSE2_4 0
-times 4 nop
-D108: MOVE_UNALIGNED_SSE2_8 0
-times 4 nop
-D10C: MOVE_UNALIGNED_SSE2_12 0
-times 1 nop
-D101: MOVE_UNALIGNED_SSE2 1, 0
-D102: MOVE_UNALIGNED_SSE2 2, 0
-D103: MOVE_UNALIGNED_SSE2 3, 0
-D105: MOVE_UNALIGNED_SSE2 5, 0
-D106: MOVE_UNALIGNED_SSE2 6, 0
-D107: MOVE_UNALIGNED_SSE2 7, 0
-D109: MOVE_UNALIGNED_SSE2 9, 0
-times 1 nop
-D10A: MOVE_UNALIGNED_SSE2 0AH, 0
-D10B: MOVE_UNALIGNED_SSE2 0BH, 0
-D10D: MOVE_UNALIGNED_SSE2 0DH, 0
-D10E: MOVE_UNALIGNED_SSE2 0EH, 0
-D10F: MOVE_UNALIGNED_SSE2 0FH, 0
-
-; Make 15 instances of the Suppl-SSE3 macro, one for each value of the alignment u.
-; These are pointed to by the jump table AlignmentDispatchSSSE3 below
-
-align 16
-E104: MOVE_UNALIGNED_SSSE3 4
-E108: MOVE_UNALIGNED_SSSE3 8
-E10C: MOVE_UNALIGNED_SSSE3 0CH
-E101: MOVE_UNALIGNED_SSSE3 1
-E102: MOVE_UNALIGNED_SSSE3 2
-E103: MOVE_UNALIGNED_SSSE3 3
-E105: MOVE_UNALIGNED_SSSE3 5
-E106: MOVE_UNALIGNED_SSSE3 6
-E107: MOVE_UNALIGNED_SSSE3 7
-E109: MOVE_UNALIGNED_SSSE3 9
-times 1 nop
-E10A: MOVE_UNALIGNED_SSSE3 0AH
-E10B: MOVE_UNALIGNED_SSSE3 0BH
-E10D: MOVE_UNALIGNED_SSSE3 0DH
-E10E: MOVE_UNALIGNED_SSSE3 0EH
-E10F: MOVE_UNALIGNED_SSSE3 0FH
-
-; Codes for non-temporal move. Aligned case first
-
-align 16
-F100: ; Non-temporal move, src and dest have same alignment.
- ; Loop. rcx has negative index from the end, counting up to zero
- movaps xmm0, [rsi+rcx] ; Read
- movaps xmm1, [rsi+rcx+10H]
- movntps [rdi+rcx], xmm0 ; Write non-temporal (bypass cache)
- movntps [rdi+rcx+10H], xmm1
- add rcx, 20H
- jnz F100 ; Loop through negative rcx up to zero
-
- ; Move the remaining edx bytes (0 - 31):
- add rsi, rdx
- add rdi, rdx
- neg rdx
- jz C500 ; Skip if no more data
- ; Check if we can move one more 16-byte block
- cmp edx, -10H
- jg C200
- ; move 16 bytes, aligned
- movaps xmm0, [rsi+rdx]
- movntps [rdi+rdx], xmm0
- add rdx, 10H
- sfence
- ; move the remaining 0 - 15 bytes
- jmp C200
-
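Editor's note: F100 above writes with `movntps`, which bypasses the cache hierarchy, and ends with `sfence` to make the weakly-ordered stores globally visible. The same pattern with intrinsics (a sketch; requires a 16-byte-aligned dest and a multiple-of-16 count):

```c
#include <emmintrin.h>   /* SSE2: _mm_stream_si128, _mm_sfence */
#include <stddef.h>

/* Stream 'count' bytes past the cache, as the F100 loop does. */
static void stream_copy(void *dest, const void *src, size_t count) {
    __m128i *d = (__m128i *)dest;                 /* must be 16-byte aligned */
    const __m128i *s = (const __m128i *)src;
    for (size_t i = 0; i < count / 16; i++)
        _mm_stream_si128(d + i, _mm_loadu_si128(s + i));  /* ~ movntps */
    _mm_sfence();    /* order non-temporal stores before returning */
}
```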
-; Make 15 instances of MOVE_UNALIGNED_SSE2 macro for each value of
-; the alignment u.
-; These are pointed to by the jump table AlignmentDispatchNT below
-
-;align 16
-F104: MOVE_UNALIGNED_SSE2_4 1
-F108: MOVE_UNALIGNED_SSE2_8 1
-F10C: MOVE_UNALIGNED_SSE2_12 1
-F101: MOVE_UNALIGNED_SSE2 1, 1
-F102: MOVE_UNALIGNED_SSE2 2, 1
-F103: MOVE_UNALIGNED_SSE2 3, 1
-F105: MOVE_UNALIGNED_SSE2 5, 1
-F106: MOVE_UNALIGNED_SSE2 6, 1
-F107: MOVE_UNALIGNED_SSE2 7, 1
-F109: MOVE_UNALIGNED_SSE2 9, 1
-F10A: MOVE_UNALIGNED_SSE2 0AH, 1
-F10B: MOVE_UNALIGNED_SSE2 0BH, 1
-F10D: MOVE_UNALIGNED_SSE2 0DH, 1
-F10E: MOVE_UNALIGNED_SSE2 0EH, 1
-F10F: MOVE_UNALIGNED_SSE2 0FH, 1
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; CPU dispatcher
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-memcpyCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
- ; This part is executed only once
- push rbx
- push rcx
- push rdx
- push rsi
- push rdi
- push r8
- ; set CacheBypassLimit to half the size of the largest level cache
- call GetMemcpyCacheLimit@
- mov eax, 1
- cpuid ; Get feature flags
- lea rbx, [memcpySSE2@]
- bt ecx, 9 ; Test bit for SupplSSE3
- jnc Q100
- lea rbx, [memcpySSSE3@]
- call UnalignedIsFaster ; Test if unaligned read is faster than aligned read and shift
- test eax, eax
- jz Q100
- lea rbx, [memcpyU@]
- call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
- test eax, eax
- jz Q100
- lea rbx, [memcpyU256@]
-Q100:
- ; Insert appropriate pointer
- mov [memcpyDispatch], rbx
- mov rax, rbx
- pop r8
- pop rdi
- pop rsi
- pop rdx
- pop rcx
- pop rbx
- ; Jump according to the replaced function pointer
- jmp rax
-
-; extern "C" size_t GetMemcpyCacheLimit();
-GetMemcpyCacheLimit:
-GetMemcpyCacheLimit@: ; local limit
- mov rax, [CacheBypassLimit]
- test rax, rax
- jnz U200
- ; Get half the size of the largest level cache
-%ifdef WINDOWS
- xor ecx, ecx ; 0 means largest level cache
-%else
- xor edi, edi ; 0 means largest level cache
-%endif
- call DataCacheSize ; get cache size
- shr rax, 1 ; half the size
- jnz U100
- mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
-U100: mov [CacheBypassLimit], rax
-U200: ret
-
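Editor's note: GetMemcpyCacheLimit computes the threshold lazily: half the size of the largest cache level, or a 4 MB default if DataCacheSize reports nothing. The same logic as a C sketch; DataCacheSize's level-0-means-largest convention is taken from the comments above:

```c
#include <stddef.h>

extern size_t DataCacheSize(int level);   /* 0 = largest level cache */

static size_t cache_bypass_limit;         /* 0 = not computed yet */

static size_t get_memcpy_cache_limit(void) {
    if (cache_bypass_limit == 0) {
        size_t limit = DataCacheSize(0) / 2;  /* half the largest cache */
        if (limit == 0)
            limit = 0x400000;                 /* unknown: default 4 MB */
        cache_bypass_limit = limit;
    }
    return cache_bypass_limit;
}
```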
-; Note: SetMemcpyCacheLimit is defined in memmove64.asm, calling SetMemcpyCacheLimit1
-SetMemcpyCacheLimit1:
-%ifdef WINDOWS
- mov rax, rcx
-%else
- mov rax, rdi
-%endif
- test rax, rax
- jnz U400
- ; zero, means default
- mov [CacheBypassLimit], rax
- call GetMemcpyCacheLimit@
-U400: mov [CacheBypassLimit], rax
- ret
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; getDispatch, for testing only
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-getDispatch:
- mov rax, [memcpyDispatch]
- ret
-
-global getDispatch
-
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; data section. jump tables, dispatch function pointer, cache size
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; Data segment must be included in function namespace
-SECTION .data
-align 16
-
-; Jump tables for alignments 0 - 15:
-; The copy routines index into AlignmentDispatchSSE2 or
-; AlignmentDispatchSSSE3 (if Suppl-SSE3 is supported), or into
-; AlignmentDispatchNT when using non-temporal stores.
-
-; Code pointer for each alignment for SSE2 instruction set
-AlignmentDispatchSSE2:
-DQ C100, D101, D102, D103, D104, D105, D106, D107
-DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
-
-; Code pointer for each alignment for Suppl-SSE3 instruction set
-AlignmentDispatchSSSE3:
-DQ C100, E101, E102, E103, E104, E105, E106, E107
-DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
-
-; Code pointer for each alignment for non-temporal store
-AlignmentDispatchNT:
-DQ F100, F101, F102, F103, F104, F105, F106, F107
-DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
-
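Editor's note: each table holds 16 code pointers indexed by the source's residual alignment (`rax = src & 0FH`), consumed by `jmp near [r8+rax*8]` in the SSE2/SSSE3 paths. The pattern as a C sketch with dummy handlers:

```c
#include <stdint.h>
#include <stdio.h>

typedef void (*block_copy_fn)(void);

static void copy_same_alignment(void) { puts("C100: same alignment"); }
static void copy_fixed_shift(void)    { puts("D1xx: fixed-shift loop"); }

int main(void) {
    block_copy_fn table[16];            /* ~ AlignmentDispatchSSE2 */
    table[0] = copy_same_alignment;     /* C100 */
    for (int u = 1; u < 16; u++)
        table[u] = copy_fixed_shift;    /* D101 .. D10F */

    uint8_t buf[64];
    const uint8_t *src = buf + 3;       /* some misaligned source */
    table[(uintptr_t)src & 0xF]();      /* ~ jmp near [r8+rax*8] */
    return 0;
}
```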
-; Pointer to appropriate version.
-; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
-; change this to the appropriate version of memcpy, so that
-; memcpyCPUDispatch is only executed once:
-memcpyDispatch DQ memcpyCPUDispatch
-
-; Bypass cache by using non-temporal moves if count > CacheBypassLimit
-; The optimal value of CacheBypassLimit is difficult to estimate, but
-; a reasonable value is half the size of the largest cache:
-CacheBypassLimit: DQ 0
|