%include "defs.asm"

;*************************  memcmp64.asm  *************************************
; Author:           Agner Fog
; Date created:     2013-10-03
; Last modified:    2013-10-03
; Description:
; Faster version of the standard memcmp function:
;
; int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
;
; Compares two blocks of memory of size count.
; The return value is zero if the two memory blocks ptr1 and ptr2 are equal.
; The return value is positive if the first differing byte of ptr1 is greater
; than the corresponding byte of ptr2 when compared as unsigned bytes.
; The return value is negative if the first differing byte of ptr1 is smaller
; than the corresponding byte of ptr2 when compared as unsigned bytes.
; (A short usage sketch in C follows this header block.)
;
; Overriding the standard function memcmp:
; The alias ?OVR_memcmp is changed to _memcmp in the object file if
; the standard library function memcmp is to be overridden.
;
; Optimization:
; Uses XMM registers if SSE2 is available; uses YMM registers if AVX2 is
; available. A CPU dispatcher selects the appropriate version on the first call.
;
; The latest version of this file is available at:
; www.agner.org/optimize/asmexamples.zip
; Copyright (c) 2013 GNU General Public License www.gnu.org/licenses
;******************************************************************************
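
; A usage sketch in C (illustration only: A_memcmp is the function exported
; below; the test scaffolding around it is hypothetical):
;
;   #include <stdio.h>
;   #include <stddef.h>
;
;   extern int A_memcmp(const void *ptr1, const void *ptr2, size_t count);
;
;   int main(void) {
;       const char a[] = "abcdef";
;       const char b[] = "abcxef";
;       printf("%d\n", A_memcmp(a, b, 6) < 0);   // prints 1: 'd' < 'x' at index 3
;       return 0;
;   }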

global A_memcmp: function              ; Function memcmp
global EXP(memcmp): function           ; ?OVR_ removed if standard function memcmp overridden
; Direct entries to CPU-specific versions
global memcmpSSE2: function            ; SSE2 version
global memcmpAVX2: function            ; AVX2 version

; Imported from instrset64.asm
extern InstructionSet                 ; Instruction set for CPU dispatcher

default rel

; define registers used for parameters
%IFDEF  WINDOWS
%define par1   rcx                     ; function parameter 1
%define par2   rdx                     ; function parameter 2
%define par3   r8                      ; function parameter 3
%define par4   r9                      ; scratch register
%define par4d  r9d                     ; scratch register
%ENDIF
%IFDEF  UNIX
%define par1   rdi                     ; function parameter 1
%define par2   rsi                     ; function parameter 2
%define par3   rdx                     ; function parameter 3
%define par4   rcx                     ; scratch register
%define par4d  ecx                     ; scratch register
%ENDIF



SECTION .text  align=16

; extern "C" int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
; Function entry:
A_memcmp:
EXP(memcmp):
        jmp     qword [memcmpDispatch] ; Go to appropriate version, depending on instruction set


align 16
memcmpAVX2:    ; AVX2 version. Use ymm register
memcmpAVX2@:   ; internal reference

        add     par1, par3                       ; use negative index from end of memory block
        add     par2, par3
        neg     par3
        jz      A900
        mov     par4d, 0FFFFH                    ; constant for inverting 16-bit byte masks
        cmp     par3, -32
        ja      A100
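
; The loop below scans with a negative index that counts up toward zero, so a
; single add both advances the offset and tests for the end. A hedged C
; equivalent of this indexing scheme (byte-at-a-time for clarity; the names
; are illustrative, not part of this library):
;
;   #include <stddef.h>
;
;   static int memcmp_neg_index(const void *ptr1, const void *ptr2, size_t count) {
;       const unsigned char *e1 = (const unsigned char *)ptr1 + count;  // end of block 1
;       const unsigned char *e2 = (const unsigned char *)ptr2 + count;  // end of block 2
;       for (ptrdiff_t i = -(ptrdiff_t)count; i != 0; i++)              // i runs -count..0
;           if (e1[i] != e2[i]) return e1[i] - e2[i];
;       return 0;
;   }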
        
A000:   ; loop comparing 32 bytes
        vmovdqu   ymm1, [par1+par3]
        vpcmpeqb  ymm0, ymm1, [par2+par3]        ; compare 32 bytes
        vpmovmskb eax, ymm0                      ; get byte mask
        xor     eax, -1                          ; invert all bits (not eax would not set the flags)
        jnz     A700                             ; difference found
        add     par3, 32
        jz      A900                             ; finished, equal
        cmp     par3, -32
        jna     A000                             ; next 32 bytes
        vzeroupper                               ; end ymm state
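
; A hedged intrinsics rendering of the 32-byte compare step above (GCC/Clang
; style; immintrin.h and __builtin_ctz are assumptions, not used by this file):
;
;   #include <immintrin.h>
;
;   // Returns -1 if the 32 bytes at a and b are equal, otherwise the index of
;   // the first differing byte (the bsf on an inverted mask done at A700).
;   static int diff32(const unsigned char *a, const unsigned char *b) {
;       __m256i va = _mm256_loadu_si256((const __m256i *)a);
;       __m256i vb = _mm256_loadu_si256((const __m256i *)b);
;       unsigned eq = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(va, vb));
;       return eq == 0xFFFFFFFFu ? -1 : (int)__builtin_ctz(~eq);
;   }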
        
A100:   ; less than 32 bytes left
        cmp     par3, -16
        ja      A200
        movdqu  xmm1, [par1+par3]
        movdqu  xmm2, [par2+par3]
        pcmpeqb xmm1, xmm2                       ; compare 16 bytes
        pmovmskb eax, xmm1                       ; get byte mask
        xor     eax, par4d                       ; invert lower 16 bits
        jnz     A701                             ; difference found
        add     par3, 16
        jz      A901                             ; finished, equal
        
A200:   ; less than 16 bytes left
        cmp     par3, -8
        ja      A300
        ; compare 8 bytes
        movq    xmm1, [par1+par3]
        movq    xmm2, [par2+par3]
        pcmpeqb xmm1, xmm2                       ; compare 8 bytes; bytes 8-15 are zero in both registers and compare equal
        pmovmskb eax, xmm1                       ; get byte mask
        xor     eax, par4d                       ; invert lower 16 bits
        jnz     A701                             ; difference found
        add     par3, 8
        jz      A901 
        
A300:   ; less than 8 bytes left
        cmp     par3, -4
        ja      A400
        ; compare 4 bytes
        movd    xmm1, [par1+par3]
        movd    xmm2, [par2+par3]
        pcmpeqb xmm1, xmm2                       ; compare 4 bytes; bytes 4-15 are zero in both registers and compare equal
        pmovmskb eax, xmm1                       ; get byte mask
        xor     eax, par4d                       ; invert lower 16 bits
        jnz     A701                             ; difference found
        add     par3, 4
        jz      A901 

A400:   ; less than 4 bytes left
        cmp     par3, -2
        ja      A500
        movzx   eax, word [par1+par3]
        movzx   par4d, word [par2+par3]
        sub     eax, par4d
        jnz     A800                             ; difference in byte 0 or 1
        add     par3, 2
        jz      A901 
        
A500:   ; less than 2 bytes left
        test    par3, par3
        jz      A901                             ; no bytes left
        
A600:   ; one byte left
        movzx   eax, byte [par1+par3]
        movzx   par4d, byte [par2+par3]
        sub     eax, par4d                         ; return result
        ret

A700:   ; difference found. find position
        vzeroupper
A701:   
        bsf     eax, eax                         ; lowest set bit = position of first differing byte
        add     par3, rax                        ; point index at the differing byte
        movzx   eax, byte [par1+par3]
        movzx   par4d, byte [par2+par3]
        sub     eax, par4d                         ; return result
        ret

A800:   ; difference in byte 0 or 1
        neg     al                               ; CF = 1 if al != 0, i.e. if byte 0 differs
        sbb     par3, -1                         ; par3 + 1 - CF: advance to byte 1 only if byte 0 is equal
        movzx   eax, byte [par1+par3]
        movzx   par4d, byte [par2+par3]
        sub     eax, par4d                         ; return result
        ret
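
; The neg/sbb pair above is a branchless "add 1 if zero": neg al sets CF
; exactly when al is nonzero, and sbb par3, -1 computes par3 + 1 - CF.
; A hedged C equivalent (diff stands for the word difference computed above):
;
;   i += ((diff & 0xFF) == 0);   // low bytes equal, so the difference is in byte 1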

A900:   ; equal
        vzeroupper
A901:   xor     eax, eax                         ; return 0
        ret
        

memcmpSSE2:    ; SSE2 version. Use xmm register
memcmpSSE2@:   ; internal reference

        add     par1, par3                         ; use negative index from end of memory block
        add     par2, par3
        neg     par3
        jz      S900 
        mov     par4d, 0FFFFH                    ; constant for inverting 16-bit byte masks
        cmp     par3, -16
        ja      S200
        
S100:   ; loop comparing 16 bytes
        movdqu  xmm1, [par1+par3]
        movdqu  xmm2, [par2+par3]
        pcmpeqb xmm1, xmm2                       ; compare 16 bytes
        pmovmskb eax, xmm1                       ; get byte mask
        xor     eax, par4d                       ; invert lower 16 bits
        jnz     S700                             ; difference found
        add     par3, 16
        jz      S900                             ; finished, equal
        cmp     par3, -16
        jna     S100                             ; next 16 bytes
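
; The same mask trick with SSE2 intrinsics, as a hedged sketch (emmintrin.h
; and __builtin_ctz are assumptions, not used by this file):
;
;   #include <emmintrin.h>
;
;   // Returns -1 if the 16 bytes at a and b are equal, otherwise the index of
;   // the first differing byte; ~eq always has a set bit below 16 here.
;   static int diff16(const unsigned char *a, const unsigned char *b) {
;       __m128i va = _mm_loadu_si128((const __m128i *)a);
;       __m128i vb = _mm_loadu_si128((const __m128i *)b);
;       unsigned eq = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(va, vb));
;       return eq == 0xFFFFu ? -1 : (int)__builtin_ctz(~eq);
;   }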
        
S200:   ; less than 16 bytes left
        cmp     par3, -8
        ja      S300
        ; compare 8 bytes
        movq    xmm1, [par1+par3]
        movq    xmm2, [par2+par3]
        pcmpeqb xmm1, xmm2                       ; compare 8 bytes; bytes 8-15 are zero in both registers and compare equal
        pmovmskb eax, xmm1                       ; get byte mask
        xor     eax, par4d                       ; invert lower 16 bits
        jnz     S700                             ; difference found
        add     par3, 8
        jz      S900 
        
S300:   ; less than 8 bytes left
        cmp     par3, -4
        ja      S400
        ; compare 4 bytes
        movd    xmm1, [par1+par3]
        movd    xmm2, [par2+par3]
        pcmpeqb xmm1, xmm2                       ; compare 4 bytes; bytes 4-15 are zero in both registers and compare equal
        pmovmskb eax, xmm1                       ; get byte mask
        xor     eax, par4d                       ; invert lower 16 bits
        jnz     S700                             ; difference found
        add     par3, 4
        jz      S900 

S400:   ; less than 4 bytes left
        cmp     par3, -2
        ja      S500
        movzx   eax, word [par1+par3]
        movzx   par4d, word [par2+par3]
        sub     eax, par4d
        jnz     S800                             ; difference in byte 0 or 1
        add     par3, 2
        jz      S900 
        
S500:   ; less than 2 bytes left
        test    par3, par3
        jz      S900                             ; no bytes left
        
        ; one byte left
        movzx   eax, byte [par1+par3]
        movzx   par4d, byte [par2+par3]
        sub     eax, par4d                         ; return result
        ret

S700:   ; difference found. find position
        bsf     eax, eax                         ; lowest set bit = position of first differing byte
        add     par3, rax                        ; point index at the differing byte
        movzx   eax, byte [par1+par3]
        movzx   par4d, byte [par2+par3]
        sub     eax, par4d                         ; return result
        ret

S800:   ; difference in byte 0 or 1
        neg     al                               ; CF = 1 if al != 0, i.e. if byte 0 differs
        sbb     par3, -1                         ; par3 + 1 - CF: advance to byte 1 only if byte 0 is equal
S820:   movzx   eax, byte [par1+par3]
        movzx   par4d, byte [par2+par3]
        sub     eax, par4d                         ; return result
        ret

S900:   ; equal
        xor     eax, eax                         ; return 0
        ret

        
; CPU dispatching for memcmp. This is executed only once
memcmpCPUDispatch:
        push    par1
        push    par2
        push    par3        
        call    InstructionSet                         ; get supported instruction set
        ; SSE2 always supported
        lea     par4, [memcmpSSE2@]
        cmp     eax, 13                          ; InstructionSet returns >= 13 if AVX2 is supported
        jb      Q100
        ; AVX2 supported
        lea     par4, [memcmpAVX2@]        
Q100:   ; save pointer
        mov     qword [memcmpDispatch], par4
; Continue in appropriate version of memcmp
        pop     par3
        pop     par2
        pop     par1
        jmp     par4
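
; A hedged C analogue of this one-time dispatch pattern (the names are
; illustrative; InstructionSet is the detection routine imported above):
;
;   #include <stddef.h>
;
;   typedef int (*memcmp_fn)(const void *, const void *, size_t);
;
;   static int memcmp_sse2(const void *p1, const void *p2, size_t n);
;   static int memcmp_avx2(const void *p1, const void *p2, size_t n);
;   static int memcmp_first_call(const void *p1, const void *p2, size_t n);
;
;   static memcmp_fn memcmp_ptr = memcmp_first_call;   // like memcmpDispatch below
;
;   static int memcmp_first_call(const void *p1, const void *p2, size_t n) {
;       extern int InstructionSet(void);
;       memcmp_ptr = InstructionSet() >= 13 ? memcmp_avx2 : memcmp_sse2;
;       return memcmp_ptr(p1, p2, n);   // later calls go straight to the chosen version
;   }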


SECTION .data
align 16


; Pointer to appropriate version.
; This initially points to memcmpCPUDispatch. memcmpCPUDispatch will
; change this to the appropriate version of memcmp, so that
; memcmpCPUDispatch is only executed once:
memcmpDispatch DQ memcmpCPUDispatch