%include "defs.asm"
;************************* memcmp64.asm *************************************
; Author: Agner Fog
; Date created: 2013-10-03
; Last modified: 2013-10-03
; Description:
; Faster version of the standard memcmp function:
;
; int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
;
; Compares two memory blocks of size count.
; The return value is zero if the two memory blocks ptr1 and ptr2 are equal.
; The return value is positive if the first differing byte of ptr1 is greater
; than the corresponding byte of ptr2 when compared as unsigned bytes.
; The return value is negative if the first differing byte of ptr1 is smaller
; than the corresponding byte of ptr2 when compared as unsigned bytes.
;
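; For reference, a minimal C usage sketch (illustrative only, not part of the
; library; the function name example and the buffer contents are hypothetical):
;
;    #include <stddef.h>
;    extern int A_memcmp(const void * ptr1, const void * ptr2, size_t count);
;
;    int example(void) {
;        char a[4] = {1, 2, 3, 4};
;        char b[4] = {1, 2, 9, 4};
;        return A_memcmp(a, b, sizeof a);   /* negative here: a[2] < b[2] */
;    }
;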
; Overriding standard function memcmp:
; The alias ?OVR_memcmp is changed to memcmp in the object file if
; it is desired to override the standard library function memcmp.
;
; Optimization:
; Uses XMM registers if only SSE2 is available, YMM registers if AVX2 is available.
;
; The latest version of this file is available at:
; www.agner.org/optimize/asmexamples.zip
; Copyright (c) 2013 GNU General Public License www.gnu.org/licenses
;******************************************************************************
global A_memcmp: function ; Function memcmp
global EXP(memcmp): function ; ?OVR_ removed if standard function memcmp overridden
; Direct entries to CPU-specific versions
global memcmpSSE2: function ; SSE2 version
global memcmpAVX2: function ; AVX2 version
; Imported from instrset64.asm
extern InstructionSet ; Instruction set for CPU dispatcher
default rel
; define registers used for parameters
%IFDEF WINDOWS
%define par1 rcx ; function parameter 1
%define par2 rdx ; function parameter 2
%define par3 r8 ; function parameter 3
%define par4 r9 ; scratch register
%define par4d r9d ; scratch register
%ENDIF
%IFDEF UNIX
%define par1 rdi ; function parameter 1
%define par2 rsi ; function parameter 2
%define par3 rdx ; function parameter 3
%define par4 rcx ; scratch register
%define par4d ecx ; scratch register
%ENDIF
SECTION .text align=16
; extern "C" int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
; Function entry:
A_memcmp:
EXP(memcmp):
jmp qword [memcmpDispatch] ; Go to appropriate version, depending on instruction set
align 16
memcmpAVX2: ; AVX2 version. Uses ymm registers
memcmpAVX2@: ; internal reference
add par1, par3 ; use negative index from end of memory block
add par2, par3
neg par3
jz A900
mov par4d, 0FFFFH ; mask for the 16 valid bits of pmovmskb on an xmm register
cmp par3, -32
ja A100
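; Technique: both pointers have been advanced to the end of the blocks and
; par3 holds the negative byte count, so [par1+par3] addresses the first byte
; and stepping par3 toward zero walks forward through both blocks.
; Each loop iteration compares 32 bytes; vpmovmskb collects one bit per byte
; (1 = equal), so the inverted mask is nonzero as soon as any byte differs.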
A000: ; loop comparing 32 bytes
vmovdqu ymm1, [par1+par3]
vpcmpeqb ymm0, ymm1, [par2+par3] ; compare 32 bytes
vpmovmskb eax, ymm0 ; get byte mask
xor eax, -1 ; not eax would not set flags
jnz A700 ; difference found
add par3, 32
jz A900 ; finished, equal
cmp par3, -32
jna A000 ; next 32 bytes
vzeroupper ; end ymm state
A100: ; less than 32 bytes left
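; Tail handling: fall back to progressively smaller compares (16, 8, 4, 2, 1
; bytes). par4d = 0FFFFH is used because pmovmskb on an xmm register produces
; only 16 mask bits, so xor with 0FFFFH inverts exactly the valid bits.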
cmp par3, -16
ja A200
movdqu xmm1, [par1+par3]
movdqu xmm2, [par2+par3]
pcmpeqb xmm1, xmm2 ; compare 16 bytes
pmovmskb eax, xmm1 ; get byte mask
xor eax, par4d ; invert lower 16 bits
jnz A701 ; difference found
add par3, 16
jz A901 ; finished, equal
A200: ; less than 16 bytes left
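; movq/movd zero-extend the loaded bytes, so the unused upper bytes of xmm1
; and xmm2 always compare equal; after xor with 0FFFFH only genuine
; differences among the loaded bytes remain set in the mask.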
cmp par3, -8
ja A300
; compare 8 bytes
movq xmm1, [par1+par3]
movq xmm2, [par2+par3]
pcmpeqb xmm1, xmm2 ; compare 8 bytes
pmovmskb eax, xmm1 ; get byte mask
xor eax, par4d
jnz A701 ; difference found
add par3, 8
jz A901
A300: ; less than 8 bytes left
cmp par3, -4
ja A400
; compare 4 bytes
movd xmm1, [par1+par3]
movd xmm2, [par2+par3]
pcmpeqb xmm1, xmm2 ; compare 4 bytes
pmovmskb eax, xmm1 ; get byte mask
xor eax, par4d ; invert lower 16 bits
jnz A701 ; difference found
add par3, 4
jz A901
A400: ; less than 4 bytes left
cmp par3, -2
ja A500
movzx eax, word [par1+par3]
movzx par4d, word [par2+par3]
sub eax, par4d
jnz A800 ; difference in byte 0 or 1
add par3, 2
jz A901
A500: ; less than 2 bytes left
test par3, par3
jz A901 ; no bytes left
A600: ; one byte left
movzx eax, byte [par1+par3]
movzx par4d, byte [par2+par3]
sub eax, par4d ; return result
ret
A700: ; difference found. find position
vzeroupper
A701:
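; bsf finds the index of the lowest set mask bit, i.e. the offset of the
; first differing byte within the compared chunk; adding it to par3 gives the
; offset of that byte, which is then reloaded from both blocks and subtracted
; to produce the signed return value.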
bsf eax, eax
add par3, rax
movzx eax, byte [par1+par3]
movzx par4d, byte [par2+par3]
sub eax, par4d ; return result
ret
A800: ; difference in byte 0 or 1
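; After the 2-byte compare, al holds the low-byte difference. neg al sets CF
; only when al != 0, so sbb par3, -1 adds 1 to par3 exactly when byte 0 was
; equal and the difference is in byte 1.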
neg al
sbb par3, -1 ; add 1 to par3 if al == 0
movzx eax, byte [par1+par3]
movzx par4d, byte [par2+par3]
sub eax, par4d ; return result
ret
A900: ; equal
vzeroupper
A901: xor eax, eax
ret
memcmpSSE2: ; SSE2 version. Uses xmm registers
memcmpSSE2@: ; internal reference
add par1, par3 ; use negative index from end of memory block
add par2, par3
neg par3
jz S900
mov par4d, 0FFFFH ; mask for the 16 valid bits of pmovmskb on an xmm register
cmp par3, -16
ja S200
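; Same negative-index technique as the AVX2 version, but the main loop
; compares 16 bytes per iteration with xmm registers; see the comments in
; memcmpAVX2 for the details of the mask handling.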
S100: ; loop comparing 16 bytes
movdqu xmm1, [par1+par3]
movdqu xmm2, [par2+par3]
pcmpeqb xmm1, xmm2 ; compare 16 bytes
pmovmskb eax, xmm1 ; get byte mask
xor eax, par4d ; invert lower 16 bits
jnz S700 ; difference found
add par3, 16
jz S900 ; finished, equal
cmp par3, -16
jna S100 ; next 16 bytes
S200: ; less than 16 bytes left
cmp par3, -8
ja S300
; compare 8 bytes
movq xmm1, [par1+par3]
movq xmm2, [par2+par3]
pcmpeqb xmm1, xmm2 ; compare 8 bytes
pmovmskb eax, xmm1 ; get byte mask
xor eax, par4d ; invert lower 16 bits
jnz S700 ; difference found
add par3, 8
jz S900
S300: ; less than 8 bytes left
cmp par3, -4
ja S400
; compare 4 bytes
movd xmm1, [par1+par3]
movd xmm2, [par2+par3]
pcmpeqb xmm1, xmm2 ; compare 4 bytes
pmovmskb eax, xmm1 ; get byte mask
xor eax, par4d ; invert lower 16 bits
jnz S700 ; difference found
add par3, 4
jz S900
S400: ; less than 4 bytes left
cmp par3, -2
ja S500
movzx eax, word [par1+par3]
movzx par4d, word [par2+par3]
sub eax, par4d
jnz S800 ; difference in byte 0 or 1
add par3, 2
jz S900
S500: ; less than 2 bytes left
test par3, par3
jz S900 ; no bytes left
; one byte left
movzx eax, byte [par1+par3]
movzx par4d, byte [par2+par3]
sub eax, par4d ; return result
ret
S700: ; difference found. find position
bsf eax, eax
add par3, rax
movzx eax, byte [par1+par3]
movzx par4d, byte [par2+par3]
sub eax, par4d ; return result
ret
S800: ; difference in byte 0 or 1
neg al
sbb par3, -1 ; add 1 to par3 if al == 0
S820: movzx eax, byte [par1+par3]
movzx par4d, byte [par2+par3]
sub eax, par4d ; return result
ret
S900: ; equal
xor eax, eax
ret
; CPU dispatching for memcmp. This is executed only once
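; The first call to A_memcmp arrives here through memcmpDispatch. The
; parameter registers are preserved, InstructionSet reports the supported
; instruction set, the best entry point is stored back into memcmpDispatch,
; and execution continues in that version. Later calls jump directly to the
; selected version and never reach this code again.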
memcmpCPUDispatch:
push par1
push par2
push par3
call InstructionSet ; get supported instruction set
; SSE2 always supported
lea par4, [memcmpSSE2@]
cmp eax, 13 ; check AVX2
jb Q100
; AVX2 supported
lea par4, [memcmpAVX2@]
Q100: ; save pointer
mov qword [memcmpDispatch], par4
; Continue in appropriate version of memcmp
pop par3
pop par2
pop par1
jmp par4
SECTION .data
align 16
; Pointer to appropriate version.
; This initially points to memcmpCPUDispatch. memcmpCPUDispatch will
; change this to the appropriate version of memcmp, so that
; memcmpCPUDispatch is only executed once:
memcmpDispatch DQ memcmpCPUDispatch