1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
|
%include "defs.asm"
;************************* unalignedisfaster64.asm ******************************
; Author: Agner Fog
; Date created: 2011-07-09
; Last modified: 2013-08-30
; Source URL: www.agner.org/optimize
; Project: asmlib.zip
; Language: assembly, NASM/YASM syntax, 64 bit
;
; C++ prototype:
; extern "C" int UnalignedIsFaster(void);
;
; Description:
; This function finds out whether an unaligned 16-byte memory read is
; faster than an aligned read followed by an alignment shift (PALIGNR) on the
; current CPU.
;
; Return value:
; 0: Unaligned read is probably slower than alignment shift
; 1: Unknown
; 2: Unaligned read is probably faster than alignment shift
;
;
; C++ prototype:
; extern "C" int Store256BitIsFaster(void);
;
; Description:
; This function finds out whether a 32-byte memory write is
; faster than two 16-byte writes on the current CPU.
;
; Return value:
; 0: 32-bytes memory write is slower or AVX not supported
; 1: Unknown
; 2: 32-bytes memory write is faster
;
; Copyright (c) 2011 - 2013 GNU General Public License www.gnu.org/licenses
;******************************************************************************
;
; C++ prototype:
; extern "C" int UnalignedIsFaster(void);
; Entry points exported by this file
global UnalignedIsFaster: function
global Store256BitIsFaster: function
; Routines defined outside this file and called by the functions below
extern CpuType
extern InstructionSet
; Everything that follows is code
SECTION .text
;------------------------------------------------------------------------------
; extern "C" int UnalignedIsFaster(void);
;
; Asks CpuType for the vendor, family and model of the current CPU and
; classifies the CPU from those three numbers.
;
; In:    nothing
; Out:   eax = 0  unaligned read is probably slower than alignment shift
;              1  unknown
;              2  unaligned read is probably faster than alignment shift
; Stack: three 8-byte zero-initialised slots (vendor, family, model) whose
;        addresses are passed to CpuType.  With the ABI-mandated entry value
;        rsp = 16n+8 the three pushes leave rsp 16-byte aligned at the call.
;        The Windows build additionally reserves the 32 bytes of shadow space
;        that the Microsoft x64 convention requires; without it the callee's
;        home area would overlap the three slots and our return address.
; Clobb: rcx, rdx, r8, flags, plus whatever CpuType modifies
;------------------------------------------------------------------------------
UnalignedIsFaster:
%ifdef UNIX
        push    0                       ; vendor
        mov     rdi, rsp                ; arg 1 = &vendor
        push    0                       ; family
        mov     rsi, rsp                ; arg 2 = &family
        push    0                       ; model
        mov     rdx, rsp                ; arg 3 = &model
%else   ; WINDOWS
        push    0                       ; vendor
        mov     rcx, rsp                ; arg 1 = &vendor
        push    0                       ; family
        mov     rdx, rsp                ; arg 2 = &family
        push    0                       ; model
        mov     r8, rsp                 ; arg 3 = &model
        sub     rsp, 32                 ; shadow space for the callee
%endif
        call    CpuType                 ; get vendor, family, model
%ifndef UNIX
        add     rsp, 32                 ; release shadow space
%endif
        pop     rdx                     ; edx = model
        pop     rcx                     ; ecx = family
        pop     r8                      ; r8d = vendor
        xor     eax, eax                ; return value, default 0

        ; Dispatch on vendor: 1 = Intel, 2 = AMD, 3 = VIA
        dec     r8d
        jz      .intel
        dec     r8d
        jz      .amd
        dec     r8d
        jz      .via
        ; unknown vendor
        inc     eax                     ; return 1
        jmp     .done

.intel: ; Unaligned read is faster on Intel Nehalem and later, but not Atom
        ; Nehalem  = family 6, model 1AH
        ; Atom     = family 6, model 1CH
        ; Netburst = family 0FH
        ; Future models are likely to be family 6, maybe > 6, model > 1C
        cmp     ecx, 6
        jb      .done                   ; old Pentium 1, etc
        cmp     ecx, 0FH
        je      .done                   ; old Netburst architecture
        cmp     edx, 1AH
        jb      .done                   ; earlier than Nehalem
        cmp     edx, 1CH
        je      .done                   ; Intel Atom
        or      eax, 2                  ; Intel Nehalem and later, except Atom
        jmp     .done

.amd:   ; AMD processors:
        ; The PALIGNR instruction is slow on AMD Bobcat but fast on Jaguar
        ; K10/Opteron = family 10H      ; Use unaligned
        ; Bobcat      = family 14H      ; PALIGNR is very slow. Use unaligned
        ; Piledriver  = family 15H      ; Use unaligned
        ; Jaguar      = family 16H      ; PALIGNR is fast. Use aligned
        ;                               ; (aligned is faster in most cases, but not all)
        cmp     ecx, 10H
        jb      .done                   ; AMD K8 or earlier: use aligned
        cmp     ecx, 16H
        je      .done                   ; Jaguar: use aligned
        or      eax, 2                  ; AMD K10 or later: use unaligned
        jmp     .done

.via:   ; Unaligned read is not faster than PALIGNR on VIA Nano 2000 and 3000
        cmp     ecx, 0FH
        jna     .done                   ; VIA Nano: return 0
        inc     eax                     ; Future versions: unknown, return 1

.done:  ret
; end of UnalignedIsFaster
;------------------------------------------------------------------------------
; extern "C" int Store256BitIsFaster(void);
;
; Returns 0 immediately when InstructionSet reports a level below 11 (AVX).
; Otherwise asks CpuType for vendor, family and model and classifies the CPU.
;
; In:    nothing
; Out:   eax = 0  32-byte memory write is slower, or AVX not supported
;              1  unknown
;              2  32-byte memory write is faster
; Stack: entry rsp = 16n+8 per the ABI, so rsp is padded before the call to
;        InstructionSet to make it 16-byte aligned at the call site.  For the
;        CpuType call the three 8-byte pushes (vendor, family, model) provide
;        the alignment.  The Windows build additionally reserves 32 bytes of
;        shadow space in front of each call.
; Clobb: rcx, rdx, flags, plus whatever InstructionSet and CpuType modify
;------------------------------------------------------------------------------
Store256BitIsFaster:
%ifdef UNIX
        sub     rsp, 8                  ; align stack for the call
%else   ; WINDOWS
        sub     rsp, 40                 ; 32 bytes shadow space + 8 bytes alignment
%endif
        call    InstructionSet
%ifdef UNIX
        add     rsp, 8
%else   ; WINDOWS
        add     rsp, 40
%endif
        cmp     eax, 11                 ; AVX supported
        jb      .slower                 ; no AVX: return 0

%ifdef UNIX
        push    0                       ; vendor
        mov     rdi, rsp                ; arg 1 = &vendor
        push    0                       ; family
        mov     rsi, rsp                ; arg 2 = &family
        push    0                       ; model
        mov     rdx, rsp                ; arg 3 = &model
%else   ; WINDOWS
        push    0                       ; vendor
        mov     rcx, rsp                ; arg 1 = &vendor
        push    0                       ; family
        mov     rdx, rsp                ; arg 2 = &family
        push    0                       ; model
        mov     r8, rsp                 ; arg 3 = &model
        sub     rsp, 32                 ; shadow space for the callee
%endif
        call    CpuType                 ; get vendor, family, model
%ifndef UNIX
        add     rsp, 32                 ; release shadow space
%endif
        pop     rdx                     ; edx = model
        pop     rcx                     ; ecx = family
        pop     rax                     ; eax = vendor

        cmp     eax, 1                  ; Intel
        je      .intel
        cmp     eax, 2                  ; AMD
        je      .amd
        jmp     .unknown                ; VIA (3) or other vendor: don't know

.intel: cmp     ecx, 6
        jne     .faster                 ; unknown family. possibly future model
        ; model 2AH Sandy Bridge
        ; model 3AH Ivy Bridge
        ; model 3CH Haswell
        ; Sandy Bridge and Ivy Bridge are slightly faster with 128 than with
        ; 256 bit moves on large data blocks
        ; Haswell is much faster with 256 bit moves
        cmp     edx, 3AH
        jbe     .slower                 ; model 3AH or lower: return 0
        jmp     .faster

.amd:   ; family 15H = Bulldozer, Piledriver
        ; family 16H = Jaguar. 256 bit write is slightly faster
        cmp     ecx, 15H
        ja      .faster                 ; assume future AMD families are faster
        ; model 1 = Bulldozer is a little slower on 256 bit write
        ; model 2 = Piledriver is terribly slow on 256 bit write
        ; assume future models 3-4 are like Bulldozer
        cmp     edx, 4
        jbe     .slower
        jmp     .unknown                ; later models: don't know

.slower:
        xor     eax, eax                ; return 0
        ret
.unknown:
        mov     eax, 1                  ; return 1
        ret
.faster:
        mov     eax, 2                  ; return 2
        ret
; end of Store256BitIsFaster
|