1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
|
// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implements 64-bit multiword CRC for Microsoft and Intel compilers
// using MMX instructions (i386).
#include "generic_crc.h"
#if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
namespace crcutil {
// CRC_WORD_MMX(): folds the single 64-bit word held in BUF0 into CRC0.
//
// Expands to a sequence of MSVC "__asm" statements, so it may only be used
// inside an __asm block, and only where the register aliases BUF0, CRC0,
// TMP0, TMP0L, TEMP and TABLE are #defined (they are defined inside
// CrcMultiwordI386Mmx, after this macro but before its first expansion).
//
// Steps:
//   1. BUF0 ^= CRC0.
//   2. The low 32 bits of BUF0 are moved to TMP0 and BUF0 is shifted right
//      by 32, so a later "movd TMP0, BUF0" yields the high 32 bits.
//   3. Each of the 8 bytes (lowest first) is zero-extended into TEMP
//      (the 4th and 8th are whatever is left in TMP0 after three 8-bit
//      shifts) and used to index sub-table k (k = 0..7) located at
//      TABLE + k * 256 * 8; every entry is 8 bytes wide.
//   4. The first lookup is loaded into CRC0 with movq, overwriting the old
//      value (already folded into BUF0 in step 1); the other seven lookups
//      are XORed in.
//
// In:       BUF0 = data word, CRC0 = CRC value, TABLE = base address of the
//           8 consecutive 256-entry tables.
// Out:      CRC0 = updated CRC value.
// Clobbers: BUF0, TMP0, TEMP, flags.
//
// No comments are placed inside the macro body because the whole body is
// one backslash-continued logical line.
#define CRC_WORD_MMX() \
__asm pxor BUF0, CRC0 \
__asm movd TMP0, BUF0 \
__asm psrlq BUF0, 32 \
__asm movzx TEMP, TMP0L \
__asm shr TMP0, 8 \
__asm movq CRC0, [TABLE + TEMP * 8] \
__asm movzx TEMP, TMP0L \
__asm shr TMP0, 8 \
__asm pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8] \
__asm movzx TEMP, TMP0L \
__asm shr TMP0, 8 \
__asm pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8] \
__asm pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8] \
__asm movd TMP0, BUF0 \
__asm movzx TEMP, TMP0L \
__asm shr TMP0, 8 \
__asm pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8] \
__asm movzx TEMP, TMP0L \
__asm shr TMP0, 8 \
__asm pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8] \
__asm movzx TEMP, TMP0L \
__asm shr TMP0, 8 \
__asm pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8] \
__asm pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
// Suppress warning C4731 ("frame pointer register 'ebp' modified by inline
// assembly code"): the assembly below deliberately uses ebp as the table
// base register and saves/restores it with push/pop.
#pragma warning(disable: 4731)
// Specialization of CrcMultiwordI386Mmx for 64-bit CRCs processed as four
// interleaved 64-bit words, implemented with MMX inline assembly.
//
// Parameters:
//   data  - first byte of the input.
//   bytes - number of input bytes.
//   start - CRC value to continue from.
// Returns the updated CRC value. "start" is XORed with
// this->Base().Canonize() on entry and the result is XORed with the same
// value again before it is returned.
//
// Processing stages:
//   1. ALIGN_ON_WORD_BOUNDARY_IF_NEEDED (defined elsewhere; presumably it
//      consumes leading bytes until src is word-aligned -- TODO confirm).
//   2. Main loop: 32 bytes (four 64-bit words) per iteration, using the
//      crc_word_interleaved_ table and four CRC accumulators.
//   3. The four words left preloaded by the main loop are folded one after
//      another with the crc_word_ table via CRC_WORD_MMX().
//   4. Word loop: one 64-bit word at a time while at least 8 bytes remain.
//   5. Remaining bytes (fewer than 8) are handled in C++ with CRC_BYTE.
template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
const void *data,
size_t bytes,
const uint64 &start) const {
const uint8 *src = static_cast<const uint8 *>(data);
const uint8 *end = src + bytes;
uint64 crc0 = start ^ this->Base().Canonize();
ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64);
// Nothing left after the alignment step: finish right away.
if (src >= end) {
return (crc0 ^ this->Base().Canonize());
}
// Register aliases used by the inline assembly below and by CRC_WORD_MMX().
// CRC0..CRC3: the four CRC accumulators (MMX registers).
#define CRC0 mm0
#define CRC1 mm1
#define CRC2 mm2
#define CRC3 mm3
// BUF0..BUF3: the four 64-bit input words currently being processed.
#define BUF0 mm4
#define BUF1 mm5
#define BUF2 mm6
#define BUF3 mm7
// TMPn: 32-bit scratch register for stream n; TMPnL / TMPnH are its lowest
// byte and second-lowest byte respectively.
#define TMP0 eax
#define TMP0L al
#define TMP0H ah
#define TMP1 ebx
#define TMP1L bl
#define TMP1H bh
#define TMP2 ecx
#define TMP2L cl
#define TMP2H ch
#define TMP3 edx
#define TMP3L dl
#define TMP3H dh
// TEMP: zero-extended table index. SRC: current input pointer.
#define TEMP edi
#define SRC esi
// END: loop bound. Inside the main loop every general-purpose register is
// taken, so the bound is kept on top of the stack; after the main loop END
// is redefined as TMP1.
#define END [esp]
// TABLE: base address of the lookup table currently in use. It aliases ebp,
// hence the explicit push/pop of ebp in the assembly.
#define TABLE ebp
const uint64 *interleaved_table_address =
&this->crc_word_interleaved_[0][0];
const uint64 *word_table_address = &this->crc_word_[0][0];
__asm {
// Save ebp: it is overwritten when TABLE is loaded below.
// NOTE(review): every C++ local is read before TABLE is loaded and written
// only after ebp is popped, presumably because the compiler addresses
// locals relative to ebp -- confirm before reordering anything here.
push ebp
mov TMP0, interleaved_table_address
movq CRC0, crc0
mov SRC, src
// TMP1 = end - 63: the main loop is entered only if SRC < end - 63,
// i.e. at least 64 bytes are available.
mov TMP1, end
sub TMP1, 2*4*8 - 1
cmp SRC, TMP1
// mov does not change flags, so the jae below still tests the cmp above.
mov TABLE, word_table_address
jae end_main_loop
// Stack layout during the main loop: [esp] = end - 63 (END),
// [esp + 4] = word table address, [esp + 8] = saved ebp.
push TABLE
mov TABLE, TMP0
push TMP1
// Streams 1..3 start from zero; stream 0 carries the incoming CRC.
pxor CRC1, CRC1
pxor CRC2, CRC2
pxor CRC3, CRC3
// Preload the first four words; the loop always works on words loaded
// by the previous iteration.
movq BUF0, [SRC]
movq BUF1, [SRC + 1 * 8]
movq BUF2, [SRC + 2 * 8]
movq BUF3, [SRC + 3 * 8]
main_loop:
#if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
prefetcht0 [SRC + CRCUTIL_PREFETCH_WIDTH]
#endif
// Account for the 32 bytes already held in BUF0..BUF3.
add SRC, 32
// Fold each accumulator into its data word.
pxor BUF0, CRC0
pxor BUF1, CRC1
pxor BUF2, CRC2
pxor BUF3, CRC3
// TMPn = low 32 bits of word n; BUFn keeps the high 32 bits in its low half.
movd TMP0, BUF0
psrlq BUF0, 32
movd TMP1, BUF1
psrlq BUF1, 32
movd TMP2, BUF2
psrlq BUF2, 32
movd TMP3, BUF3
psrlq BUF3, 32
// Byte 0 -> sub-table 0. movq overwrites the old accumulator, which has
// already been folded into the data word above.
movzx TEMP, TMP0L
movq CRC0, [TABLE + TEMP * 8]
movzx TEMP, TMP1L
movq CRC1, [TABLE + TEMP * 8]
movzx TEMP, TMP2L
movq CRC2, [TABLE + TEMP * 8]
movzx TEMP, TMP3L
movq CRC3, [TABLE + TEMP * 8]
// Byte 1 (TMPnH) -> sub-table 1; then drop bytes 0 and 1.
movzx TEMP, TMP0H
shr TMP0, 16
pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8]
movzx TEMP, TMP1H
shr TMP1, 16
pxor CRC1, [TABLE + TEMP * 8 + 1 * 256 * 8]
movzx TEMP, TMP2H
shr TMP2, 16
pxor CRC2, [TABLE + TEMP * 8 + 1 * 256 * 8]
movzx TEMP, TMP3H
shr TMP3, 16
pxor CRC3, [TABLE + TEMP * 8 + 1 * 256 * 8]
// Byte 2 -> sub-table 2; after the shift TMPn holds only byte 3.
movzx TEMP, TMP0L
shr TMP0, 8
pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8]
movzx TEMP, TMP1L
shr TMP1, 8
pxor CRC1, [TABLE + TEMP * 8 + 2 * 256 * 8]
movzx TEMP, TMP2L
shr TMP2, 8
pxor CRC2, [TABLE + TEMP * 8 + 2 * 256 * 8]
movzx TEMP, TMP3L
shr TMP3, 8
pxor CRC3, [TABLE + TEMP * 8 + 2 * 256 * 8]
// Byte 3 -> sub-table 3, then reload TMPn with the high 32 bits of word n.
pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8]
movd TMP0, BUF0
pxor CRC1, [TABLE + TMP1 * 8 + 3 * 256 * 8]
movd TMP1, BUF1
pxor CRC2, [TABLE + TMP2 * 8 + 3 * 256 * 8]
movd TMP2, BUF2
pxor CRC3, [TABLE + TMP3 * 8 + 3 * 256 * 8]
movd TMP3, BUF3
// Byte 4 -> sub-table 4.
movzx TEMP, TMP0L
pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8]
movzx TEMP, TMP1L
pxor CRC1, [TABLE + TEMP * 8 + 4 * 256 * 8]
movzx TEMP, TMP2L
pxor CRC2, [TABLE + TEMP * 8 + 4 * 256 * 8]
movzx TEMP, TMP3L
pxor CRC3, [TABLE + TEMP * 8 + 4 * 256 * 8]
// Byte 5 (TMPnH) -> sub-table 5; then drop bytes 4 and 5.
movzx TEMP, TMP0H
shr TMP0, 16
pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8]
movzx TEMP, TMP1H
shr TMP1, 16
pxor CRC1, [TABLE + TEMP * 8 + 5 * 256 * 8]
movzx TEMP, TMP2H
shr TMP2, 16
pxor CRC2, [TABLE + TEMP * 8 + 5 * 256 * 8]
movzx TEMP, TMP3H
shr TMP3, 16
pxor CRC3, [TABLE + TEMP * 8 + 5 * 256 * 8]
// Byte 6 -> sub-table 6; after the shift TMPn holds only byte 7.
movzx TEMP, TMP0L
shr TMP0, 8
pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8]
movzx TEMP, TMP1L
shr TMP1, 8
pxor CRC1, [TABLE + TEMP * 8 + 6 * 256 * 8]
movzx TEMP, TMP2L
shr TMP2, 8
pxor CRC2, [TABLE + TEMP * 8 + 6 * 256 * 8]
movzx TEMP, TMP3L
shr TMP3, 8
pxor CRC3, [TABLE + TEMP * 8 + 6 * 256 * 8]
// Byte 7 -> sub-table 7, interleaved with preloading the next four words.
pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
movq BUF0, [SRC]
pxor CRC1, [TABLE + TMP1 * 8 + 7 * 256 * 8]
movq BUF1, [SRC + 1 * 8]
pxor CRC2, [TABLE + TMP2 * 8 + 7 * 256 * 8]
movq BUF2, [SRC + 2 * 8]
pxor CRC3, [TABLE + TMP3 * 8 + 7 * 256 * 8]
movq BUF3, [SRC + 3 * 8]
// Continue while SRC < end - 63 (unsigned compare against [esp]).
cmp END, SRC
ja main_loop
// From here on the loop bound lives in TMP1 instead of on the stack.
#undef END
#define END TMP1
// Restore END = end - 63 and TABLE = word table address (pushed above).
pop END
pop TABLE
// BUF0..BUF3 hold 32 bytes that were loaded but not yet processed.
// Count them as consumed and fold them sequentially with the word table:
// each step XORs the stream's accumulator into its word and runs it
// through CRC_WORD_MMX(), which also XORs in the running CRC0.
add SRC, 32
CRC_WORD_MMX()
pxor BUF1, CRC1
movq BUF0, BUF1
CRC_WORD_MMX()
pxor BUF2, CRC2
movq BUF0, BUF2
CRC_WORD_MMX()
pxor BUF3, CRC3
movq BUF0, BUF3
CRC_WORD_MMX()
end_main_loop:
// On both paths END (TMP1) is end - 63 here; make it end - 7 so the word
// loop runs while at least 8 bytes remain.
add END, 2*4*8 - 8
cmp SRC, END
jae end_word_loop
word_loop:
// One 64-bit word per iteration using the word table.
movq BUF0, [SRC]
add SRC, 8
CRC_WORD_MMX()
cmp END, SRC
ja word_loop
end_word_loop:
#if 0 // Plain C version is faster?
// Disabled assembly version of the trailing byte loop; the C++ loop after
// the __asm block is used instead.
add END, 7
cmp SRC, END
jae end_byte_loop
byte_loop:
movd TMP0, CRC0
movzx TEMP, byte ptr [SRC]
movzx TMP0, TMP0L
psrlq CRC0, 8
xor TEMP, TMP0
add SRC, 1
pxor CRC0, [TABLE + TEMP*8 + 7*256*8]
cmp END, SRC
ja byte_loop
end_byte_loop:
#endif
// Restore ebp before touching C++ locals again, then write back the
// input position and the CRC value.
pop ebp
mov src, SRC
movq crc0, CRC0
// Leave MMX state so that later x87 floating-point code works.
emms
}
#if 1
// Compute CRC of remaining bytes.
for (;src < end; ++src) {
CRC_BYTE(this, crc0, *src);
}
#endif
return (crc0 ^ this->Base().Canonize());
}
} // namespace crcutil
#endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
|