%include "defs.asm"
;************************** strlen64.asm **********************************
; Author: Agner Fog
; Date created: 2008-07-19
; Last modified: 2008-10-16
; Description:
; Faster version of the standard strlen function:
; size_t strlen(const char * str);
; Finds the length of a zero-terminated string of bytes, optimized for speed.
;
; Overriding standard function strlen:
; The alias ?OVR_strlen is changed to _strlen in the object file if
; it is desired to override the standard library function strlen.
;
; Calling conventions:
; Stack alignment is not required. No shadow space or red zone used.
; Called internally from strcpy and strcat without the stack being aligned.
;
; Optimization:
; Uses XMM registers to read 16 bytes at a time, aligned.
; Misaligned parts of the string are read from the nearest 16-byte boundary
; and the irrelevant part masked out. It may read both before the beginning of
; the string and after the end, but will never load any unnecessary cache
; line and never trigger a page fault for reading from non-existing memory
; pages because it never reads past the nearest following 16-byte boundary.
; It may, though, trigger any debug watch within the same 16-byte boundary.
;
; The latest version of this file is available at:
; www.agner.org/optimize/asmexamples.zip
; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
;******************************************************************************
default rel

global  A_strlen: function              ; library entry point
global  EXP(strlen): function           ; standard name; EXP() is not defined in this file
                                        ; (presumably supplied by defs.asm - confirm there)

section .text align=16

;-----------------------------------------------------------------------
; size_t A_strlen(const char *s)        (second label: EXP(strlen))
;
; Returns the number of bytes that precede the first zero byte of s.
; One body serves both 64-bit calling conventions; the WINDOWS symbol
; selects which argument register is read.
;
; In:    rcx = s   when WINDOWS is defined
;        rdi = s   otherwise
; Out:   rax = length in bytes
; Clobb: rcx, rdx, xmm0, xmm1, flags (plus r8 when WINDOWS is defined)
; Stack: no stack memory is read or written apart from the return address
;
; STR_BASE names the register that keeps the original, unaligned pointer
; until the final subtraction.
;-----------------------------------------------------------------------
A_strlen:
EXP(strlen):
%ifdef WINDOWS
        mov     rax, rcx                ; rax = s, becomes the aligned read pointer
        mov     r8, rcx                 ; r8  = s, kept intact (ecx is reused below)
%define STR_BASE r8
%else
        mov     rax, rdi                ; rax = s, becomes the aligned read pointer
        mov     ecx, edi                ; low 32 bits suffice: only bits 0-3 are used
%define STR_BASE rdi
%endif

        ; --- First chunk -------------------------------------------------
        ; Load the aligned 16 bytes that contain s[0] and discard the match
        ; bits that belong to bytes located in front of the string.
        pxor    xmm0, xmm0              ; xmm0 = all zero, the comparison reference
        and     ecx, 0x0F               ; cl  = s mod 16 = number of leading bytes to ignore
        and     rax, -16                ; rax = s rounded down to a 16-byte boundary
        movdqa  xmm1, [rax]             ; aligned load, cannot cross the 16-byte boundary
        pcmpeqb xmm1, xmm0              ; each zero byte becomes 0FFh, others 00h
        pmovmskb edx, xmm1              ; edx bit i = 1 when byte i of the chunk is zero
        shr     edx, cl                 ; push the bits of the leading bytes out ...
        shl     edx, cl                 ; ... and move the remaining bits back in place
        bsf     edx, edx                ; edx = index of lowest set bit; ZF = 1 when mask was 0
        jnz     .found                  ; terminator lies in the first chunk

        ; --- Main loop: one aligned 16-byte chunk per iteration ------------
.scan:
        add     rax, 16                 ; advance to the next chunk
        movdqa  xmm1, [rax]
        pcmpeqb xmm1, xmm0
        pmovmskb edx, xmm1
        ; bsf stays inside the loop deliberately. Hoisting it and using test
        ; here would be faster for long strings on old processors, but short
        ; strings and newer processors are given priority.
        bsf     edx, edx
        jz      .scan                   ; mask was zero: no terminator in this chunk

        ; --- Terminator found ----------------------------------------------
        ; rax = address of the chunk holding the zero byte,
        ; rdx = offset of that byte within the chunk (upper 32 bits are zero
        ;       because edx was written with 32-bit operations).
.found:
        sub     rax, STR_BASE           ; chunk address - s (negative if the chunk starts before s)
        add     rax, rdx                ; + offset inside chunk = string length
        ret
;A_strlen ENDP