1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
|
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
#include "textflag.h"
// See memmove Go doc for important implementation constraints.
// func memmove(to, from unsafe.Pointer, n uintptr)
// Register aliases used by memmove below.
//
// target (destination) address; advanced as the copy proceeds
#define TGT R3
// source address; advanced as the copy proceeds
#define SRC R4
// total length to move, in bytes
#define LEN R5
// number of 8-byte doublewords still to copy (initially LEN >> 3)
#define DWORDS R6
// number of trailing bytes, always < 8 (LEN & 7)
#define BYTES R7
// holds the constant 16, used as an index register for vector loads/stores
#define IDX16 R8
// temp used for copies, pointer difference, vector lengths, etc.
#define TMP R9
// number of 32-byte blocks (DWORDS >> 2), used by the backward copy loop
#define QWORDS R10
// hold the constants 32 and 48, used as index registers in the 64-byte
// forward copy loop
#define IDX32 R14
#define IDX48 R15
// number of 64-byte chunks (DWORDS >> 3), used by the forward copy loop
#define OCTWORDS R16
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
// Copies n bytes from `from` to `to`. The two regions may overlap: when
// (to - from), taken as an unsigned value, is less than n the copy runs
// from the highest address downward (label backward); otherwise it runs
// from the lowest address upward.
//
// R3 = TGT = to
// R4 = SRC = from
// R5 = LEN = n
//
// Condition register fields relied on across the routine:
//   CR0 - scratch, rewritten by every CMP/ANDCC/SRDCC without a CR operand
//   CR1 - DWORDS compared with 0 (set once in mcopy)
//   CR2 - unsigned compare of (to - from) with n (set once in mcopy)
//   CR3 - copy of CR0 taken right after BYTES = n & 7 was computed
//
// Raw BC encodings used below: BO=12 branches when the CR bit is set,
// BO=4 when it is clear, BO=16 decrements CTR and branches while CTR != 0.
// The CR bit number is 4*field + (0=LT, 1=GT, 2=EQ).

// Determine if there are doublewords to
// copy so a more efficient move can be done
check:
#ifdef GOPPC64_power10
// Power10 fast path for n <= 16: one length-controlled vector load followed
// by one length-controlled vector store. All source bytes are in V0 before
// anything is written, so overlap needs no special handling here.
CMP LEN, $16
BGT mcopy
// LXVL/STXVL take the byte count from the most significant byte of the
// length register, hence the shift left by 56.
SLD $56, LEN, TMP
LXVL SRC, TMP, V0
STXVL V0, TGT, TMP
RET
#endif
mcopy:
ANDCC $7, LEN, BYTES // R7: bytes to copy (n & 7); sets CR0
SRD $3, LEN, DWORDS // R6: double words to copy (n >> 3)
MOVFL CR0, CR3 // save CR from ANDCC; CR0 is overwritten on the forward path
CMP DWORDS, $0, CR1 // CR1[EQ] set if no double words to copy
// Determine overlap by subtracting dest - src and comparing against the
// length. This catches the cases where src and dest are in different types
// of storage such as stack and static to avoid doing backward move when not
// necessary.
// The compare is unsigned, so when dest is below src the difference wraps
// to a large value and the forward path is taken.
SUB SRC, TGT, TMP // dest - src
CMPU TMP, LEN, CR2 // < len?
BC 12, 8, backward // BLT CR2 backward
// Copying forward if no overlap.
BC 12, 6, checkbytes // BEQ CR1, checkbytes: fewer than 8 bytes in total
SRDCC $3, DWORDS, OCTWORDS // 64 byte chunks? (DWORDS >> 3); sets CR0
MOVD $16, IDX16
BEQ lt64gt8 // < 64 bytes
// Prepare for moves of 64 bytes at a time.
forward64setup:
DCBTST (TGT) // prepare data cache: touch-for-store hint on dest
DCBT (SRC) // touch hint on source
MOVD OCTWORDS, CTR // Number of 64 byte chunks
MOVD $32, IDX32
MOVD $48, IDX48
PCALIGN $16 // align the loop entry to a 16 byte boundary
forward64:
// Each iteration loads four 16-byte vectors, then stores them.
LXVD2X (R0)(SRC), VS32 // load 64 bytes
LXVD2X (IDX16)(SRC), VS33
LXVD2X (IDX32)(SRC), VS34
LXVD2X (IDX48)(SRC), VS35
ADD $64, SRC
STXVD2X VS32, (R0)(TGT) // store 64 bytes
STXVD2X VS33, (IDX16)(TGT)
STXVD2X VS34, (IDX32)(TGT)
STXVD2X VS35, (IDX48)(TGT)
ADD $64,TGT // bump up for next set
BC 16, 0, forward64 // continue while --CTR != 0 (bdnz)
ANDCC $7, DWORDS // remaining doublewords (DWORDS &= 7); sets CR0
BEQ checkbytes // only bytes remain
lt64gt8:
// DWORDS is in 1..7 here. Move 32 bytes if at least 4 doublewords remain.
CMP DWORDS, $4
BLT lt32gt8
LXVD2X (R0)(SRC), VS32
LXVD2X (IDX16)(SRC), VS33
ADD $-4, DWORDS
STXVD2X VS32, (R0)(TGT)
STXVD2X VS33, (IDX16)(TGT)
ADD $32, SRC
ADD $32, TGT
lt32gt8:
// At this point fewer than 4 doublewords remain (DWORDS is 0..3).
// Move 16 bytes if possible
CMP DWORDS, $2
BLT lt16
LXVD2X (R0)(SRC), VS32
ADD $-2, DWORDS
STXVD2X VS32, (R0)(TGT)
ADD $16, SRC
ADD $16, TGT
lt16: // Move 8 bytes if possible (DWORDS is 0 or 1 here)
CMP DWORDS, $1
BLT checkbytes
#ifdef GOPPC64_power10
// Power10: fold the last doubleword and the trailing bytes into a single
// length-controlled move of 8 + BYTES bytes, then return.
ADD $8, BYTES
SLD $56, BYTES, TMP // byte count into the top byte for LXVL/STXVL
LXVL SRC, TMP, V0
STXVL V0, TGT, TMP
RET
#endif
MOVD 0(SRC), TMP
ADD $8, SRC
MOVD TMP, 0(TGT)
ADD $8, TGT
checkbytes:
// CR3 holds the result of the n & 7 computation from mcopy.
BC 12, 14, LR // BEQ CR3 lr: return if no trailing bytes
#ifdef GOPPC64_power10
// Power10: move the remaining BYTES (1..7) with one length-controlled
// load/store pair and return.
SLD $56, BYTES, TMP
LXVL SRC, TMP, V0
STXVL V0, TGT, TMP
RET
#endif
// The word/halfword/byte tail below is only reached when GOPPC64_power10 is
// not defined; with it defined, the block above always returns.
lt8: // Move word if possible
CMP BYTES, $4
BLT lt4
MOVWZ 0(SRC), TMP
ADD $-4, BYTES
MOVW TMP, 0(TGT)
ADD $4, SRC
ADD $4, TGT
lt4: // Move halfword if possible
CMP BYTES, $2
BLT lt2
MOVHZ 0(SRC), TMP
ADD $-2, BYTES
MOVH TMP, 0(TGT)
ADD $2, SRC
ADD $2, TGT
lt2: // Move last byte if 1 left
CMP BYTES, $1
BC 12, 0, LR // bltlr: return if BYTES < 1
MOVBZ 0(SRC), TMP
MOVBZ TMP, 0(TGT)
RET
backward:
// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
// R3 and R4 are advanced to the end of the destination/source buffers
// respectively and moved back as we copy.
ADD LEN, SRC, SRC // end of source (one past the last byte)
ADD TGT, LEN, TGT // end of dest (one past the last byte)
// CR0 still holds the result of ANDCC $7, LEN, BYTES from mcopy; nothing on
// the path to here has written CR0 since.
BEQ nobackwardtail // earlier condition: BYTES == 0
MOVD BYTES, CTR // bytes to move
backwardtailloop:
// Copy the trailing BYTES one at a time, highest address first.
MOVBZ -1(SRC), TMP // load the byte just below the current end
SUB $1,SRC
MOVBZ TMP, -1(TGT)
SUB $1,TGT
BDNZ backwardtailloop
nobackwardtail:
BC 4, 5, LR // blelr cr1, return if DWORDS == 0
SRDCC $2,DWORDS,QWORDS // Compute number of 32B blocks and compare to 0
BNE backward32setup // If QWORDS != 0, start the 32B copy loop.
backward24:
// DWORDS is a value between 1-3.
// CR0 from this compare is tested twice below; the MOVD loads and stores in
// between leave it unchanged.
CMP DWORDS, $2
MOVD -8(SRC), TMP
MOVD TMP, -8(TGT)
BC 12, 0, LR // bltlr, return if DWORDS == 1
MOVD -16(SRC), TMP
MOVD TMP, -16(TGT)
BC 12, 2, LR // beqlr, return if DWORDS == 2
MOVD -24(SRC), TMP
MOVD TMP, -24(TGT)
RET
backward32setup:
// The CR0 result of this ANDCC is consumed after the loop; no instruction
// in the loop body writes CR0.
ANDCC $3,DWORDS // Compute remaining DWORDS and compare to 0
MOVD QWORDS, CTR // set up loop ctr
MOVD $16, IDX16 // index of the second 16 byte vector in each 32 byte block
PCALIGN $16 // align the loop entry to a 16 byte boundary
backward32loop:
// Step both pointers down by 32, then load both vectors before storing.
SUB $32, TGT
SUB $32, SRC
LXVD2X (R0)(SRC), VS32 // load 16x2 bytes
LXVD2X (IDX16)(SRC), VS33
STXVD2X VS32, (R0)(TGT) // store 16x2 bytes
STXVD2X VS33, (IDX16)(TGT)
BDNZ backward32loop
BC 12, 2, LR // beqlr, return if DWORDS == 0
BR backward24 // 1-3 doublewords remain
|