GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memcpy.S
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

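/* Note: G_l/G_h, H_l/H_h and tmp1 alias registers already named above
   (count, dst, src, srcend, E_l).  Each alias is only written once the
   value it shadows is dead, so no extra registers need to be saved.
   dstin (x0) is never modified, so the original destination pointer is
   also the return value.  */
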
/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.
*/

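/* The entry sequence below implements this dispatch as, in rough C-like
   pseudocode (for orientation only, not part of the source):

	if (count > 128) goto copy_long;     large, overlap is checked
	if (count > 32)  goto copy32_128;    medium
	fall through to the small 0..32 byte path
*/
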
ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

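	/* The small copies below use a head/tail trick: load the first and
	   last N bytes, then store both.  For counts below 2*N the accesses
	   overlap in the middle, which is harmless since all loads are done
	   before the first store.  */
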
	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

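	/* In this path the size class is picked by testing single bits of
	   count: bit 3 (tested above, with count known to be below 16)
	   separates 8..15 from 0..7, and bit 2 (tested below in L(copy8))
	   separates 4..7 from 0..3.  */
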
	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

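	/* The 0..3 byte sequence below stores byte 0, byte count/2 and the
	   last byte.  Worked through: count=1 writes offset 0 three times;
	   count=2 writes offsets 0, 1, 1; count=3 writes offsets 0, 1, 2.
	   All bytes are covered without further branches.  */
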
	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

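	/* The medium paths below load the entire head and tail into
	   registers before storing anything, so they are safe for
	   overlapping buffers even though the explicit overlap check is
	   only done for large copies.  */
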
	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

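	/* The overlap test below uses unsigned wrap-around: dstin - src is
	   below count exactly when dst lies inside [src, src + count), the
	   only case where a forward copy would overwrite source bytes
	   before reading them.  A destination below the source stays on the
	   forward path.  */
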
	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

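	/* In the loop below each stp stores data loaded on the previous
	   iteration (or in the prologue above) while the following ldp
	   fetches the next 16 bytes, keeping loads one iteration ahead of
	   stores (the software pipelining mentioned above).  */
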
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

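	/* The backwards path below mirrors the forward one: dstend is
	   aligned down to 16 bytes, the loop moves 64 bytes per iteration
	   towards the start of the buffer, and the tail is finished by
	   copying 64 bytes from the start.  */
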
	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)
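
/* Usage sketch: these entry points follow the ISO C memcpy/memmove
   contract.  The declarations below are an assumption about the
   surrounding arm-optimized-routines headers, not taken from this file:

	void *__memcpy_aarch64 (void *dst, const void *src, size_t n);
	void *__memmove_aarch64 (void *dst, const void *src, size_t n);

   Both return the original destination pointer, which stays in x0
   throughout.  */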