/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define L(label) .L ## label

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14
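/* G_l/G_h and H_l/H_h alias count, dst, src and srcend; they are only
   used once those original values are no longer needed. */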

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes. */
	cmp	count, 16
	b.lo	L(copy16)
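	/* 16..32 bytes: copy 16 bytes from the start and 16 from the end;
	   for counts below 32 the two stores simply overlap in the middle.
	   The same head/tail overlap trick is used by the smaller fixed-size
	   paths and the medium-size path below. */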
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes. */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes. */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence. */
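	/* tmp1 = count / 2, so the three strb instructions below write
	   offsets 0, count/2 and count - 1: for count == 1 all three alias
	   byte 0, for count == 2 they cover bytes 0 and 1, and for
	   count == 3 they cover bytes 0, 1 and 2. */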
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes. */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes. */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes. */
L(copy_long):
	/* Use backwards copy if there is an overlap. */
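	/* tmp1 = dst - src. A single unsigned compare against count catches
	   the destructive case (0 < dst - src < count); all other layouts,
	   including dst < src, are safe to copy forwards. */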
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment. */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large. */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)

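	/* Each iteration stores the 64 bytes loaded on the previous
	   iteration while loading the next 64. The registers still in
	   flight when the loop exits are written out in copy64_from_end,
	   which then copies the final 64 bytes from the end of the buffer. */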
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

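	/* As loop64 above, but working backwards from the end of both
	   buffers. */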
L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
SYM_FUNC_END(__pi_memcpy_generic)

#ifdef CONFIG_AS_HAS_MOPS
	.arch_extension mops
SYM_FUNC_START(__pi_memcpy)
alternative_if_not ARM64_HAS_MOPS
	b	__pi_memcpy_generic
alternative_else_nop_endif

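	/* FEAT_MOPS copy: CPYP/CPYM/CPYE are the prologue, main and
	   epilogue steps of a single architectural memory copy, updating
	   dst, src and count as they go. These forms (unlike CPYF*) may
	   copy in either direction, so overlapping buffers are handled
	   and this path serves memmove as well. */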
	mov	dst, dstin
	cpyp	[dst]!, [src]!, count!
	cpym	[dst]!, [src]!, count!
	cpye	[dst]!, [src]!, count!
	ret
SYM_FUNC_END(__pi_memcpy)
#else
SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic)
#endif

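/* All of the exported names below are aliases of the same implementation:
   __pi_ marks the position-independent entry points, __memcpy/__memmove are
   the exported kernel symbols, and the weak memcpy/memmove aliases can be
   overridden, e.g. by instrumented versions when KASAN is enabled. */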
SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)

SYM_FUNC_ALIAS(__memmove, __pi_memmove)
EXPORT_SYMBOL(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)