/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define L(label) .L ## label

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14
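/* G_l/G_h and H_l/H_h alias count, dst, src and srcend; they are only
   used once those original values are no longer needed. */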

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes. */
	cmp	count, 16
	b.lo	L(copy16)
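	/* 16..32 bytes: copy 16 bytes from the start and 16 from the end;
	   for counts below 32 the two stores simply overlap in the middle.
	   The same head/tail overlap trick is used by the smaller fixed-size
	   paths and the medium-size path below. */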
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes. */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes. */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence. */
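	/* tmp1 = count / 2, so the three strb instructions below write
	   offsets 0, count/2 and count - 1: for count == 1 all three alias
	   byte 0, for count == 2 they cover bytes 0 and 1, and for
	   count == 3 they cover bytes 0, 1 and 2. */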
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes. */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes. */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes. */
L(copy_long):
	/* Use backwards copy if there is an overlap. */
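	/* tmp1 = dst - src. A single unsigned compare against count catches
	   the destructive case (0 < dst - src < count); all other layouts,
	   including dst < src, are safe to copy forwards. */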
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment. */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large. */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)

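	/* Each iteration stores the 64 bytes loaded on the previous
	   iteration while loading the next 64. The registers still in
	   flight when the loop exits are written out in copy64_from_end,
	   which then copies the final 64 bytes from the end of the buffer. */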
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

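	/* As loop64 above, but working backwards from the end of both
	   buffers. */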
L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
SYM_FUNC_END(__pi_memcpy_generic)

#ifdef CONFIG_AS_HAS_MOPS
	.arch_extension mops
SYM_FUNC_START(__pi_memcpy)
alternative_if_not ARM64_HAS_MOPS
	b	__pi_memcpy_generic
alternative_else_nop_endif

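	/* FEAT_MOPS copy: CPYP/CPYM/CPYE are the prologue, main and
	   epilogue steps of a single architectural memory copy, updating
	   dst, src and count as they go. These forms (unlike CPYF*) may
	   copy in either direction, so overlapping buffers are handled
	   and this path serves memmove as well. */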
	mov	dst, dstin
	cpyp	[dst]!, [src]!, count!
	cpym	[dst]!, [src]!, count!
	cpye	[dst]!, [src]!, count!
	ret
SYM_FUNC_END(__pi_memcpy)
#else
SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic)
#endif

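/* All of the exported names below are aliases of the same implementation:
   __pi_ marks the position-independent entry points, __memcpy/__memmove are
   the exported kernel symbols, and the weak memcpy/memmove aliases can be
   overridden, e.g. by instrumented versions when KASAN is enabled. */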
SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)

SYM_FUNC_ALIAS(__memmove, __pi_memmove)
EXPORT_SYMBOL(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)