GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_lw    w10
#define tmp1    x14

#define A_q     q0
#define B_q     q1
#define C_q     q2
#define D_q     q3
#define E_q     q4
#define F_q     q5
#define G_q     q6
#define H_q     q7

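/* The x/w aliases above are 64-/32-bit views of general-purpose registers
   used for the small scalar copies; A_q..H_q are 128-bit SIMD registers,
   so one ldp/stp of a q-register pair moves 32 bytes.  */
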
/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

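/* Illustrative note (not part of the original source): the 16..32 byte case
   below, and the head/tail handling in the larger cases, rely on two
   fixed-size copies, one from the start and one from the end of the buffer,
   which may overlap in the middle yet together cover every byte.  A rough C
   sketch of the 16..32 byte case, with a hypothetical helper name and
   assuming <string.h>, could look like:

        // assumes 16 <= n && n <= 32
        static void copy16_32 (unsigned char *dst, const unsigned char *src,
                               size_t n)
        {
          unsigned char head[16], tail[16];
          memcpy (head, src, 16);           // first 16 bytes
          memcpy (tail, src + n - 16, 16);  // last 16 bytes, may overlap head
          memcpy (dst, head, 16);
          memcpy (dst + n - 16, tail, 16);  // together these cover 0..n-1
        }
*/
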
ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
        add     srcend, src, count
        cmp     count, 128
        b.hi    L(copy_long)
        add     dstend, dstin, count
        cmp     count, 32
        b.hi    L(copy32_128)
        nop

        /* Small copies: 0..32 bytes. */
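        /* Both 16-byte loads for the 16..32 byte case are issued before
           either store, so the sequence is also safe when the source and
           destination overlap (memmove).  */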
        cmp     count, 16
        b.lo    L(copy16)
        ldr     A_q, [src]
        ldr     B_q, [srcend, -16]
        str     A_q, [dstin]
        str     B_q, [dstend, -16]
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes. */
L(copy32_128):
        ldp     A_q, B_q, [src]
        ldp     C_q, D_q, [srcend, -32]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_q, B_q, [dstin]
        stp     C_q, D_q, [dstend, -32]
        ret

        .p2align 4
        /* Copy 8-15 bytes. */
L(copy16):
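        /* count is 0..15 here; bit 3 of count is set exactly when count >= 8,
           so the tbz falls through for 8..15 and branches to L(copy8) for
           0..7.  */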
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        /* Copy 4-7 bytes. */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 65..128 bytes. */
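        /* A_q..D_q already hold the first 32 and the last 32 bytes; E_q/F_q
           add bytes 32..63, which is enough for counts up to 96, and G_q/H_q
           cover the 64 bytes before the end for counts of 97..128.  */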
L(copy128):
        ldp     E_q, F_q, [src, 32]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_q, H_q, [srcend, -64]
        stp     G_q, H_q, [dstend, -64]
L(copy96):
        stp     A_q, B_q, [dstin]
        stp     E_q, F_q, [dstin, 32]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
        cbz     count, L(copy0)
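        /* count is 1..3 here.  Write the first byte, the byte at count/2 and
           the last byte: for count 1 all three land on byte 0, for count 2 on
           bytes 0, 1, 1, and for count 3 on bytes 0, 1, 2.  All loads are done
           before any store, so overlapping buffers are handled.  */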
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 3
        /* Copy more than 128 bytes. */
L(copy_long):
        add     dstend, dstin, count

        /* Use backwards copy if there is an overlap. */
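        /* tmp1 = dstin - src, compared as unsigned, is below count exactly
           when dst lies inside [src, src + count), i.e. when a forward copy
           would overwrite source bytes that have not been read yet.  dst ==
           src also takes the backwards path, which returns immediately.  */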
        sub     tmp1, dstin, src
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align src to 16-byte alignment. */
        ldr     D_q, [src]
        and     tmp1, src, 15
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1      /* Count is now 16 too large. */
        ldp     A_q, B_q, [src, 16]
        str     D_q, [dstin]
        ldp     C_q, D_q, [src, 48]
        subs    count, count, 128 + 16  /* Test and readjust count. */
        b.ls    L(copy64_from_end)
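        /* Software-pipelined loop: each iteration stores the 64 bytes loaded
           on the previous one while the loads for the next 64 bytes are
           already in flight.  */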
L(loop64):
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [src, 80]
        stp     C_q, D_q, [dst, 48]
        ldp     C_q, D_q, [src, 112]
        add     src, src, 64
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
        ldp     E_q, F_q, [srcend, -64]
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dst, 48]
        stp     E_q, F_q, [dstend, -64]
        stp     A_q, B_q, [dstend, -32]
        ret

        .p2align 4
        nop

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
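        /* tmp1 still holds dstin - src; zero means src == dst and there is
           nothing to copy.  */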
        cbz     tmp1, L(copy0)
        ldr     D_q, [srcend, -16]
        and     tmp1, srcend, 15
        bic     srcend, srcend, 15
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
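        /* Stores run from high to low addresses; the pre-indexed store to
           [dstend, -64]! also steps dstend down by 64 for the next iteration,
           while srcend is decremented explicitly.  */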
        str     B_q, [dstend, -16]
        str     A_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -96]
        str     D_q, [dstend, -48]
        str     C_q, [dstend, -64]!
        ldp     C_q, D_q, [srcend, -128]
        sub     srcend, srcend, 64
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [src]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     A_q, B_q, [dstin]
        ret

END (__memcpy_aarch64_simd)