GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#include "asmdefs.h"

	.arch	armv8-a+sve

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp1	x6
#define vlen	x6
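/* Note: tmp1 and vlen alias x6.  vlen is only live in the small-copy path,
   tmp1 only in the large-copy paths, so the two never clash. */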

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.
   SVE vectors are used to speed up small copies.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration. The source pointer is 16-byte aligned to minimize unaligned
   accesses. The loop tail is handled by always copying 64 bytes from the end.
*/
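
/* Illustrative sketch only -- this block is a comment and is never assembled.
   A rough C rendering of the dispatch and overlap test described above,
   assuming svcntb() from <arm_sve.h> for the SVE vector length in bytes;
   fwd() and bwd() are naive stand-ins for the optimized paths below, not
   real helpers in this file.

     #include <stddef.h>
     #include <stdint.h>
     #include <arm_sve.h>

     static void fwd (unsigned char *d, const unsigned char *s, size_t n)
     { for (size_t i = 0; i < n; i++) d[i] = s[i]; }    // ascending byte copy

     static void bwd (unsigned char *d, const unsigned char *s, size_t n)
     { while (n--) d[n] = s[n]; }                        // descending byte copy

     void *memcpy_sve_sketch (void *dstin, const void *src, size_t count)
     {
       unsigned char *d = dstin;
       const unsigned char *s = src;

       if (count > 128)
         {
           // dst - src compared as unsigned: below count only when dst lies
           // inside [src, src + count), i.e. a forward copy would overwrite
           // source bytes before reading them.
           if ((uintptr_t) d - (uintptr_t) s < count)
             bwd (d, s, count);        // L(copy_long_backwards)
           else
             fwd (d, s, count);        // L(copy_long), pipelined 64-byte loop
         }
       else if (count > 2 * svcntb ())
         fwd (d, s, count);            // L(copy32_128), Q-register pairs
       else
         fwd (d, s, count);            // small: predicated SVE copy
       return dstin;
     }
*/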

ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
	cmp	count, 128
	b.hi	L(copy_long)
	cntb	vlen
	cmp	count, vlen, lsl 1
	b.hi	L(copy32_128)

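	/* Small copies: up to 2*VL bytes.  whilelo builds byte predicates for
	   the first and second vector, so the two predicated load/store pairs
	   transfer exactly count bytes; a zero count gives all-false predicates
	   and copies nothing. */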
	whilelo	p0.b, xzr, count
	whilelo	p1.b, vlen, count
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	/* Medium copies: 33..128 bytes. */
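	/* Every load below is issued before any store, so this path needs no
	   explicit overlap check.  The fixed-offset stores may overlap each
	   other; overlapping destination bytes just receive the same value
	   twice. */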
L(copy32_128):
	add	srcend, src, count
	add	dstend, dstin, count
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 65..128 bytes. */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes. */
L(copy_long):
	add	srcend, src, count
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap. */
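	/* dstin - src is compared as an unsigned value: it is below count only
	   when dstin lies inside [src, src + count), the one layout where a
	   forward copy would overwrite source bytes before reading them. */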
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment. */
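	/* D_q covers the possibly unaligned first 16 bytes.  src is rounded
	   down to 16 bytes and dst is biased down by the same amount, so the
	   fixed offsets used below address matching source and destination
	   bytes. */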
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large. */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)
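	/* Software pipelined: each iteration stores the 64 bytes loaded on the
	   previous pass while loading the next 64 bytes further ahead. */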
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
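	/* The unconditional 64-byte tail may overlap bytes already written by
	   the loop; overlapping destinations receive the same values, so no
	   byte-exact remainder handling is needed. */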
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
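	/* tmp1 == 0 means dstin == src: nothing needs to move. */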
	cbz	tmp1, L(return)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

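	/* Mirror of L(loop64), walking downwards: the stores drain the 64
	   bytes loaded on the previous pass, and the pre-indexed store of C_q
	   also steps dstend down by 64. */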
L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(return):
	ret

END (__memcpy_aarch64_sve)