GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#include "asmdefs.h"

	.arch	armv8-a+sve

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp1	x6
#define vlen	x6
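/* Note: tmp1 and vlen alias x6.  vlen is only live in the small-copy path,
   tmp1 only in the large-copy paths, so the two never clash. */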

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.
   SVE vectors are used to speed up small copies.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration. The source pointer is 16-byte aligned to minimize unaligned
   accesses. The loop tail is handled by always copying 64 bytes from the end.
*/
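
/* Illustrative sketch only -- this block is a comment and is never assembled.
   A rough C rendering of the dispatch and overlap test described above,
   assuming svcntb() from <arm_sve.h> for the SVE vector length in bytes;
   fwd() and bwd() are naive stand-ins for the optimized paths below, not
   real helpers in this file.

     #include <stddef.h>
     #include <stdint.h>
     #include <arm_sve.h>

     static void fwd (unsigned char *d, const unsigned char *s, size_t n)
     { for (size_t i = 0; i < n; i++) d[i] = s[i]; }    // ascending byte copy

     static void bwd (unsigned char *d, const unsigned char *s, size_t n)
     { while (n--) d[n] = s[n]; }                        // descending byte copy

     void *memcpy_sve_sketch (void *dstin, const void *src, size_t count)
     {
       unsigned char *d = dstin;
       const unsigned char *s = src;

       if (count > 128)
         {
           // dst - src compared as unsigned: below count only when dst lies
           // inside [src, src + count), i.e. a forward copy would overwrite
           // source bytes before reading them.
           if ((uintptr_t) d - (uintptr_t) s < count)
             bwd (d, s, count);        // L(copy_long_backwards)
           else
             fwd (d, s, count);        // L(copy_long), pipelined 64-byte loop
         }
       else if (count > 2 * svcntb ())
         fwd (d, s, count);            // L(copy32_128), Q-register pairs
       else
         fwd (d, s, count);            // small: predicated SVE copy
       return dstin;
     }
*/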

ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
	cmp	count, 128
	b.hi	L(copy_long)
	cntb	vlen
	cmp	count, vlen, lsl 1
	b.hi	L(copy32_128)

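	/* Small copies: up to 2*VL bytes.  whilelo builds byte predicates for
	   the first and second vector, so the two predicated load/store pairs
	   transfer exactly count bytes; a zero count gives all-false predicates
	   and copies nothing. */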
	whilelo	p0.b, xzr, count
	whilelo	p1.b, vlen, count
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	/* Medium copies: 33..128 bytes. */
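	/* Every load below is issued before any store, so this path needs no
	   explicit overlap check.  The fixed-offset stores may overlap each
	   other; overlapping destination bytes just receive the same value
	   twice. */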
L(copy32_128):
	add	srcend, src, count
	add	dstend, dstin, count
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 65..128 bytes. */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes. */
L(copy_long):
	add	srcend, src, count
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap. */
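	/* dstin - src is compared as an unsigned value: it is below count only
	   when dstin lies inside [src, src + count), the one layout where a
	   forward copy would overwrite source bytes before reading them. */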
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment. */
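	/* D_q covers the possibly unaligned first 16 bytes.  src is rounded
	   down to 16 bytes and dst is biased down by the same amount, so the
	   fixed offsets used below address matching source and destination
	   bytes. */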
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large. */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)
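	/* Software pipelined: each iteration stores the 64 bytes loaded on the
	   previous pass while loading the next 64 bytes further ahead. */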
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
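	/* The unconditional 64-byte tail may overlap bytes already written by
	   the loop; overlapping destinations receive the same values, so no
	   byte-exact remainder handling is needed. */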
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
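	/* tmp1 == 0 means dstin == src: nothing needs to move. */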
	cbz	tmp1, L(return)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

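	/* Mirror of L(loop64), walking downwards: the stores drain the 64
	   bytes loaded on the previous pass, and the pre-indexed store of C_q
	   also steps dstend down by 64. */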
L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(return):
	ret

END (__memcpy_aarch64_sve)