Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-opt-memcpy-memmove.S
213799 views
1
//===----------------------------------------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
///
9
/// \file
10
/// This file contains assembly-optimized implementations of Scalable Matrix
11
/// Extension (SME) compatible memcpy and memmove functions.
12
///
13
/// These implementations depend on unaligned access support.
14
///
15
/// Routines taken from libc/AOR_v20.02/string/aarch64.
16
///
17
//===----------------------------------------------------------------------===//
18
19
#include "../assembly.h"
20
21
//
22
// __arm_sc_memcpy / __arm_sc_memmove
23
//
24
25
// Register aliases used by __arm_sc_memcpy. Every alias maps to a register
// in the range x0-x17 (or its 32-bit w view).
//
// Incoming values and derived pointers:
//   dstin   - destination pointer as passed in; the routine only reads it.
//   src     - source pointer; moved during the large forward copy.
//   count   - number of bytes to copy; reused as a loop counter.
//   dst     - working destination pointer for the large forward copy.
//   srcend1 - src + count (one past the last source byte).
//   dstend1 - dstin + count (one past the last destination byte).
#define dstin x0
26
#define src x1
27
#define count x2
28
#define dst x3
29
#define srcend1 x4
30
#define dstend1 x5
31
// Data registers. Each X_l/X_h pair is moved with a single ldp/stp
// (16 bytes); the *_lw names are the 32-bit views used for 4-byte and
// single-byte accesses.
#define A_l x6
32
#define A_lw w6
33
#define A_h x7
34
#define B_l x8
35
#define B_lw w8
36
#define B_h x9
37
#define C_l x10
38
#define C_lw w10
39
#define C_h x11
40
#define D_l x12
41
#define D_h x13
42
#define E_l x14
43
#define E_h x15
44
#define F_l x16
45
#define F_h x17
46
// G and H are not extra registers: they alias count, dst, src and srcend1.
// Loading into G_* / H_* therefore destroys those values, so they are only
// used on paths that address memory through dstin/dstend1 afterwards.
#define G_l count
47
#define G_h dst
48
#define H_l src
49
#define H_h srcend1
50
// Scratch register. Shares x14 with E_l, so tmp1 and E_l cannot both be
// live at the same time.
#define tmp1 x14
51
52
/* This implementation handles overlaps and supports both memcpy and memmove
53
from a single entry point. It uses unaligned accesses and branchless
54
sequences to keep the code small, simple and improve performance.
55
56
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
57
copies of up to 128 bytes, and large copies. The overhead of the overlap
58
check is negligible since it is only required for large copies.
59
60
Large copies use a software pipelined loop processing 64 bytes per iteration.
61
The destination pointer is 16-byte aligned to minimize unaligned accesses.
62
The loop tail is handled by always copying 64 bytes from the end.
63
*/
64
65
/* __arm_sc_memcpy: copy `count` bytes from `src` to `dstin`.

   In:    x0 = dstin (destination), x1 = src (source), x2 = count (bytes).
   Out:   x0 is never written, so it still holds dstin on return.
   Clobb: x1-x17 and the condition flags. The routine makes no calls and
          does not reference sp.

   Dispatch on count:
     0..3, 4..7, 8..15, 16..32  small copies (labels 2, 1, 0 and fall-through)
     33..64, 65..128            medium copies (labels 4 and 5/6)
     > 128                      large copies (label 7), forwards or backwards

   Small and medium paths issue all of their loads before their first
   store, and copy from both ends of the buffer so that one code sequence
   covers a whole range of sizes (the two halves may overlap in the
   middle). */
DEFINE_COMPILERRT_FUNCTION(__arm_sc_memcpy)
66
/* End pointers: one past the last source / destination byte. */
add srcend1, src, count
67
add dstend1, dstin, count
68
cmp count, 128
69
b.hi 7f // copy_long
70
cmp count, 32
71
b.hi 4f // copy32_128
72
73
/* Small copies: 0..32 bytes. */
74
cmp count, 16
75
b.lo 0f // copy16
76
/* 16..32 bytes: first 16 and last 16 bytes, both loaded before storing. */
ldp A_l, A_h, [src]
77
ldp D_l, D_h, [srcend1, -16]
78
stp A_l, A_h, [dstin]
79
stp D_l, D_h, [dstend1, -16]
80
ret
81
82
/* Copy 8-15 bytes. */
83
0: // copy16
84
/* count < 16 here, so bit 3 clear means count < 8. */
tbz count, 3, 1f // copy8
85
/* First 8 and last 8 bytes. */
ldr A_l, [src]
86
ldr A_h, [srcend1, -8]
87
str A_l, [dstin]
88
str A_h, [dstend1, -8]
89
ret
90
91
.p2align 3
92
/* Copy 4-7 bytes. */
93
1: // copy8
94
/* count < 8 here, so bit 2 clear means count < 4. */
tbz count, 2, 2f // copy4
95
/* First 4 and last 4 bytes. */
ldr A_lw, [src]
96
ldr B_lw, [srcend1, -4]
97
str A_lw, [dstin]
98
str B_lw, [dstend1, -4]
99
ret
100
101
/* Copy 0..3 bytes using a branchless sequence. */
102
2: // copy4
103
cbz count, 3f // copy0
104
/* tmp1 = count / 2. The bytes at offsets 0, count/2 and count-1 cover
   every byte for count = 1, 2 or 3 (some are copied more than once). */
lsr tmp1, count, 1
105
ldrb A_lw, [src]
106
ldrb C_lw, [srcend1, -1]
107
ldrb B_lw, [src, tmp1]
108
strb A_lw, [dstin]
109
strb B_lw, [dstin, tmp1]
110
strb C_lw, [dstend1, -1]
111
3: // copy0
112
ret
113
114
.p2align 4
115
/* Medium copies: 33..128 bytes. */
116
4: // copy32_128
117
/* Load the first 32 and the last 32 bytes. */
ldp A_l, A_h, [src]
118
ldp B_l, B_h, [src, 16]
119
ldp C_l, C_h, [srcend1, -32]
120
ldp D_l, D_h, [srcend1, -16]
121
cmp count, 64
122
b.hi 5f // copy128
123
/* 33..64 bytes: the two 32-byte halves cover the whole buffer. */
stp A_l, A_h, [dstin]
124
stp B_l, B_h, [dstin, 16]
125
stp C_l, C_h, [dstend1, -32]
126
stp D_l, D_h, [dstend1, -16]
127
ret
128
129
.p2align 4
130
/* Copy 65..128 bytes. */
131
5: // copy128
132
/* Bytes 32..63 from the start, in addition to A-D loaded above. */
ldp E_l, E_h, [src, 32]
133
ldp F_l, F_h, [src, 48]
134
cmp count, 96
135
b.ls 6f // copy96
136
/* 97..128 bytes: also copy the 32 bytes at [end-64, end-32).
   G_* and H_* alias count, dst, src and srcend1, so these two loads
   overwrite those registers (the second one overwrites its own base,
   srcend1). Everything after this point addresses memory through dstin
   and dstend1 only. */
ldp G_l, G_h, [srcend1, -64]
137
ldp H_l, H_h, [srcend1, -48]
138
stp G_l, G_h, [dstend1, -64]
139
stp H_l, H_h, [dstend1, -48]
140
6: // copy96
141
/* First 64 bytes and last 32 bytes. */
stp A_l, A_h, [dstin]
142
stp B_l, B_h, [dstin, 16]
143
stp E_l, E_h, [dstin, 32]
144
stp F_l, F_h, [dstin, 48]
145
stp C_l, C_h, [dstend1, -32]
146
stp D_l, D_h, [dstend1, -16]
147
ret
148
149
.p2align 4
150
/* Copy more than 128 bytes. */
151
7: // copy_long
152
/* Use backwards copy if there is an overlap. */
153
/* tmp1 = dstin - src. Zero means source and destination are the same
   address, so there is nothing to copy. As an unsigned value, tmp1 < count
   exactly when dstin lies inside [src, src + count). */
sub tmp1, dstin, src
154
cbz tmp1, 3b // copy0
155
cmp tmp1, count
156
b.lo 10f //copy_long_backwards
157
158
/* Copy 16 bytes and then align dst to 16-byte alignment. */
159
160
ldp D_l, D_h, [src]
161
/* tmp1 = misalignment of dstin; dst = dstin rounded down to 16 bytes.
   src is moved back by the same amount so that [src, N] and [dst, N]
   keep referring to corresponding bytes. */
and tmp1, dstin, 15
162
bic dst, dstin, 15
163
sub src, src, tmp1
164
add count, count, tmp1 /* Count is now 16 too large. */
165
ldp A_l, A_h, [src, 16]
166
/* Unaligned store of the first 16 bytes at the original dstin. */
stp D_l, D_h, [dstin]
167
ldp B_l, B_h, [src, 32]
168
ldp C_l, C_h, [src, 48]
169
/* Pre-index with writeback: src advances by 64. */
ldp D_l, D_h, [src, 64]!
170
subs count, count, 128 + 16 /* Test and readjust count. */
171
b.ls 9f // copy64_from_end
172
/* Software-pipelined loop: each iteration stores the 64 bytes held in
   A-D from the previous round and reloads them with the next 64 bytes.
   The last store/load pair advances dst and src by 64 via writeback. */
8: // loop64
173
stp A_l, A_h, [dst, 16]
174
ldp A_l, A_h, [src, 16]
175
stp B_l, B_h, [dst, 32]
176
ldp B_l, B_h, [src, 32]
177
stp C_l, C_h, [dst, 48]
178
ldp C_l, C_h, [src, 48]
179
stp D_l, D_h, [dst, 64]!
180
ldp D_l, D_h, [src, 64]!
181
subs count, count, 64
182
b.hi 8b // loop64
183
184
/* Write the last iteration and copy 64 bytes from the end. */
185
9: // copy64_from_end
186
/* The loads of the final 64 source bytes (into E, A, B, C) are interleaved
   with the stores of the 64 bytes still pending in A-D. E_l shares x14
   with tmp1, which is no longer needed here. */
ldp E_l, E_h, [srcend1, -64]
187
stp A_l, A_h, [dst, 16]
188
ldp A_l, A_h, [srcend1, -48]
189
stp B_l, B_h, [dst, 32]
190
ldp B_l, B_h, [srcend1, -32]
191
stp C_l, C_h, [dst, 48]
192
ldp C_l, C_h, [srcend1, -16]
193
stp D_l, D_h, [dst, 64]
194
stp E_l, E_h, [dstend1, -64]
195
stp A_l, A_h, [dstend1, -48]
196
stp B_l, B_h, [dstend1, -32]
197
stp C_l, C_h, [dstend1, -16]
198
ret
199
200
.p2align 4
201
202
/* Large backwards copy for overlapping copies.
   Copy the last 16 bytes and then align dstend1 down to a 16-byte
   boundary. This mirrors the forward path, working from the end of the
   buffers towards the start. */
204
10: // copy_long_backwards
205
ldp D_l, D_h, [srcend1, -16]
206
/* tmp1 = misalignment of dstend1. srcend1 and count are reduced by it
   now; dstend1 itself is adjusted below, after the unaligned tail store
   that still needs the original value. */
and tmp1, dstend1, 15
207
sub srcend1, srcend1, tmp1
208
sub count, count, tmp1
209
ldp A_l, A_h, [srcend1, -16]
210
/* Unaligned store of the last 16 bytes at the original dstend1. */
stp D_l, D_h, [dstend1, -16]
211
ldp B_l, B_h, [srcend1, -32]
212
ldp C_l, C_h, [srcend1, -48]
213
/* Pre-index with writeback: srcend1 moves back by 64. */
ldp D_l, D_h, [srcend1, -64]!
214
sub dstend1, dstend1, tmp1
215
subs count, count, 128
216
b.ls 12f // copy64_from_start
217
218
/* Software-pipelined loop, descending: store the 64 bytes held in A-D,
   reload them with the preceding 64 bytes, and move both end pointers
   back by 64 via writeback. */
11: // loop64_backwards
219
stp A_l, A_h, [dstend1, -16]
220
ldp A_l, A_h, [srcend1, -16]
221
stp B_l, B_h, [dstend1, -32]
222
ldp B_l, B_h, [srcend1, -32]
223
stp C_l, C_h, [dstend1, -48]
224
ldp C_l, C_h, [srcend1, -48]
225
stp D_l, D_h, [dstend1, -64]!
226
ldp D_l, D_h, [srcend1, -64]!
227
subs count, count, 64
228
b.hi 11b // loop64_backwards
229
230
/* Write the last iteration and copy 64 bytes from the start. */
231
12: // copy64_from_start
232
/* The loads of the first 64 source bytes (into G, A, B, C) are interleaved
   with the stores of the 64 bytes still pending in A-D. G_l/G_h alias
   count and dst, neither of which is read again on this path. */
ldp G_l, G_h, [src, 48]
233
stp A_l, A_h, [dstend1, -16]
234
ldp A_l, A_h, [src, 32]
235
stp B_l, B_h, [dstend1, -32]
236
ldp B_l, B_h, [src, 16]
237
stp C_l, C_h, [dstend1, -48]
238
ldp C_l, C_h, [src]
239
stp D_l, D_h, [dstend1, -64]
240
stp G_l, G_h, [dstin, 48]
241
stp A_l, A_h, [dstin, 32]
242
stp B_l, B_h, [dstin, 16]
243
stp C_l, C_h, [dstin]
244
ret
245
END_COMPILERRT_FUNCTION(__arm_sc_memcpy)
246
247
/* __arm_sc_memmove shares the __arm_sc_memcpy entry point: that routine
   loads before it stores on the small/medium paths and switches to a
   backwards copy for large copies whose destination starts inside the
   source range.
   NOTE(review): the macro is defined in ../assembly.h, which is not
   visible here; presumably it emits __arm_sc_memmove as a symbol alias of
   __arm_sc_memcpy - confirm. */
DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
248
249
250